Start implementing headline support.

This commit is contained in:
Pyry Kontio 2023-02-04 21:38:42 +09:00
parent 4b70b0708d
commit 09915cabc5
No known key found for this signature in database
7 changed files with 164 additions and 26 deletions

View file

@ -3,12 +3,38 @@
A Rust library for parsing and interpreting the [Monokakido](https://www.monokakido.jp/en/dictionaries/app/) dictionary format. It aims for full test coverage and an efficient implementation with minimal dependencies.
## TODO:
- Refactor code for generic "rsc" and "nrsc" support
- Add headline support
- Refactor as a workspace to separate the dependencies of the library and the binaries
- Move to mmap-based indexes
- Add graphics support
- Add TTY detection to CLI (prevent binary output to shell)
- Add proper argument parser lib to CLI
- Refine CLI according to the plan below
- Document the rsc, nrsc, keystore and headline formats
### Test:
- Audio using "rsc" (CCCAD, WISDOM3)
- Audio using "nrsc" (DAIJISEN2, NHKACCENT2, OALD10, OLDAE, OLEX, OLT, RHEJ, SMK8)
- Multiple contents (WISDOM3, OLEX)
- Document the rsc, nrsc and keystore formats
- Split main.rs into "dict exploder" and "dict cli"
## CLI Planned
### Tab-separated output formats:
- keyword
- headline
- iid (item id)
- pid (page id)
- aid (audio id)
- gid (graphics id)
### \n\n separated output formats:
- item
- page
### binary output formats:
- audio
- graphics
## Planned to support:
- WISDOM3

View file

@ -2,6 +2,7 @@ use core::{
mem::{align_of, size_of},
slice,
};
use std::{fs::File, io::Read};
use crate::Error;
@ -34,6 +35,7 @@ impl From<u32> for LE32 {
}
unsafe impl TransmuteSafe for LE32 {}
unsafe impl TransmuteSafe for u8 {}
pub(crate) unsafe trait TransmuteSafe: Default + Clone {
fn from_buf(buf: &[u8]) -> Result<(&Self, &[u8]), Error> {
@ -83,3 +85,14 @@ pub(crate) unsafe trait TransmuteSafe: Default + Clone {
Self::slice_as_bytes(slice::from_ref(self))
}
}
pub(crate) fn read_vec<T: TransmuteSafe>(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<T>>, Error> {
if start == 0 || end == 0 {
return Ok(None);
}
// Replace this with div_ceil once it stabilizes
let size = (end - start + size_of::<T>() - 1) / size_of::<T>();
let mut buf = vec![T::default(); size];
file.read_exact(T::slice_as_bytes_mut(&mut buf))?;
Ok(Some(buf))
}

View file

@ -7,6 +7,7 @@ fn print_help() {
println!("list - lists all dictionaries installed in the standard path");
println!("list_items {{dict}} {{keyword}} - lists all items");
println!("list_audio {{dict}} {{keyword}} - lists all audio files");
println!("get_audio {{dict}} {{id}} - writes an audio file to stdout");
println!("help - this help");
}

View file

@ -67,11 +67,25 @@ impl Paths {
pb
}
pub(crate) fn headword_key_path(&self) -> PathBuf {
/// Path of the `headword.keystore` file inside the directory returned by `key_path`.
pub(crate) fn key_headword_path(&self) -> PathBuf {
    self.key_path().join("headword.keystore")
}
/// Path of the `headline` directory: `<base_path>/Contents/<contents_dir>/headline`.
pub(crate) fn headline_path(&self) -> PathBuf {
    PathBuf::from(&self.base_path)
        .join("Contents")
        .join(&self.contents_dir)
        .join("headline")
}
/// Path of the `headline.headlinestore` file inside the directory returned by `headline_path`.
pub(crate) fn headline_long_path(&self) -> PathBuf {
    self.headline_path().join("headline.headlinestore")
}
}
fn parse_dict_name(fname: &OsStr) -> Option<&str> {

93
src/headline.rs Normal file
View file

@ -0,0 +1,93 @@
use std::{
fs::File,
io::{Read, Seek},
};
use crate::{
abi_utils::{TransmuteSafe, LE32, read_vec},
dict::Paths,
Error, PageItemId,
};
mod abi {
use super::*;
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub(super) struct FileHeader {
magic1: LE32,
magic2: LE32,
pub len: LE32,
pub rec_offset: LE32,
pub words_offset: LE32,
rec_bytes: LE32,
magic4: LE32,
magic5: LE32,
}
impl FileHeader {
pub(super) fn validate(&self) -> Result<(), Error> {
if self.magic1.read() == 0
&& self.magic2.read() == 0x2
&& self.rec_bytes.read() == 0x18
&& self.magic4.read() == 0
&& self.magic5.read() == 0
{
Ok(())
} else {
Err(Error::KeyFileHeaderValidate)
}
}
}
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub(super) struct Offset {
pub page_id: LE32,
pub item_id: u8,
pub item_type: u8,
magic1: u16,
pub offset: LE32,
magic2: LE32,
magic3: LE32,
magic4: LE32,
}
unsafe impl TransmuteSafe for FileHeader {}
unsafe impl TransmuteSafe for Offset {}
}
use abi::{FileHeader, Offset};
/// In-memory contents of a dictionary's `headline.headlinestore` file.
///
/// Built with `Headlines::new`; individual headlines are looked up by
/// `PageItemId` through `Headlines::get`.
#[derive(Debug)]
pub struct Headlines {
    /// Record table of the file, searched by `(page_id, item_id)`.
    recs: Vec<Offset>,
    /// Raw bytes of the file from the header's `words_offset` to the end of the file.
    words: Vec<u8>,
}
impl Headlines {
pub fn new(paths: &Paths) -> Result<Headlines, Error> {
let mut file = File::open(paths.headline_long_path())?;
let file_size = file.metadata()?.len() as usize;
let mut hdr = FileHeader::default();
file.read_exact(hdr.as_bytes_mut())?;
hdr.validate()?;
file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
let offsets: Option<Vec<Offset>> = read_vec(&mut file, hdr.rec_offset.us(), hdr.words_offset.us())?;
let Some(recs) = offsets else { return Err(Error::InvalidIndex); };
let words: Option<Vec<u8>> = read_vec(&mut file, hdr.words_offset.us(), file_size)?;
let Some(words) = words else { return Err(Error::InvalidIndex); };
Ok(Headlines {
recs,
words,
})
}
pub fn get(&self, id: PageItemId) -> Result<String, Error> {
let rec = self.recs.binary_search_by(|rec|
rec.page_id.read().cmp(&id.page).then(rec.item_id.cmp(&id.item))
).map_err(|_| Error::InvalidIndex)?;
todo!();
}
}

View file

@ -8,7 +8,7 @@ use std::{
};
use crate::{
abi_utils::{TransmuteSafe, LE32},
abi_utils::{TransmuteSafe, LE32, read_vec},
dict::Paths,
Error,
};
@ -59,7 +59,7 @@ mod abi {
}
impl IndexHeader {
pub(super) fn validate(&self, file_end: usize) -> Result<(), Error> {
pub(super) fn validate(&self, idx_end: usize) -> Result<(), Error> {
let a = self.index_a_offset.us();
let b = self.index_b_offset.us();
let c = self.index_c_offset.us();
@ -69,7 +69,7 @@ mod abi {
&& check_order(a, b)
&& check_order(b, c)
&& check_order(c, d)
&& check_order(d, file_end)
&& check_order(d, idx_end)
{
Ok(())
} else {
@ -111,17 +111,6 @@ impl KeyIndex {
}
impl Keys {
fn read_vec(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<LE32>>, Error> {
if start == 0 || end == 0 {
return Ok(None);
}
// Replace this with div_ceil once it stabilizes
let size = (end - start + size_of::<LE32>() - 1) / size_of::<LE32>();
let mut buf = vec![LE32::default(); size];
file.read_exact(LE32::slice_as_bytes_mut(&mut buf))?;
Ok(Some(buf))
}
fn check_vec_len(buf: &Option<Vec<LE32>>) -> Result<(), Error> {
let Some(buf) = buf else { return Ok(()) };
if buf.get(0).ok_or(Error::InvalidIndex)?.us() + 1 != buf.len() {
@ -131,44 +120,44 @@ impl Keys {
}
pub fn new(paths: &Paths) -> Result<Keys, Error> {
let mut file = File::open(paths.headword_key_path())?;
let mut file = File::open(paths.key_headword_path())?;
let file_size = file.metadata()?.len() as usize;
let mut hdr = FileHeader::default();
file.read_exact(hdr.as_bytes_mut())?;
hdr.validate()?;
file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
let words = Self::read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
let words = read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
let Some(words) = words else { return Err(Error::InvalidIndex); };
let file_end = file_size - hdr.idx_offset.us();
let idx_end = file_size - hdr.idx_offset.us();
let mut ihdr = IndexHeader::default();
file.seek(std::io::SeekFrom::Start(hdr.idx_offset.read() as u64))?;
file.read_exact(ihdr.as_bytes_mut())?;
ihdr.validate(file_end)?;
ihdr.validate(idx_end)?;
let index_a = Self::read_vec(
let index_a = read_vec(
&mut file,
ihdr.index_a_offset.us(),
ihdr.index_b_offset.us(),
)?;
Self::check_vec_len(&index_a)?;
let index_b = Self::read_vec(
let index_b = read_vec(
&mut file,
ihdr.index_b_offset.us(),
ihdr.index_c_offset.us(),
)?;
Self::check_vec_len(&index_b)?;
let index_c = Self::read_vec(
let index_c = read_vec(
&mut file,
ihdr.index_c_offset.us(),
ihdr.index_d_offset.us(),
)?;
Self::check_vec_len(&index_c)?;
let index_d = Self::read_vec(&mut file, ihdr.index_d_offset.us(), file_end)?;
let index_d = read_vec(&mut file, ihdr.index_d_offset.us(), idx_end)?;
Self::check_vec_len(&index_d)?;
Ok(Keys {

View file

@ -5,9 +5,11 @@ mod error;
mod key;
mod pages;
mod resource;
mod headline;
pub use audio::Audio;
pub use dict::MonokakidoDict;
pub use error::Error;
pub use key::{KeyIndex, Keys, PageItemId};
pub use pages::{Pages, XmlParser};
pub use headline::{Headlines};