From 09915cabc5863c40cb54aabec67899aa891e2d0f Mon Sep 17 00:00:00 2001 From: Pyry Kontio Date: Sat, 4 Feb 2023 21:38:42 +0900 Subject: [PATCH] Start implementing headline support. --- README.md | 32 ++++++++++++-- src/abi_utils.rs | 13 ++++++ src/bin/monokakido-cli.rs | 1 + src/dict.rs | 16 ++++++- src/headline.rs | 93 +++++++++++++++++++++++++++++++++++++++ src/key.rs | 33 +++++--------- src/lib.rs | 2 + 7 files changed, 164 insertions(+), 26 deletions(-) create mode 100644 src/headline.rs diff --git a/README.md b/README.md index ea6881c..1012c8f 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,38 @@ A Rust library for parsing and interpreting the [Monokakido](https://www.monokakido.jp/en/dictionaries/app/) dictionary format. Aiming for full test coverage and efficient implementation with minimal dependencies. ## TODO: -- Refactor code for generic "rsc" and "nrsc" support +- Add headline support +- Refactor as a workspace to separate the dependencies of the library and the binaries +- Move to mmap-based indexes +- Add graphics support +- Add TTY detection to CLI (prevent binary output to shell) +- Add proper argument parser lib to CLI +- Refine CLI according to the plan below +- Document the rsc, nrsc and keystore and headline formats +### Test: - Audio using "rsc" (CCCAD, WISDOM3) - Audio using "nrsc" (DAIJISEN2, NHKACCENT2, OALD10, OLDAE, OLEX, OLT, RHEJ, SMK8) - Multiple contents (WISDOM3, OLEX) -- Document the rsc, nrsc and keystore formats -- Split main.rs into "dict exploder" and "dict cli" + + +## CLI (Planned) + +### Tab-separated output formats: +- keyword +- headline +- iid (item id) +- pid (page id) +- aid (audio id) +- gid (graphics id) + +### \n\n separated output formats: +- item +- page + +### binary output formats: +- audio +- graphics + ## Planned to support: - WISDOM3 diff --git a/src/abi_utils.rs b/src/abi_utils.rs index 596c95e..5978ad1 100644 --- a/src/abi_utils.rs +++ b/src/abi_utils.rs @@ -2,6 +2,7 @@ use core::{ mem::{align_of, size_of}, slice, }; +use std::{fs::File, io::Read}; use crate::Error; @@ -34,6 +35,7 @@ impl From for LE32 { } unsafe impl TransmuteSafe for LE32 {} +unsafe impl TransmuteSafe for u8 {} pub(crate) unsafe trait TransmuteSafe: Default + Clone { fn from_buf(buf: &[u8]) -> Result<(&Self, &[u8]), Error> { @@ -83,3 +85,14 @@ pub(crate) unsafe trait TransmuteSafe: Default + Clone { Self::slice_as_bytes(slice::from_ref(self)) } } + +pub(crate) fn read_vec(file: &mut File, start: usize, end: usize) -> Result>, Error> { + if start == 0 || end == 0 { + return Ok(None); + } + // Replace this with div_ceil once it stabilizes + let size = (end - start + size_of::() - 1) / size_of::(); + let mut buf = vec![T::default(); size]; + file.read_exact(T::slice_as_bytes_mut(&mut buf))?; + Ok(Some(buf)) +} diff --git a/src/bin/monokakido-cli.rs b/src/bin/monokakido-cli.rs index cf1a2b7..dfd25f4 100644 --- a/src/bin/monokakido-cli.rs +++ b/src/bin/monokakido-cli.rs @@ -7,6 +7,7 @@ fn print_help() { println!("list - lists all dictionaries installed in the standard path"); println!("list_items {{dict}} {{keyword}} - lists all items"); println!("list_audio {{dict}} {{keyword}} - lists all audio files"); + println!("get_audio {{dict}} {{id}} - writes an audio file to stdout"); println!("help - this help"); } diff --git a/src/dict.rs b/src/dict.rs index eaca303..ecb2d38 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -67,11 +67,25 @@ impl Paths { pb } - pub(crate) fn headword_key_path(&self) -> PathBuf { + pub(crate) fn key_headword_path(&self) -> PathBuf { let mut pb = self.key_path(); pb.push("headword.keystore"); pb } + + pub(crate) fn headline_path(&self) -> PathBuf { + let mut pb = PathBuf::from(&self.base_path); + pb.push("Contents"); + pb.push(&self.contents_dir); + pb.push("headline"); + pb + } + + pub(crate) fn headline_long_path(&self) -> PathBuf { + let mut pb = self.headline_path(); + pb.push("headline.headlinestore"); + pb + } } fn parse_dict_name(fname: &OsStr) -> Option<&str> { diff --git a/src/headline.rs b/src/headline.rs new file mode 100644 index 0000000..25d56a6 --- /dev/null +++ b/src/headline.rs @@ -0,0 +1,93 @@ +use std::{ + fs::File, + io::{Read, Seek}, +}; + +use crate::{ + abi_utils::{TransmuteSafe, LE32, read_vec}, + dict::Paths, + Error, PageItemId, +}; + +mod abi { + use super::*; + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default)] + pub(super) struct FileHeader { + magic1: LE32, + magic2: LE32, + pub len: LE32, + pub rec_offset: LE32, + pub words_offset: LE32, + rec_bytes: LE32, + magic4: LE32, + magic5: LE32, + } + + impl FileHeader { + pub(super) fn validate(&self) -> Result<(), Error> { + if self.magic1.read() == 0 + && self.magic2.read() == 0x2 + && self.rec_bytes.read() == 0x18 + && self.magic4.read() == 0 + && self.magic5.read() == 0 + { + Ok(()) + } else { + Err(Error::KeyFileHeaderValidate) + } + } + } + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default)] + pub(super) struct Offset { + pub page_id: LE32, + pub item_id: u8, + pub item_type: u8, + magic1: u16, + pub offset: LE32, + magic2: LE32, + magic3: LE32, + magic4: LE32, + } + + unsafe impl TransmuteSafe for FileHeader {} + unsafe impl TransmuteSafe for Offset {} +} +use abi::{FileHeader, Offset}; + +pub struct Headlines { + recs: Vec, + words: Vec, +} + +impl Headlines { + pub fn new(paths: &Paths) -> Result { + let mut file = File::open(paths.headline_long_path())?; + let file_size = file.metadata()?.len() as usize; + let mut hdr = FileHeader::default(); + file.read_exact(hdr.as_bytes_mut())?; + hdr.validate()?; + + file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?; + let offsets: Option> = read_vec(&mut file, hdr.rec_offset.us(), hdr.words_offset.us())?; + let Some(recs) = offsets else { return Err(Error::InvalidIndex); }; + + let words: Option> = read_vec(&mut file, hdr.words_offset.us(), file_size)?; + let Some(words) = words else { return Err(Error::InvalidIndex); }; + + Ok(Headlines { + recs, + words, + }) + } + + pub fn get(&self, id: PageItemId) -> Result { + let rec = self.recs.binary_search_by(|rec| + rec.page_id.read().cmp(&id.page).then(rec.item_id.cmp(&id.item)) + ).map_err(|_| Error::InvalidIndex)?; + todo!(); + } +} diff --git a/src/key.rs b/src/key.rs index 4aaa585..0695e95 100644 --- a/src/key.rs +++ b/src/key.rs @@ -8,7 +8,7 @@ use std::{ }; use crate::{ - abi_utils::{TransmuteSafe, LE32}, + abi_utils::{TransmuteSafe, LE32, read_vec}, dict::Paths, Error, }; @@ -59,7 +59,7 @@ mod abi { } impl IndexHeader { - pub(super) fn validate(&self, file_end: usize) -> Result<(), Error> { + pub(super) fn validate(&self, idx_end: usize) -> Result<(), Error> { let a = self.index_a_offset.us(); let b = self.index_b_offset.us(); let c = self.index_c_offset.us(); @@ -69,7 +69,7 @@ mod abi { && check_order(a, b) && check_order(b, c) && check_order(c, d) - && check_order(d, file_end) + && check_order(d, idx_end) { Ok(()) } else { @@ -111,17 +111,6 @@ impl KeyIndex { } impl Keys { - fn read_vec(file: &mut File, start: usize, end: usize) -> Result>, Error> { - if start == 0 || end == 0 { - return Ok(None); - } - // Replace this with div_ceil once it stabilizes - let size = (end - start + size_of::() - 1) / size_of::(); - let mut buf = vec![LE32::default(); size]; - file.read_exact(LE32::slice_as_bytes_mut(&mut buf))?; - Ok(Some(buf)) - } - fn check_vec_len(buf: &Option>) -> Result<(), Error> { let Some(buf) = buf else { return Ok(()) }; if buf.get(0).ok_or(Error::InvalidIndex)?.us() + 1 != buf.len() { @@ -131,44 +120,44 @@ impl Keys { } pub fn new(paths: &Paths) -> Result { - let mut file = File::open(paths.headword_key_path())?; + let mut file = File::open(paths.key_headword_path())?; let file_size = file.metadata()?.len() as usize; let mut hdr = FileHeader::default(); file.read_exact(hdr.as_bytes_mut())?; hdr.validate()?; file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?; - let words = Self::read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?; + let words = read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?; let Some(words) = words else { return Err(Error::InvalidIndex); }; - let file_end = file_size - hdr.idx_offset.us(); + let idx_end = file_size - hdr.idx_offset.us(); let mut ihdr = IndexHeader::default(); file.seek(std::io::SeekFrom::Start(hdr.idx_offset.read() as u64))?; file.read_exact(ihdr.as_bytes_mut())?; - ihdr.validate(file_end)?; + ihdr.validate(idx_end)?; - let index_a = Self::read_vec( + let index_a = read_vec( &mut file, ihdr.index_a_offset.us(), ihdr.index_b_offset.us(), )?; Self::check_vec_len(&index_a)?; - let index_b = Self::read_vec( + let index_b = read_vec( &mut file, ihdr.index_b_offset.us(), ihdr.index_c_offset.us(), )?; Self::check_vec_len(&index_b)?; - let index_c = Self::read_vec( + let index_c = read_vec( &mut file, ihdr.index_c_offset.us(), ihdr.index_d_offset.us(), )?; Self::check_vec_len(&index_c)?; - let index_d = Self::read_vec(&mut file, ihdr.index_d_offset.us(), file_end)?; + let index_d = read_vec(&mut file, ihdr.index_d_offset.us(), idx_end)?; Self::check_vec_len(&index_d)?; Ok(Keys { diff --git a/src/lib.rs b/src/lib.rs index f81d773..0fd5c29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,9 +5,11 @@ mod error; mod key; mod pages; mod resource; +mod headline; pub use audio::Audio; pub use dict::MonokakidoDict; pub use error::Error; pub use key::{KeyIndex, Keys, PageItemId}; pub use pages::{Pages, XmlParser}; +pub use headline::{Headlines};