diff --git a/.gitignore b/.gitignore index ea8c4bf..fd8c9ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +*_out \ No newline at end of file diff --git a/src/audio.rs b/src/audio.rs index 1e34b8a..cd33aff 100644 --- a/src/audio.rs +++ b/src/audio.rs @@ -1,4 +1,4 @@ -use std::{path::PathBuf, ops::Range}; +use std::{path::PathBuf, ops::Range, fmt::Display}; use crate::{ dict::Paths, @@ -79,7 +79,17 @@ impl Audio { } } +#[derive(Debug)] pub enum AudioId<'a> { Str(&'a str), Num(u32) } + +impl Display for AudioId<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Str(str) => f.write_str(str), + Self::Num(num) => write!(f, "{num:0>10}"), + } + } +} diff --git a/src/bin/cli.rs b/src/bin/cli.rs index 560e184..804629e 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -65,7 +65,7 @@ fn main() { match result { Ok((_, pages)) => { for id in pages { - let page = dict.pages.get(id).unwrap(); + let page = dict.pages.get(id.page).unwrap(); println!("{page}"); /* if let Ok(accent) = get_accents(page) { @@ -101,7 +101,7 @@ fn main() { print!(" "); } } */ - println!() + /* let idx_list = [ @@ -178,6 +178,8 @@ fn main() { } } */ - //let mut stdout = stdout().lock(); - //stdout.write_all(audio).unwrap(); + let mut audio_rsc = dict.audio.unwrap(); + let audio = audio_rsc.get("jee").unwrap(); + let mut stdout = stdout().lock(); + stdout.write_all(audio).unwrap(); } diff --git a/src/bin/explode.rs b/src/bin/explode.rs index fa6e67d..1ed7b6e 100644 --- a/src/bin/explode.rs +++ b/src/bin/explode.rs @@ -1,23 +1,41 @@ use std::{ - fmt::Write as _, fs::{create_dir_all, File}, io::Write, - path::Path, + fmt::Write as _, }; -use monokakido::{Error, MonokakidoDict}; +use monokakido::{Error, MonokakidoDict, KeyIndex, PageItemId}; + +fn out_dir(dict: &MonokakidoDict) -> String { + dict.name().to_owned() + "_out/" +} + +fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Result<(), Error> { + let mut index_tsv = File::create(out_dir(&dict) + tsv_fname)?; + for i in 0..index.len() { + let (id, pages) = dict.keys.get_idx(index, i)?; + index_tsv.write_all(id.as_bytes())?; + for PageItemId { page, item } in pages { + write!(&mut index_tsv, "\t{page:0>10}")?; + if item > 0 { + write!(&mut index_tsv, ":{item:0>3}")?; + } + } + index_tsv.write_all(b"\n")?; + } + Ok(()) +} fn explode() -> Result<(), Error> { let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?; - let mut dict = if Path::new(&arg).exists() { - MonokakidoDict::open_with_path(Path::new(&arg)) - } else { - MonokakidoDict::open(&arg) - }?; - let pages_dir = "./pages/"; - create_dir_all(pages_dir)?; - let mut path = String::from(pages_dir); + let mut dict = MonokakidoDict::open(&arg)?; + + let pages_dir = out_dir(&dict) + "pages/"; + let audio_dir = out_dir(&dict) + "audio/"; + + create_dir_all(&pages_dir)?; + let mut path = String::from(&pages_dir); for idx in dict.pages.idx_iter()? { let (id, page) = dict.pages.get_by_idx(idx)?; write!(&mut path, "{id:0>10}.xml")?; @@ -27,17 +45,21 @@ fn explode() -> Result<(), Error> { } if let Some(audio) = &mut dict.audio { - let audio_dir = "./audio/"; - create_dir_all(audio_dir)?; - let mut path = String::from(audio_dir); + create_dir_all(&audio_dir)?; + let mut path = String::from(&audio_dir); for idx in audio.idx_iter()? { - let (id, page) = dict.pages.get_by_idx(idx)?; - write!(&mut path, "{id:0>10}.aac")?; + let (id, audio) = audio.get_by_idx(idx)?; + write!(&mut path, "{id}.aac")?; let mut file = File::create(&path)?; - path.truncate(pages_dir.len()); - file.write_all(page.as_bytes())?; + path.truncate(audio_dir.len()); + file.write_all(audio)?; } } + + write_index(&dict, &dict.keys.index_len, "index_len.tsv")?; + write_index(&dict, &dict.keys.index_prefix, "index_prefix.tsv")?; + write_index(&dict, &dict.keys.index_suffix, "index_suffix.tsv")?; + write_index(&dict, &dict.keys.index_d, "index_d.tsv")?; Ok(()) } diff --git a/src/key.rs b/src/key.rs index 556da0d..d85af70 100644 --- a/src/key.rs +++ b/src/key.rs @@ -82,12 +82,30 @@ mod abi { } use abi::{FileHeader, IndexHeader}; +#[derive(Debug)] +pub struct KeyIndex { + index: Option> +} + pub struct Keys { words: Vec, - index_len: Option>, - index_prefix: Option>, - index_suffix: Option>, - index_d: Option>, + pub index_len: KeyIndex, + pub index_prefix: KeyIndex, + pub index_suffix: KeyIndex, + pub index_d: KeyIndex, +} + +impl KeyIndex { + fn get(&self, i: usize) -> Result { + let Some(index) = &self.index else { return Err(Error::IndexDoesntExist) }; + let i = i + 1; // Because the the index is prefixed by its legth + if i >= index.len() { return Err(Error::InvalidIndex) } + Ok(index[i].us()) + } + + pub fn len(&self) -> usize { + self.index.as_ref().map(|v| v.len()).unwrap_or(0) - 1 + } } impl Keys { @@ -110,7 +128,7 @@ impl Keys { Ok(()) } - pub(crate) fn new(paths: &Paths) -> Result { + pub fn new(paths: &Paths) -> Result { let mut file = File::open(paths.headword_key_path())?; let file_size = file.metadata()?.len() as usize; let mut hdr = FileHeader::default(); @@ -153,18 +171,13 @@ impl Keys { Ok(Keys { words, - index_len: index_a, - index_prefix: index_b, - index_suffix: index_c, - index_d, + index_len: KeyIndex { index: index_a }, + index_prefix: KeyIndex { index: index_b }, + index_suffix: KeyIndex { index: index_c }, + index_d: KeyIndex { index: index_d }, }) } - pub fn count(&self) -> usize { - // USE INVARIANT A - self.words[0].us() - } - fn get_page_iter(&self, pages_offset: usize) -> Result { let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..]; PageIter::new(pages) @@ -185,8 +198,7 @@ impl Keys { } pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result { - let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) }; - let offset = index[idx + 1].us() + size_of::() + 1; + let offset = self.index_prefix.get(idx)? + size_of::() + 1; let words_bytes = LE32::slice_as_bytes(&self.words); if words_bytes.len() < offset + target.len() + 1 { return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead? @@ -205,39 +217,20 @@ impl Keys { }) } - fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> { - if idx >= self.count() { + pub fn get_idx(&self, index: &KeyIndex, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + if idx >= index.len() { return Err(Error::NotFound); } - let word_offset = index[idx + 1].us(); + // TODO: Why is this indexing ok? + let word_offset = index.get(idx)?; let (word, pages_offset) = self.get_word_span(word_offset)?; let pages = self.get_page_iter(pages_offset)?; Ok((word, pages)) } - pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) }; - self.get_inner(index, idx) - } - - pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - let Some(index) = &self.index_prefix else { return Err(Error::IndexDoesntExist) }; - self.get_inner(index, idx) - } - - pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - let Some(index) = &self.index_suffix else { return Err(Error::IndexDoesntExist) }; - self.get_inner(index, idx) - } - - pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - let Some(index) = &self.index_d else { return Err(Error::IndexDoesntExist) }; - self.get_inner(index, idx) - } - pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> { let target_key = &to_katakana(target_key); - let mut high = self.count(); + let mut high = self.index_prefix.len(); let mut low = 0; // TODO: Revise corner cases and add tests for this binary search @@ -249,7 +242,7 @@ impl Keys { match cmp { Ordering::Less => low = mid + 1, Ordering::Greater => high = mid - 1, - Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)), + Ordering::Equal => return Ok((mid, self.get_idx(&self.index_prefix, mid)?.1)), } } @@ -309,8 +302,10 @@ impl<'a> PageIter<'a> { &[1, _, ref t @ ..] => tail = t, &[2, _, _, ref t @ ..] => tail = t, &[4, _, _, _, ref t @ ..] => tail = t, + &[17, _, _, ref t @ ..] => tail = t, + &[18, _, _, _, ref t @ ..] => tail = t, e => { - dbg!("hmm", &e[..100]); + dbg!("hmm", &e[..100]); // TODO: clean this up return Err(Error::InvalidIndex); } } @@ -324,16 +319,18 @@ impl<'a> PageIter<'a> { } impl<'a> Iterator for PageIter<'a> { - type Item = u32; + type Item = PageItemId; fn next(&mut self) -> Option { // USE INVARIANT B: `self.span` is checked to conform to this shape, // so unreachable is never reached. `self.count` is also checked to correspond, // so overflow never happens. let (id, tail) = match self.span { - &[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail), - &[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail), - &[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail), + &[1, hi, ref tail @ ..] => (pid([0, 0, hi], 0), tail), + &[2, hi, lo, ref tail @ ..] => (pid([0, hi, lo], 0), tail), + &[4, hi, mid, lo, ref tail @ ..] => (pid([hi, mid, lo], 0), tail), + &[17, hi, item, ref tail @ ..] => (pid([0, 0, hi], item), tail), + &[18, hi, lo, item, ref tail @ ..] => (pid([0, hi, lo], item), tail), &[] => return None, _ => unreachable!(), }; @@ -342,3 +339,12 @@ impl<'a> Iterator for PageIter<'a> { Some(id) } } + +pub struct PageItemId { + pub page: u32, + pub item: u8, +} + +fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId { + PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item } +} diff --git a/src/lib.rs b/src/lib.rs index 46ddd93..df347be 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,5 +9,5 @@ mod resource; pub use audio::Audio; pub use dict::MonokakidoDict; pub use error::Error; -pub use key::Keys; +pub use key::{Keys, KeyIndex, PageItemId}; pub use pages::Pages;