Dict exploding works

Pyry Kontio 2023-02-01 04:39:15 +09:00
parent 5287f1b493
commit 9ded46f49c
6 changed files with 111 additions and 70 deletions

.gitignore
View file

@@ -1 +1,2 @@
 /target
+*_out

View file

@@ -1,4 +1,4 @@
-use std::{path::PathBuf, ops::Range};
+use std::{path::PathBuf, ops::Range, fmt::Display};
 use crate::{
     dict::Paths,
@@ -79,7 +79,17 @@ impl Audio {
     }
 }
 
+#[derive(Debug)]
 pub enum AudioId<'a> {
     Str(&'a str),
     Num(u32)
 }
+
+impl Display for AudioId<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Str(str) => f.write_str(str),
+            Self::Num(num) => write!(f, "{num:0>10}"),
+        }
+    }
+}
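For illustration, a minimal self-contained sketch of what the new Display impl produces; the enum is repeated locally and the sample IDs are invented:

    use std::fmt::{self, Display};

    // Mirrors the AudioId enum and Display impl in the diff above, for illustration only.
    #[derive(Debug)]
    enum AudioId<'a> {
        Str(&'a str),
        Num(u32),
    }

    impl Display for AudioId<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            match self {
                Self::Str(s) => f.write_str(s),
                Self::Num(n) => write!(f, "{n:0>10}"),
            }
        }
    }

    fn main() {
        // Numeric IDs are zero-padded to ten digits; string IDs pass through unchanged.
        assert_eq!(AudioId::Num(42).to_string(), "0000000042");
        assert_eq!(AudioId::Str("20190530a").to_string(), "20190530a");
    }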

View file

@@ -65,7 +65,7 @@ fn main() {
     match result {
         Ok((_, pages)) => {
            for id in pages {
-                let page = dict.pages.get(id).unwrap();
+                let page = dict.pages.get(id.page).unwrap();
                println!("{page}");
                /*
                if let Ok(accent) = get_accents(page) {
@@ -101,7 +101,7 @@ fn main() {
                        print!(" ");
                    }
                } */
-
+                println!()
 
                /*
                let idx_list = [
@@ -178,6 +178,8 @@ fn main() {
                    }
                }
                */
-                //let mut stdout = stdout().lock();
-                //stdout.write_all(audio).unwrap();
+                let mut audio_rsc = dict.audio.unwrap();
+                let audio = audio_rsc.get("jee").unwrap();
+                let mut stdout = stdout().lock();
+                stdout.write_all(audio).unwrap();
            }

View file

@@ -1,23 +1,41 @@
 use std::{
-    fmt::Write as _,
     fs::{create_dir_all, File},
     io::Write,
-    path::Path,
+    fmt::Write as _,
 };
 
-use monokakido::{Error, MonokakidoDict};
+use monokakido::{Error, MonokakidoDict, KeyIndex, PageItemId};
+
+fn out_dir(dict: &MonokakidoDict) -> String {
+    dict.name().to_owned() + "_out/"
+}
+
+fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Result<(), Error> {
+    let mut index_tsv = File::create(out_dir(&dict) + tsv_fname)?;
+    for i in 0..index.len() {
+        let (id, pages) = dict.keys.get_idx(index, i)?;
+        index_tsv.write_all(id.as_bytes())?;
+        for PageItemId { page, item } in pages {
+            write!(&mut index_tsv, "\t{page:0>10}")?;
+            if item > 0 {
+                write!(&mut index_tsv, ":{item:0>3}")?;
+            }
+        }
+        index_tsv.write_all(b"\n")?;
+    }
+    Ok(())
+}
 
 fn explode() -> Result<(), Error> {
     let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?;
-    let mut dict = if Path::new(&arg).exists() {
-        MonokakidoDict::open_with_path(Path::new(&arg))
-    } else {
-        MonokakidoDict::open(&arg)
-    }?;
-    let pages_dir = "./pages/";
-    create_dir_all(pages_dir)?;
-    let mut path = String::from(pages_dir);
+    let mut dict = MonokakidoDict::open(&arg)?;
+
+    let pages_dir = out_dir(&dict) + "pages/";
+    let audio_dir = out_dir(&dict) + "audio/";
+
+    create_dir_all(&pages_dir)?;
+    let mut path = String::from(&pages_dir);
 
     for idx in dict.pages.idx_iter()? {
         let (id, page) = dict.pages.get_by_idx(idx)?;
         write!(&mut path, "{id:0>10}.xml")?;
@@ -27,17 +45,21 @@ fn explode() -> Result<(), Error> {
     }
 
     if let Some(audio) = &mut dict.audio {
-        let audio_dir = "./audio/";
-        create_dir_all(audio_dir)?;
-        let mut path = String::from(audio_dir);
+        create_dir_all(&audio_dir)?;
+        let mut path = String::from(&audio_dir);
         for idx in audio.idx_iter()? {
-            let (id, page) = dict.pages.get_by_idx(idx)?;
-            write!(&mut path, "{id:0>10}.aac")?;
+            let (id, audio) = audio.get_by_idx(idx)?;
+            write!(&mut path, "{id}.aac")?;
             let mut file = File::create(&path)?;
-            path.truncate(pages_dir.len());
-            file.write_all(page.as_bytes())?;
+            path.truncate(audio_dir.len());
+            file.write_all(audio)?;
         }
     }
 
+    write_index(&dict, &dict.keys.index_len, "index_len.tsv")?;
+    write_index(&dict, &dict.keys.index_prefix, "index_prefix.tsv")?;
+    write_index(&dict, &dict.keys.index_suffix, "index_suffix.tsv")?;
+    write_index(&dict, &dict.keys.index_d, "index_d.tsv")?;
+
     Ok(())
 }
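For reference, a small self-contained sketch of the row format that write_index emits: the headword, then one tab-separated, zero-padded page reference per hit, with the item suffix printed only when non-zero. All values below are made up:

    // Builds one TSV row the same way write_index above formats it; example data is invented.
    fn tsv_row(key: &str, hits: &[(u32, u8)]) -> String {
        let mut row = String::from(key);
        for &(page, item) in hits {
            row.push_str(&format!("\t{page:0>10}"));
            if item > 0 {
                row.push_str(&format!(":{item:0>3}"));
            }
        }
        row
    }

    fn main() {
        // Prints: あい	0000001234	0000005678:002
        println!("{}", tsv_row("あい", &[(1234, 0), (5678, 2)]));
    }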

View file

@@ -82,12 +82,30 @@ mod abi {
 }
 use abi::{FileHeader, IndexHeader};
 
+#[derive(Debug)]
+pub struct KeyIndex {
+    index: Option<Vec<LE32>>
+}
+
 pub struct Keys {
     words: Vec<LE32>,
-    index_len: Option<Vec<LE32>>,
-    index_prefix: Option<Vec<LE32>>,
-    index_suffix: Option<Vec<LE32>>,
-    index_d: Option<Vec<LE32>>,
+    pub index_len: KeyIndex,
+    pub index_prefix: KeyIndex,
+    pub index_suffix: KeyIndex,
+    pub index_d: KeyIndex,
+}
+
+impl KeyIndex {
+    fn get(&self, i: usize) -> Result<usize, Error> {
+        let Some(index) = &self.index else { return Err(Error::IndexDoesntExist) };
+        let i = i + 1; // Because the index is prefixed by its length
+        if i >= index.len() { return Err(Error::InvalidIndex) }
+        Ok(index[i].us())
+    }
+
+    pub fn len(&self) -> usize {
+        self.index.as_ref().map(|v| v.len()).unwrap_or(0) - 1
+    }
 }
 
 impl Keys {
@@ -110,7 +128,7 @@ impl Keys {
        Ok(())
    }
 
-    pub(crate) fn new(paths: &Paths) -> Result<Keys, Error> {
+    pub fn new(paths: &Paths) -> Result<Keys, Error> {
        let mut file = File::open(paths.headword_key_path())?;
        let file_size = file.metadata()?.len() as usize;
        let mut hdr = FileHeader::default();
@@ -153,18 +171,13 @@ impl Keys {
        Ok(Keys {
            words,
-            index_len: index_a,
-            index_prefix: index_b,
-            index_suffix: index_c,
-            index_d,
+            index_len: KeyIndex { index: index_a },
+            index_prefix: KeyIndex { index: index_b },
+            index_suffix: KeyIndex { index: index_c },
+            index_d: KeyIndex { index: index_d },
        })
    }
 
-    pub fn count(&self) -> usize {
-        // USE INVARIANT A
-        self.words[0].us()
-    }
-
    fn get_page_iter(&self, pages_offset: usize) -> Result<PageIter, Error> {
        let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..];
        PageIter::new(pages)
    }
@@ -185,8 +198,7 @@ impl Keys {
    }
 
    pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
-        let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
-        let offset = index[idx + 1].us() + size_of::<LE32>() + 1;
+        let offset = self.index_prefix.get(idx)? + size_of::<LE32>() + 1;
        let words_bytes = LE32::slice_as_bytes(&self.words);
        if words_bytes.len() < offset + target.len() + 1 {
            return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
@@ -205,39 +217,20 @@ impl Keys {
        })
    }
 
-    fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        if idx >= self.count() {
+    pub fn get_idx(&self, index: &KeyIndex, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
+        if idx >= index.len() {
            return Err(Error::NotFound);
        }
-        let word_offset = index[idx + 1].us();
+        // TODO: Why is this indexing ok?
+        let word_offset = index.get(idx)?;
        let (word, pages_offset) = self.get_word_span(word_offset)?;
        let pages = self.get_page_iter(pages_offset)?;
        Ok((word, pages))
    }
 
-    pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_prefix else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_suffix else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_d else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
    pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
        let target_key = &to_katakana(target_key);
-        let mut high = self.count();
+        let mut high = self.index_prefix.len();
        let mut low = 0;
 
        // TODO: Revise corner cases and add tests for this binary search
@@ -249,7 +242,7 @@ impl Keys {
            match cmp {
                Ordering::Less => low = mid + 1,
                Ordering::Greater => high = mid - 1,
-                Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
+                Ordering::Equal => return Ok((mid, self.get_idx(&self.index_prefix, mid)?.1)),
            }
        }
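Taken together, a hedged usage sketch of the reworked key API: open a dictionary, search a headword, and walk the resulting PageItemId hits. The dictionary name "NHKACCENT2" and the key "あい" are placeholders, and the sketch assumes these items are reachable from outside the crate; it is not an excerpt of this repository:

    use monokakido::{Error, MonokakidoDict, PageItemId};

    fn lookup() -> Result<(), Error> {
        let mut dict = MonokakidoDict::open("NHKACCENT2")?;
        // search_exact returns the index position and an iterator over page hits.
        let (_idx, pages) = dict.keys.search_exact("あい")?;
        for PageItemId { page, item } in pages {
            // Page XML is looked up by page number; `item` narrows to an entry within the page.
            let xml = dict.pages.get(page).unwrap();
            println!("{page:0>10}:{item:0>3}\n{xml}");
        }
        Ok(())
    }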
@@ -309,8 +302,10 @@ impl<'a> PageIter<'a> {
                &[1, _, ref t @ ..] => tail = t,
                &[2, _, _, ref t @ ..] => tail = t,
                &[4, _, _, _, ref t @ ..] => tail = t,
+                &[17, _, _, ref t @ ..] => tail = t,
+                &[18, _, _, _, ref t @ ..] => tail = t,
                e => {
-                    dbg!("hmm", &e[..100]);
+                    dbg!("hmm", &e[..100]); // TODO: clean this up
                    return Err(Error::InvalidIndex);
                }
            }
@@ -324,16 +319,18 @@ impl<'a> PageIter<'a> {
 }
 
 impl<'a> Iterator for PageIter<'a> {
-    type Item = u32;
+    type Item = PageItemId;
 
    fn next(&mut self) -> Option<Self::Item> {
        // USE INVARIANT B: `self.span` is checked to conform to this shape,
        // so unreachable is never reached. `self.count` is also checked to correspond,
        // so overflow never happens.
        let (id, tail) = match self.span {
-            &[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
-            &[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
-            &[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
+            &[1, hi, ref tail @ ..] => (pid([0, 0, hi], 0), tail),
+            &[2, hi, lo, ref tail @ ..] => (pid([0, hi, lo], 0), tail),
+            &[4, hi, mid, lo, ref tail @ ..] => (pid([hi, mid, lo], 0), tail),
+            &[17, hi, item, ref tail @ ..] => (pid([0, 0, hi], item), tail),
+            &[18, hi, lo, item, ref tail @ ..] => (pid([0, hi, lo], item), tail),
            &[] => return None,
            _ => unreachable!(),
        };
@@ -342,3 +339,12 @@ impl<'a> Iterator for PageIter<'a> {
        Some(id)
    }
 }
+
+pub struct PageItemId {
+    pub page: u32,
+    pub item: u8,
+}
+
+fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId {
+    PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item }
+}
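As a standalone illustration of the new ID scheme, a small sketch mirroring PageItemId and the pid helper above: the page number is assembled big-endian from up to three bytes, and the new 17/18 tag formats additionally carry an item index (which write_index only prints when non-zero). The byte values are made up:

    // Mirrors the types added in the diff above, for illustration only.
    struct PageItemId { page: u32, item: u8 }

    fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId {
        PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item }
    }

    fn main() {
        // Tag 2 style: two page bytes, no item index.
        assert_eq!(pid([0, 0x01, 0x02], 0).page, 0x0102);
        // Tag 18 style: the same two page bytes plus an item index.
        let id = pid([0, 0x01, 0x02], 3);
        assert_eq!((id.page, id.item), (0x0102, 3));
    }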

View file

@@ -9,5 +9,5 @@ mod resource;
 pub use audio::Audio;
 pub use dict::MonokakidoDict;
 pub use error::Error;
-pub use key::Keys;
+pub use key::{Keys, KeyIndex, PageItemId};
 pub use pages::Pages;