Dict exploding works

This commit is contained in:
Pyry Kontio 2023-02-01 04:39:15 +09:00
parent 5287f1b493
commit 9ded46f49c
No known key found for this signature in database
6 changed files with 111 additions and 70 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
/target
*_out

View file

@ -1,4 +1,4 @@
use std::{path::PathBuf, ops::Range};
use std::{path::PathBuf, ops::Range, fmt::Display};
use crate::{
dict::Paths,
@ -79,7 +79,17 @@ impl Audio {
}
}
#[derive(Debug)]
pub enum AudioId<'a> {
Str(&'a str),
Num(u32)
}
impl Display for AudioId<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Str(str) => f.write_str(str),
Self::Num(num) => write!(f, "{num:0>10}"),
}
}
}

View file

@ -65,7 +65,7 @@ fn main() {
match result {
Ok((_, pages)) => {
for id in pages {
let page = dict.pages.get(id).unwrap();
let page = dict.pages.get(id.page).unwrap();
println!("{page}");
/*
if let Ok(accent) = get_accents(page) {
@ -101,7 +101,7 @@ fn main() {
print!(" ");
}
} */
println!()
/*
let idx_list = [
@ -178,6 +178,8 @@ fn main() {
}
}
*/
//let mut stdout = stdout().lock();
//stdout.write_all(audio).unwrap();
let mut audio_rsc = dict.audio.unwrap();
let audio = audio_rsc.get("jee").unwrap();
let mut stdout = stdout().lock();
stdout.write_all(audio).unwrap();
}

View file

@ -1,23 +1,41 @@
use std::{
fmt::Write as _,
fs::{create_dir_all, File},
io::Write,
path::Path,
fmt::Write as _,
};
use monokakido::{Error, MonokakidoDict};
use monokakido::{Error, MonokakidoDict, KeyIndex, PageItemId};
fn out_dir(dict: &MonokakidoDict) -> String {
dict.name().to_owned() + "_out/"
}
fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Result<(), Error> {
let mut index_tsv = File::create(out_dir(&dict) + tsv_fname)?;
for i in 0..index.len() {
let (id, pages) = dict.keys.get_idx(index, i)?;
index_tsv.write_all(id.as_bytes())?;
for PageItemId { page, item } in pages {
write!(&mut index_tsv, "\t{page:0>10}")?;
if item > 0 {
write!(&mut index_tsv, ":{item:0>3}")?;
}
}
index_tsv.write_all(b"\n")?;
}
Ok(())
}
fn explode() -> Result<(), Error> {
let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?;
let mut dict = if Path::new(&arg).exists() {
MonokakidoDict::open_with_path(Path::new(&arg))
} else {
MonokakidoDict::open(&arg)
}?;
let pages_dir = "./pages/";
create_dir_all(pages_dir)?;
let mut path = String::from(pages_dir);
let mut dict = MonokakidoDict::open(&arg)?;
let pages_dir = out_dir(&dict) + "pages/";
let audio_dir = out_dir(&dict) + "audio/";
create_dir_all(&pages_dir)?;
let mut path = String::from(&pages_dir);
for idx in dict.pages.idx_iter()? {
let (id, page) = dict.pages.get_by_idx(idx)?;
write!(&mut path, "{id:0>10}.xml")?;
@ -27,17 +45,21 @@ fn explode() -> Result<(), Error> {
}
if let Some(audio) = &mut dict.audio {
let audio_dir = "./audio/";
create_dir_all(audio_dir)?;
let mut path = String::from(audio_dir);
create_dir_all(&audio_dir)?;
let mut path = String::from(&audio_dir);
for idx in audio.idx_iter()? {
let (id, page) = dict.pages.get_by_idx(idx)?;
write!(&mut path, "{id:0>10}.aac")?;
let (id, audio) = audio.get_by_idx(idx)?;
write!(&mut path, "{id}.aac")?;
let mut file = File::create(&path)?;
path.truncate(pages_dir.len());
file.write_all(page.as_bytes())?;
path.truncate(audio_dir.len());
file.write_all(audio)?;
}
}
write_index(&dict, &dict.keys.index_len, "index_len.tsv")?;
write_index(&dict, &dict.keys.index_prefix, "index_prefix.tsv")?;
write_index(&dict, &dict.keys.index_suffix, "index_suffix.tsv")?;
write_index(&dict, &dict.keys.index_d, "index_d.tsv")?;
Ok(())
}

View file

@ -82,12 +82,30 @@ mod abi {
}
use abi::{FileHeader, IndexHeader};
#[derive(Debug)]
pub struct KeyIndex {
index: Option<Vec<LE32>>
}
pub struct Keys {
words: Vec<LE32>,
index_len: Option<Vec<LE32>>,
index_prefix: Option<Vec<LE32>>,
index_suffix: Option<Vec<LE32>>,
index_d: Option<Vec<LE32>>,
pub index_len: KeyIndex,
pub index_prefix: KeyIndex,
pub index_suffix: KeyIndex,
pub index_d: KeyIndex,
}
impl KeyIndex {
fn get(&self, i: usize) -> Result<usize, Error> {
let Some(index) = &self.index else { return Err(Error::IndexDoesntExist) };
let i = i + 1; // Because the the index is prefixed by its legth
if i >= index.len() { return Err(Error::InvalidIndex) }
Ok(index[i].us())
}
pub fn len(&self) -> usize {
self.index.as_ref().map(|v| v.len()).unwrap_or(0) - 1
}
}
impl Keys {
@ -110,7 +128,7 @@ impl Keys {
Ok(())
}
pub(crate) fn new(paths: &Paths) -> Result<Keys, Error> {
pub fn new(paths: &Paths) -> Result<Keys, Error> {
let mut file = File::open(paths.headword_key_path())?;
let file_size = file.metadata()?.len() as usize;
let mut hdr = FileHeader::default();
@ -153,18 +171,13 @@ impl Keys {
Ok(Keys {
words,
index_len: index_a,
index_prefix: index_b,
index_suffix: index_c,
index_d,
index_len: KeyIndex { index: index_a },
index_prefix: KeyIndex { index: index_b },
index_suffix: KeyIndex { index: index_c },
index_d: KeyIndex { index: index_d },
})
}
pub fn count(&self) -> usize {
// USE INVARIANT A
self.words[0].us()
}
fn get_page_iter(&self, pages_offset: usize) -> Result<PageIter, Error> {
let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..];
PageIter::new(pages)
@ -185,8 +198,7 @@ impl Keys {
}
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
let offset = index[idx + 1].us() + size_of::<LE32>() + 1;
let offset = self.index_prefix.get(idx)? + size_of::<LE32>() + 1;
let words_bytes = LE32::slice_as_bytes(&self.words);
if words_bytes.len() < offset + target.len() + 1 {
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
@ -205,39 +217,20 @@ impl Keys {
})
}
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
if idx >= self.count() {
pub fn get_idx(&self, index: &KeyIndex, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
if idx >= index.len() {
return Err(Error::NotFound);
}
let word_offset = index[idx + 1].us();
// TODO: Why is this indexing ok?
let word_offset = index.get(idx)?;
let (word, pages_offset) = self.get_word_span(word_offset)?;
let pages = self.get_page_iter(pages_offset)?;
Ok((word, pages))
}
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
self.get_inner(index, idx)
}
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
let Some(index) = &self.index_prefix else { return Err(Error::IndexDoesntExist) };
self.get_inner(index, idx)
}
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
let Some(index) = &self.index_suffix else { return Err(Error::IndexDoesntExist) };
self.get_inner(index, idx)
}
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
let Some(index) = &self.index_d else { return Err(Error::IndexDoesntExist) };
self.get_inner(index, idx)
}
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
let target_key = &to_katakana(target_key);
let mut high = self.count();
let mut high = self.index_prefix.len();
let mut low = 0;
// TODO: Revise corner cases and add tests for this binary search
@ -249,7 +242,7 @@ impl Keys {
match cmp {
Ordering::Less => low = mid + 1,
Ordering::Greater => high = mid - 1,
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
Ordering::Equal => return Ok((mid, self.get_idx(&self.index_prefix, mid)?.1)),
}
}
@ -309,8 +302,10 @@ impl<'a> PageIter<'a> {
&[1, _, ref t @ ..] => tail = t,
&[2, _, _, ref t @ ..] => tail = t,
&[4, _, _, _, ref t @ ..] => tail = t,
&[17, _, _, ref t @ ..] => tail = t,
&[18, _, _, _, ref t @ ..] => tail = t,
e => {
dbg!("hmm", &e[..100]);
dbg!("hmm", &e[..100]); // TODO: clean this up
return Err(Error::InvalidIndex);
}
}
@ -324,16 +319,18 @@ impl<'a> PageIter<'a> {
}
impl<'a> Iterator for PageIter<'a> {
type Item = u32;
type Item = PageItemId;
fn next(&mut self) -> Option<Self::Item> {
// USE INVARIANT B: `self.span` is checked to conform to this shape,
// so unreachable is never reached. `self.count` is also checked to correspond,
// so overflow never happens.
let (id, tail) = match self.span {
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
&[1, hi, ref tail @ ..] => (pid([0, 0, hi], 0), tail),
&[2, hi, lo, ref tail @ ..] => (pid([0, hi, lo], 0), tail),
&[4, hi, mid, lo, ref tail @ ..] => (pid([hi, mid, lo], 0), tail),
&[17, hi, item, ref tail @ ..] => (pid([0, 0, hi], item), tail),
&[18, hi, lo, item, ref tail @ ..] => (pid([0, hi, lo], item), tail),
&[] => return None,
_ => unreachable!(),
};
@ -342,3 +339,12 @@ impl<'a> Iterator for PageIter<'a> {
Some(id)
}
}
pub struct PageItemId {
pub page: u32,
pub item: u8,
}
fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId {
PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item }
}

View file

@ -9,5 +9,5 @@ mod resource;
pub use audio::Audio;
pub use dict::MonokakidoDict;
pub use error::Error;
pub use key::Keys;
pub use key::{Keys, KeyIndex, PageItemId};
pub use pages::Pages;