Dict exploding works

parent 5287f1b493
commit 9ded46f49c

.gitignore (vendored): 1 change

@@ -1 +1,2 @@
 /target
+*_out
src/audio.rs: 12 changes

@@ -1,4 +1,4 @@
-use std::{path::PathBuf, ops::Range};
+use std::{path::PathBuf, ops::Range, fmt::Display};

 use crate::{
     dict::Paths,
@@ -79,7 +79,17 @@ impl Audio {
     }
 }

+#[derive(Debug)]
+pub enum AudioId<'a> {
+    Str(&'a str),
+    Num(u32)
+}
+
+impl Display for AudioId<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Str(str) => f.write_str(str),
+            Self::Num(num) => write!(f, "{num:0>10}"),
+        }
+    }
+}
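The new AudioId type gives audio keys a uniform textual form. Below is a minimal sketch of the expected formatting, assuming it sits in src/audio.rs next to the type; the test module and the concrete values are illustrative and not part of the commit.

```rust
// Illustrative only: exercises the Display impl added above.
#[cfg(test)]
mod audio_id_format_tests {
    use super::AudioId;

    #[test]
    fn formats_str_and_num() {
        // Str passes the borrowed string through unchanged.
        assert_eq!(AudioId::Str("jee").to_string(), "jee");
        // Num is zero-padded to ten digits, like the page ids elsewhere.
        assert_eq!(AudioId::Num(1234).to_string(), "0000001234");
    }
}
```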
@@ -65,7 +65,7 @@ fn main() {
     match result {
         Ok((_, pages)) => {
             for id in pages {
-                let page = dict.pages.get(id).unwrap();
+                let page = dict.pages.get(id.page).unwrap();
                 println!("{page}");
                 /*
                 if let Ok(accent) = get_accents(page) {
@@ -101,7 +101,7 @@ fn main() {
                     print!(" ");
-                }
+                } */
                 println!()


             /*
             let idx_list = [
@@ -178,6 +178,8 @@ fn main() {
         }
     }
     */
-    //let mut stdout = stdout().lock();
-    //stdout.write_all(audio).unwrap();
+    let mut audio_rsc = dict.audio.unwrap();
+    let audio = audio_rsc.get("jee").unwrap();
+    let mut stdout = stdout().lock();
+    stdout.write_all(audio).unwrap();
 }
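Since PageIter now yields PageItemId values rather than bare u32 page numbers (see the src/key.rs changes below), callers pick out the page field before indexing into the page store. A minimal sketch of the new lookup pattern, with the helper function, search key, and unwrap-style error handling assumed rather than taken from the commit:

```rust
use monokakido::MonokakidoDict;

// Hypothetical helper, not part of the commit; it mirrors the changed lines in main() above.
fn print_pages_for(dict: &MonokakidoDict, key: &str) {
    // search_exact returns the match position and an iterator of PageItemId.
    let (_, pages) = dict.keys.search_exact(key).unwrap();
    for id in pages {
        // Pages are now addressed by the `page` field of PageItemId.
        let page = dict.pages.get(id.page).unwrap();
        println!("{page}");
    }
}
```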
@@ -1,23 +1,41 @@
 use std::{
-    fmt::Write as _,
     fs::{create_dir_all, File},
     io::Write,
+    path::Path,
+    fmt::Write as _,
 };

-use monokakido::{Error, MonokakidoDict};
+use monokakido::{Error, MonokakidoDict, KeyIndex, PageItemId};

+fn out_dir(dict: &MonokakidoDict) -> String {
+    dict.name().to_owned() + "_out/"
+}
+
+fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Result<(), Error> {
+    let mut index_tsv = File::create(out_dir(&dict) + tsv_fname)?;
+    for i in 0..index.len() {
+        let (id, pages) = dict.keys.get_idx(index, i)?;
+        index_tsv.write_all(id.as_bytes())?;
+        for PageItemId { page, item } in pages {
+            write!(&mut index_tsv, "\t{page:0>10}")?;
+            if item > 0 {
+                write!(&mut index_tsv, ":{item:0>3}")?;
+            }
+        }
+        index_tsv.write_all(b"\n")?;
+    }
+    Ok(())
+}
+
 fn explode() -> Result<(), Error> {
     let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?;

+    let mut dict = if Path::new(&arg).exists() {
+        MonokakidoDict::open_with_path(Path::new(&arg))
+    } else {
+        MonokakidoDict::open(&arg)
+    }?;
-    let pages_dir = "./pages/";
-    create_dir_all(pages_dir)?;
-    let mut path = String::from(pages_dir);
-    let mut dict = MonokakidoDict::open(&arg)?;

+    let pages_dir = out_dir(&dict) + "pages/";
+    let audio_dir = out_dir(&dict) + "audio/";
+
+    create_dir_all(&pages_dir)?;
+    let mut path = String::from(&pages_dir);
     for idx in dict.pages.idx_iter()? {
         let (id, page) = dict.pages.get_by_idx(idx)?;
         write!(&mut path, "{id:0>10}.xml")?;
@@ -27,17 +45,21 @@ fn explode() -> Result<(), Error> {
     }

     if let Some(audio) = &mut dict.audio {
-        let audio_dir = "./audio/";
-        create_dir_all(audio_dir)?;
-        let mut path = String::from(audio_dir);
+        create_dir_all(&audio_dir)?;
+        let mut path = String::from(&audio_dir);
         for idx in audio.idx_iter()? {
-            let (id, page) = dict.pages.get_by_idx(idx)?;
-            write!(&mut path, "{id:0>10}.aac")?;
+            let (id, audio) = audio.get_by_idx(idx)?;
+            write!(&mut path, "{id}.aac")?;
             let mut file = File::create(&path)?;
-            path.truncate(pages_dir.len());
-            file.write_all(page.as_bytes())?;
+            path.truncate(audio_dir.len());
+            file.write_all(audio)?;
         }
     }

+    write_index(&dict, &dict.keys.index_len, "index_len.tsv")?;
+    write_index(&dict, &dict.keys.index_prefix, "index_prefix.tsv")?;
+    write_index(&dict, &dict.keys.index_suffix, "index_suffix.tsv")?;
+    write_index(&dict, &dict.keys.index_d, "index_d.tsv")?;
     Ok(())
 }
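For reference, a small sketch (not part of the commit) of the row format write_index produces: the headword, then one tab-separated entry per page, with the item number appended only when it is nonzero. The key and ids below are made up for illustration.

```rust
fn main() {
    // Build one TSV row the same way write_index formats it.
    let mut row = String::from("example_key");
    for (page, item) in [(12345u32, 0u8), (67890, 2)] {
        row += &format!("\t{page:0>10}");
        if item > 0 {
            row += &format!(":{item:0>3}");
        }
    }
    assert_eq!(row, "example_key\t0000012345\t0000067890:002");
    println!("{row}");
}
```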
src/key.rs: 98 changes

@@ -82,12 +82,30 @@ mod abi {
 }
 use abi::{FileHeader, IndexHeader};

+#[derive(Debug)]
+pub struct KeyIndex {
+    index: Option<Vec<LE32>>
+}
+
 pub struct Keys {
     words: Vec<LE32>,
-    index_len: Option<Vec<LE32>>,
-    index_prefix: Option<Vec<LE32>>,
-    index_suffix: Option<Vec<LE32>>,
-    index_d: Option<Vec<LE32>>,
+    pub index_len: KeyIndex,
+    pub index_prefix: KeyIndex,
+    pub index_suffix: KeyIndex,
+    pub index_d: KeyIndex,
 }

+impl KeyIndex {
+    fn get(&self, i: usize) -> Result<usize, Error> {
+        let Some(index) = &self.index else { return Err(Error::IndexDoesntExist) };
+        let i = i + 1; // Because the index is prefixed by its length
+        if i >= index.len() { return Err(Error::InvalidIndex) }
+        Ok(index[i].us())
+    }
+
+    pub fn len(&self) -> usize {
+        self.index.as_ref().map(|v| v.len()).unwrap_or(0) - 1
+    }
+}
+
 impl Keys {
@@ -110,7 +128,7 @@ impl Keys {
         Ok(())
     }

-    pub(crate) fn new(paths: &Paths) -> Result<Keys, Error> {
+    pub fn new(paths: &Paths) -> Result<Keys, Error> {
         let mut file = File::open(paths.headword_key_path())?;
         let file_size = file.metadata()?.len() as usize;
         let mut hdr = FileHeader::default();
@@ -153,18 +171,13 @@ impl Keys {

         Ok(Keys {
             words,
-            index_len: index_a,
-            index_prefix: index_b,
-            index_suffix: index_c,
-            index_d,
+            index_len: KeyIndex { index: index_a },
+            index_prefix: KeyIndex { index: index_b },
+            index_suffix: KeyIndex { index: index_c },
+            index_d: KeyIndex { index: index_d },
         })
     }

-    pub fn count(&self) -> usize {
-        // USE INVARIANT A
-        self.words[0].us()
-    }
-
     fn get_page_iter(&self, pages_offset: usize) -> Result<PageIter, Error> {
         let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..];
         PageIter::new(pages)
@@ -185,8 +198,7 @@ impl Keys {
     }

     pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
-        let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
-        let offset = index[idx + 1].us() + size_of::<LE32>() + 1;
+        let offset = self.index_prefix.get(idx)? + size_of::<LE32>() + 1;
         let words_bytes = LE32::slice_as_bytes(&self.words);
         if words_bytes.len() < offset + target.len() + 1 {
             return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
@@ -205,39 +217,20 @@ impl Keys {
         })
     }

-    fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        if idx >= self.count() {
+    pub fn get_idx(&self, index: &KeyIndex, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
+        if idx >= index.len() {
             return Err(Error::NotFound);
         }
-        let word_offset = index[idx + 1].us();
-        // TODO: Why is this indexing ok?
+        let word_offset = index.get(idx)?;
         let (word, pages_offset) = self.get_word_span(word_offset)?;
         let pages = self.get_page_iter(pages_offset)?;
         Ok((word, pages))
     }

-    pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_prefix else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_suffix else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
-    pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
-        let Some(index) = &self.index_d else { return Err(Error::IndexDoesntExist) };
-        self.get_inner(index, idx)
-    }
-
     pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
         let target_key = &to_katakana(target_key);
-        let mut high = self.count();
+        let mut high = self.index_prefix.len();
         let mut low = 0;

         // TODO: Revise corner cases and add tests for this binary search
@@ -249,7 +242,7 @@ impl Keys {
             match cmp {
                 Ordering::Less => low = mid + 1,
                 Ordering::Greater => high = mid - 1,
-                Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
+                Ordering::Equal => return Ok((mid, self.get_idx(&self.index_prefix, mid)?.1)),
             }
         }

@@ -309,8 +302,10 @@ impl<'a> PageIter<'a> {
             &[1, _, ref t @ ..] => tail = t,
             &[2, _, _, ref t @ ..] => tail = t,
             &[4, _, _, _, ref t @ ..] => tail = t,
+            &[17, _, _, ref t @ ..] => tail = t,
+            &[18, _, _, _, ref t @ ..] => tail = t,
             e => {
-                dbg!("hmm", &e[..100]);
+                dbg!("hmm", &e[..100]); // TODO: clean this up
                 return Err(Error::InvalidIndex);
             }
         }
@@ -324,16 +319,18 @@ impl<'a> PageIter<'a> {
 }

 impl<'a> Iterator for PageIter<'a> {
-    type Item = u32;
+    type Item = PageItemId;

     fn next(&mut self) -> Option<Self::Item> {
         // USE INVARIANT B: `self.span` is checked to conform to this shape,
         // so unreachable is never reached. `self.count` is also checked to correspond,
         // so overflow never happens.
         let (id, tail) = match self.span {
-            &[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
-            &[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
-            &[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
+            &[1, hi, ref tail @ ..] => (pid([0, 0, hi], 0), tail),
+            &[2, hi, lo, ref tail @ ..] => (pid([0, hi, lo], 0), tail),
+            &[4, hi, mid, lo, ref tail @ ..] => (pid([hi, mid, lo], 0), tail),
+            &[17, hi, item, ref tail @ ..] => (pid([0, 0, hi], item), tail),
+            &[18, hi, lo, item, ref tail @ ..] => (pid([0, hi, lo], item), tail),
             &[] => return None,
             _ => unreachable!(),
         };
@@ -342,3 +339,12 @@ impl<'a> Iterator for PageIter<'a> {
         Some(id)
     }
 }
+
+pub struct PageItemId {
+    pub page: u32,
+    pub item: u8,
+}
+
+fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId {
+    PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item }
+}
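A hedged usage sketch of the reworked key API (not from the commit): it walks the prefix index through KeyIndex::len and Keys::get_idx and prints each headword with its PageItemId entries. The dictionary name is a placeholder, and MonokakidoDict::open is assumed to accept it the same way the explode tool above passes its argument.

```rust
use monokakido::{Error, MonokakidoDict, PageItemId};

fn dump_prefix_index() -> Result<(), Error> {
    // Placeholder dictionary name; substitute an installed dictionary.
    let dict = MonokakidoDict::open("SOME_DICT")?;
    // get_idx takes the KeyIndex explicitly, so the same loop works for
    // index_len, index_prefix, index_suffix and index_d.
    let n = dict.keys.index_prefix.len().min(10);
    for i in 0..n {
        let (key, pages) = dict.keys.get_idx(&dict.keys.index_prefix, i)?;
        print!("{key}:");
        for PageItemId { page, item } in pages {
            // Same zero-padded formatting as write_index in the explode tool.
            print!(" {page:0>10}:{item:0>3}");
        }
        println!();
    }
    Ok(())
}
```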
@@ -9,5 +9,5 @@ mod resource;
 pub use audio::Audio;
 pub use dict::MonokakidoDict;
 pub use error::Error;
-pub use key::Keys;
+pub use key::{Keys, KeyIndex, PageItemId};
 pub use pages::Pages;