Dict exploding works
This commit is contained in:
parent
5287f1b493
commit
9ded46f49c
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
||||||
/target
|
/target
|
||||||
|
*_out
|
12
src/audio.rs
12
src/audio.rs
|
@ -1,4 +1,4 @@
|
||||||
use std::{path::PathBuf, ops::Range};
|
use std::{path::PathBuf, ops::Range, fmt::Display};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
dict::Paths,
|
dict::Paths,
|
||||||
|
@ -79,7 +79,17 @@ impl Audio {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum AudioId<'a> {
|
pub enum AudioId<'a> {
|
||||||
Str(&'a str),
|
Str(&'a str),
|
||||||
Num(u32)
|
Num(u32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Display for AudioId<'_> {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::Str(str) => f.write_str(str),
|
||||||
|
Self::Num(num) => write!(f, "{num:0>10}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -65,7 +65,7 @@ fn main() {
|
||||||
match result {
|
match result {
|
||||||
Ok((_, pages)) => {
|
Ok((_, pages)) => {
|
||||||
for id in pages {
|
for id in pages {
|
||||||
let page = dict.pages.get(id).unwrap();
|
let page = dict.pages.get(id.page).unwrap();
|
||||||
println!("{page}");
|
println!("{page}");
|
||||||
/*
|
/*
|
||||||
if let Ok(accent) = get_accents(page) {
|
if let Ok(accent) = get_accents(page) {
|
||||||
|
@ -101,7 +101,7 @@ fn main() {
|
||||||
print!(" ");
|
print!(" ");
|
||||||
}
|
}
|
||||||
} */
|
} */
|
||||||
println!()
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
let idx_list = [
|
let idx_list = [
|
||||||
|
@ -178,6 +178,8 @@ fn main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
//let mut stdout = stdout().lock();
|
let mut audio_rsc = dict.audio.unwrap();
|
||||||
//stdout.write_all(audio).unwrap();
|
let audio = audio_rsc.get("jee").unwrap();
|
||||||
|
let mut stdout = stdout().lock();
|
||||||
|
stdout.write_all(audio).unwrap();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,23 +1,41 @@
|
||||||
use std::{
|
use std::{
|
||||||
fmt::Write as _,
|
|
||||||
fs::{create_dir_all, File},
|
fs::{create_dir_all, File},
|
||||||
io::Write,
|
io::Write,
|
||||||
path::Path,
|
fmt::Write as _,
|
||||||
};
|
};
|
||||||
|
|
||||||
use monokakido::{Error, MonokakidoDict};
|
use monokakido::{Error, MonokakidoDict, KeyIndex, PageItemId};
|
||||||
|
|
||||||
|
fn out_dir(dict: &MonokakidoDict) -> String {
|
||||||
|
dict.name().to_owned() + "_out/"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Result<(), Error> {
|
||||||
|
let mut index_tsv = File::create(out_dir(&dict) + tsv_fname)?;
|
||||||
|
for i in 0..index.len() {
|
||||||
|
let (id, pages) = dict.keys.get_idx(index, i)?;
|
||||||
|
index_tsv.write_all(id.as_bytes())?;
|
||||||
|
for PageItemId { page, item } in pages {
|
||||||
|
write!(&mut index_tsv, "\t{page:0>10}")?;
|
||||||
|
if item > 0 {
|
||||||
|
write!(&mut index_tsv, ":{item:0>3}")?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
index_tsv.write_all(b"\n")?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn explode() -> Result<(), Error> {
|
fn explode() -> Result<(), Error> {
|
||||||
let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?;
|
let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?;
|
||||||
|
|
||||||
let mut dict = if Path::new(&arg).exists() {
|
let mut dict = MonokakidoDict::open(&arg)?;
|
||||||
MonokakidoDict::open_with_path(Path::new(&arg))
|
|
||||||
} else {
|
let pages_dir = out_dir(&dict) + "pages/";
|
||||||
MonokakidoDict::open(&arg)
|
let audio_dir = out_dir(&dict) + "audio/";
|
||||||
}?;
|
|
||||||
let pages_dir = "./pages/";
|
create_dir_all(&pages_dir)?;
|
||||||
create_dir_all(pages_dir)?;
|
let mut path = String::from(&pages_dir);
|
||||||
let mut path = String::from(pages_dir);
|
|
||||||
for idx in dict.pages.idx_iter()? {
|
for idx in dict.pages.idx_iter()? {
|
||||||
let (id, page) = dict.pages.get_by_idx(idx)?;
|
let (id, page) = dict.pages.get_by_idx(idx)?;
|
||||||
write!(&mut path, "{id:0>10}.xml")?;
|
write!(&mut path, "{id:0>10}.xml")?;
|
||||||
|
@ -27,17 +45,21 @@ fn explode() -> Result<(), Error> {
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(audio) = &mut dict.audio {
|
if let Some(audio) = &mut dict.audio {
|
||||||
let audio_dir = "./audio/";
|
create_dir_all(&audio_dir)?;
|
||||||
create_dir_all(audio_dir)?;
|
let mut path = String::from(&audio_dir);
|
||||||
let mut path = String::from(audio_dir);
|
|
||||||
for idx in audio.idx_iter()? {
|
for idx in audio.idx_iter()? {
|
||||||
let (id, page) = dict.pages.get_by_idx(idx)?;
|
let (id, audio) = audio.get_by_idx(idx)?;
|
||||||
write!(&mut path, "{id:0>10}.aac")?;
|
write!(&mut path, "{id}.aac")?;
|
||||||
let mut file = File::create(&path)?;
|
let mut file = File::create(&path)?;
|
||||||
path.truncate(pages_dir.len());
|
path.truncate(audio_dir.len());
|
||||||
file.write_all(page.as_bytes())?;
|
file.write_all(audio)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
write_index(&dict, &dict.keys.index_len, "index_len.tsv")?;
|
||||||
|
write_index(&dict, &dict.keys.index_prefix, "index_prefix.tsv")?;
|
||||||
|
write_index(&dict, &dict.keys.index_suffix, "index_suffix.tsv")?;
|
||||||
|
write_index(&dict, &dict.keys.index_d, "index_d.tsv")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
98
src/key.rs
98
src/key.rs
|
@ -82,12 +82,30 @@ mod abi {
|
||||||
}
|
}
|
||||||
use abi::{FileHeader, IndexHeader};
|
use abi::{FileHeader, IndexHeader};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct KeyIndex {
|
||||||
|
index: Option<Vec<LE32>>
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Keys {
|
pub struct Keys {
|
||||||
words: Vec<LE32>,
|
words: Vec<LE32>,
|
||||||
index_len: Option<Vec<LE32>>,
|
pub index_len: KeyIndex,
|
||||||
index_prefix: Option<Vec<LE32>>,
|
pub index_prefix: KeyIndex,
|
||||||
index_suffix: Option<Vec<LE32>>,
|
pub index_suffix: KeyIndex,
|
||||||
index_d: Option<Vec<LE32>>,
|
pub index_d: KeyIndex,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KeyIndex {
|
||||||
|
fn get(&self, i: usize) -> Result<usize, Error> {
|
||||||
|
let Some(index) = &self.index else { return Err(Error::IndexDoesntExist) };
|
||||||
|
let i = i + 1; // Because the the index is prefixed by its legth
|
||||||
|
if i >= index.len() { return Err(Error::InvalidIndex) }
|
||||||
|
Ok(index[i].us())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.index.as_ref().map(|v| v.len()).unwrap_or(0) - 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Keys {
|
impl Keys {
|
||||||
|
@ -110,7 +128,7 @@ impl Keys {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn new(paths: &Paths) -> Result<Keys, Error> {
|
pub fn new(paths: &Paths) -> Result<Keys, Error> {
|
||||||
let mut file = File::open(paths.headword_key_path())?;
|
let mut file = File::open(paths.headword_key_path())?;
|
||||||
let file_size = file.metadata()?.len() as usize;
|
let file_size = file.metadata()?.len() as usize;
|
||||||
let mut hdr = FileHeader::default();
|
let mut hdr = FileHeader::default();
|
||||||
|
@ -153,18 +171,13 @@ impl Keys {
|
||||||
|
|
||||||
Ok(Keys {
|
Ok(Keys {
|
||||||
words,
|
words,
|
||||||
index_len: index_a,
|
index_len: KeyIndex { index: index_a },
|
||||||
index_prefix: index_b,
|
index_prefix: KeyIndex { index: index_b },
|
||||||
index_suffix: index_c,
|
index_suffix: KeyIndex { index: index_c },
|
||||||
index_d,
|
index_d: KeyIndex { index: index_d },
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count(&self) -> usize {
|
|
||||||
// USE INVARIANT A
|
|
||||||
self.words[0].us()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_page_iter(&self, pages_offset: usize) -> Result<PageIter, Error> {
|
fn get_page_iter(&self, pages_offset: usize) -> Result<PageIter, Error> {
|
||||||
let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..];
|
let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..];
|
||||||
PageIter::new(pages)
|
PageIter::new(pages)
|
||||||
|
@ -185,8 +198,7 @@ impl Keys {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
|
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
|
||||||
let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
|
let offset = self.index_prefix.get(idx)? + size_of::<LE32>() + 1;
|
||||||
let offset = index[idx + 1].us() + size_of::<LE32>() + 1;
|
|
||||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||||
if words_bytes.len() < offset + target.len() + 1 {
|
if words_bytes.len() < offset + target.len() + 1 {
|
||||||
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
|
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
|
||||||
|
@ -205,39 +217,20 @@ impl Keys {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
pub fn get_idx(&self, index: &KeyIndex, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
if idx >= self.count() {
|
if idx >= index.len() {
|
||||||
return Err(Error::NotFound);
|
return Err(Error::NotFound);
|
||||||
}
|
}
|
||||||
let word_offset = index[idx + 1].us();
|
// TODO: Why is this indexing ok?
|
||||||
|
let word_offset = index.get(idx)?;
|
||||||
let (word, pages_offset) = self.get_word_span(word_offset)?;
|
let (word, pages_offset) = self.get_word_span(word_offset)?;
|
||||||
let pages = self.get_page_iter(pages_offset)?;
|
let pages = self.get_page_iter(pages_offset)?;
|
||||||
Ok((word, pages))
|
Ok((word, pages))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
|
||||||
let Some(index) = &self.index_len else { return Err(Error::IndexDoesntExist) };
|
|
||||||
self.get_inner(index, idx)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
|
||||||
let Some(index) = &self.index_prefix else { return Err(Error::IndexDoesntExist) };
|
|
||||||
self.get_inner(index, idx)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
|
||||||
let Some(index) = &self.index_suffix else { return Err(Error::IndexDoesntExist) };
|
|
||||||
self.get_inner(index, idx)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
|
||||||
let Some(index) = &self.index_d else { return Err(Error::IndexDoesntExist) };
|
|
||||||
self.get_inner(index, idx)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
|
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
|
||||||
let target_key = &to_katakana(target_key);
|
let target_key = &to_katakana(target_key);
|
||||||
let mut high = self.count();
|
let mut high = self.index_prefix.len();
|
||||||
let mut low = 0;
|
let mut low = 0;
|
||||||
|
|
||||||
// TODO: Revise corner cases and add tests for this binary search
|
// TODO: Revise corner cases and add tests for this binary search
|
||||||
|
@ -249,7 +242,7 @@ impl Keys {
|
||||||
match cmp {
|
match cmp {
|
||||||
Ordering::Less => low = mid + 1,
|
Ordering::Less => low = mid + 1,
|
||||||
Ordering::Greater => high = mid - 1,
|
Ordering::Greater => high = mid - 1,
|
||||||
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
|
Ordering::Equal => return Ok((mid, self.get_idx(&self.index_prefix, mid)?.1)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -309,8 +302,10 @@ impl<'a> PageIter<'a> {
|
||||||
&[1, _, ref t @ ..] => tail = t,
|
&[1, _, ref t @ ..] => tail = t,
|
||||||
&[2, _, _, ref t @ ..] => tail = t,
|
&[2, _, _, ref t @ ..] => tail = t,
|
||||||
&[4, _, _, _, ref t @ ..] => tail = t,
|
&[4, _, _, _, ref t @ ..] => tail = t,
|
||||||
|
&[17, _, _, ref t @ ..] => tail = t,
|
||||||
|
&[18, _, _, _, ref t @ ..] => tail = t,
|
||||||
e => {
|
e => {
|
||||||
dbg!("hmm", &e[..100]);
|
dbg!("hmm", &e[..100]); // TODO: clean this up
|
||||||
return Err(Error::InvalidIndex);
|
return Err(Error::InvalidIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -324,16 +319,18 @@ impl<'a> PageIter<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator for PageIter<'a> {
|
impl<'a> Iterator for PageIter<'a> {
|
||||||
type Item = u32;
|
type Item = PageItemId;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
// USE INVARIANT B: `self.span` is checked to conform to this shape,
|
// USE INVARIANT B: `self.span` is checked to conform to this shape,
|
||||||
// so unreachable is never reached. `self.count` is also checked to correspond,
|
// so unreachable is never reached. `self.count` is also checked to correspond,
|
||||||
// so overflow never happens.
|
// so overflow never happens.
|
||||||
let (id, tail) = match self.span {
|
let (id, tail) = match self.span {
|
||||||
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
|
&[1, hi, ref tail @ ..] => (pid([0, 0, hi], 0), tail),
|
||||||
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
|
&[2, hi, lo, ref tail @ ..] => (pid([0, hi, lo], 0), tail),
|
||||||
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
|
&[4, hi, mid, lo, ref tail @ ..] => (pid([hi, mid, lo], 0), tail),
|
||||||
|
&[17, hi, item, ref tail @ ..] => (pid([0, 0, hi], item), tail),
|
||||||
|
&[18, hi, lo, item, ref tail @ ..] => (pid([0, hi, lo], item), tail),
|
||||||
&[] => return None,
|
&[] => return None,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
|
@ -342,3 +339,12 @@ impl<'a> Iterator for PageIter<'a> {
|
||||||
Some(id)
|
Some(id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reference to a location yielded by `PageIter`: a page id plus an item
/// number within that page.
///
/// `item` is 0 for references that carry no item number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct PageItemId {
    /// Id of the page.
    pub page: u32,
    /// Item number within the page; 0 when the reference has none.
    pub item: u8,
}
|
||||||
|
|
||||||
|
fn pid([hi, mid, lo]: [u8; 3], item: u8) -> PageItemId {
|
||||||
|
PageItemId { page: u32::from_be_bytes([0, hi, mid, lo]), item }
|
||||||
|
}
|
||||||
|
|
|
@ -9,5 +9,5 @@ mod resource;
|
||||||
pub use audio::Audio;
|
pub use audio::Audio;
|
||||||
pub use dict::MonokakidoDict;
|
pub use dict::MonokakidoDict;
|
||||||
pub use error::Error;
|
pub use error::Error;
|
||||||
pub use key::Keys;
|
pub use key::{Keys, KeyIndex, PageItemId};
|
||||||
pub use pages::Pages;
|
pub use pages::Pages;
|
||||||
|
|
Loading…
Reference in a new issue