From dac55a38c904add8d9bcaa2d4dd027ef83da534e Mon Sep 17 00:00:00 2001 From: Pyry Kontio Date: Fri, 2 Dec 2022 14:18:03 +0900 Subject: [PATCH] Fixed bugs, added tests, modified main. --- Cargo.lock | 4 +- src/audio.rs | 33 ++++++++++------ src/key.rs | 109 +++++++++++++++++++++++++++++++++++++++++++++------ src/main.rs | 97 ++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 212 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d08b525..4d6f184 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,9 +55,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.46" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ "unicode-ident", ] diff --git a/src/audio.rs b/src/audio.rs index 9becc7a..1e8f272 100644 --- a/src/audio.rs +++ b/src/audio.rs @@ -16,6 +16,8 @@ pub(crate) struct AudioIndex { } mod abi { + use std::mem::size_of; + use crate::{audio::AudioFormat, Error}; #[repr(C)] @@ -68,19 +70,28 @@ mod abi { idx: vec![air(0), air(1), air(3), air(6), air(10)], ids: "\0a\0bb\0ccc\0dddd".to_owned(), }; - assert_eq!(audio_idx.get_id_at(0).unwrap(), ""); - assert_eq!(audio_idx.get_id_at(1).unwrap(), "a"); - assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb"); - assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex)); - assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc"); - assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex)); + + let diff = 8 + audio_idx.idx.len() * size_of::(); + // Fix offsets now that they are known + for air in audio_idx.idx.iter_mut() { + air.id_str_offset += diff as u32; + } + + dbg!(&audio_idx); + assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), ""); + assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a"); + assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb"); + assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex)); + assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc"); + assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex)); audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned(); - assert_eq!(audio_idx.get_by_id("").unwrap(), air(0)); - assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1)); - assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3)); - assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6)); - assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10)); + let diff = diff as u32; + assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0)); + assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1)); + assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3)); + assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6)); + assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10)); assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound)); } } diff --git a/src/key.rs b/src/key.rs index 7fbc037..0a383b0 100644 --- a/src/key.rs +++ b/src/key.rs @@ -3,6 +3,7 @@ use std::{ io::{Read, Seek}, mem::size_of, str::from_utf8, + cmp::Ordering, borrow::Cow, }; use crate::{ @@ -77,9 +78,9 @@ use abi::{FileHeader, IndexHeader}; pub struct Keys { words: Vec, - index_a: Vec, - index_b: Vec, - index_c: Vec, + index_len: Vec, + index_prefix: Vec, + index_suffix: Vec, index_d: Vec, } @@ -144,9 +145,9 @@ impl Keys { Ok(Keys { words, - index_a, - index_b, - index_c, + index_len: index_a, + index_prefix: index_b, + index_suffix: index_c, index_d, }) } @@ -163,6 +164,7 @@ impl Keys { pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> { let words_bytes = LE32::slice_as_bytes(&self.words); + // TODO: add comment. What is this guarding against? if words_bytes.len() < offset + 2 * size_of::() { return Err(Error::InvalidIndex); } @@ -174,6 +176,27 @@ impl Keys { } } + pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result { + let offset = self.index_prefix[idx + 1].us() + size_of::() + 1; + let words_bytes = LE32::slice_as_bytes(&self.words); + if words_bytes.len() < offset + target.len() + 1 { + + return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead? + } + let found_tail = &words_bytes[offset..]; + let found = &found_tail[..target.len()]; + Ok(match found.cmp(target.as_bytes()) { + Ordering::Equal => if found_tail[target.len()] == b'\0' + { + Ordering::Equal + } else { + Ordering::Greater + }, + ord => ord, + }) + } + + fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> { if idx >= self.count() { return Err(Error::NotFound); @@ -184,21 +207,76 @@ impl Keys { Ok((word, pages)) } - pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - self.get_inner(&self.index_a, idx) + pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_len, idx) } - pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - self.get_inner(&self.index_b, idx) + pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_prefix, idx) } - pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { - self.get_inner(&self.index_c, idx) + pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_suffix, idx) } pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { self.get_inner(&self.index_d, idx) } + + pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> { + let target_key = &to_katakana(target_key); + let mut high = self.count(); + let mut low = 0; + + // TODO: Revise corner cases and add tests for this binary search + while low <= high { + let mid = low + (high - low) / 2; + + let cmp = self.cmp_key(target_key, mid)?; + + match cmp { + Ordering::Less => low = mid + 1, + Ordering::Greater => high = mid - 1, + Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)), + } + } + + return Err(Error::NotFound); + } +} + +fn to_katakana(input: &str) -> Cow { + let diff = 'ア' as u32 - 'あ' as u32; + if let Some(pos) = input.find(|c| matches!(c, 'ぁ'..='ん')) { + let mut output = input[..pos].to_owned(); + for c in input[pos..].chars() { + if matches!(c, 'ぁ'..='ん') { + output.push(char::from_u32(c as u32 + diff).unwrap()); + } else { + output.push(c); + } + } + return Cow::Owned(output); + } else { + return Cow::Borrowed(input); + } +} + +#[test] +fn test_to_katakana() { + assert_eq!(*to_katakana(""), *""); + assert_eq!(*to_katakana("あ"), *"ア"); + assert_eq!(*to_katakana("ぁ"), *"ァ"); + assert_eq!(*to_katakana("ん"), *"ン"); + assert_eq!(*to_katakana("っ"), *"ッ"); + assert_eq!(*to_katakana("ア"), *"ア"); + assert_eq!(*to_katakana("ァ"), *"ァ"); + assert_eq!(*to_katakana("ン"), *"ン"); + assert_eq!(*to_katakana("ッ"), *"ッ"); + assert_eq!(*to_katakana("aアa"), *"aアa"); + assert_eq!(*to_katakana("aァa"), *"aァa"); + assert_eq!(*to_katakana("aンa"), *"aンa"); + assert_eq!(*to_katakana("aッa"), *"aッa"); } #[derive(Debug, Clone)] @@ -216,9 +294,13 @@ impl<'a> PageIter<'a> { let mut tail = pages; for _ in 0..count { match tail { + &[1, _, ref t @ ..] => tail = t, &[2, _, _, ref t @ ..] => tail = t, &[4, _, _, _, ref t @ ..] => tail = t, - _ => return Err(Error::InvalidIndex), + e => { + dbg!("hmm", &e[..100]); + return Err(Error::InvalidIndex); + }, } } let span_len = pages.len() - tail.len(); @@ -237,6 +319,7 @@ impl<'a> Iterator for PageIter<'a> { // so unreachable is never reached. `self.count` is also checked to correspond, // so overflow never happens. let (id, tail) = match self.span { + &[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail), &[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail), &[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail), &[] => return None, diff --git a/src/main.rs b/src/main.rs index 53b4414..e185327 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,100 @@ -use monokakido::MonokakidoDict; +use std::{io::{stdout, Write}, ops::Neg}; + +use monokakido::{MonokakidoDict, Error}; + +fn get_first_audio_id(page: &str) -> Result<&str, Error> { + if let Some((_, sound_tail)) = page.split_once("") { + if let Some((sound, _)) = sound_tail.split_once("") { + if let Some((head_id, _)) = sound.split_once(".aac") { + if let Some((_, id)) = head_id.split_once("href=\"") { + return Ok(id); + } + } + } + } + Err(Error::NotFound) +} + +fn get_first_accent(page: &str) -> Result { + if let Some((_, accent_tail)) = page.split_once("") { + if let Some((mut accent, _)) = accent_tail.split_once("") { + if let Some((a, _)) = accent.split_once("") { + accent = a; + } + if let Some(pos) = accent.find("") { + let endpos = pos + "".len(); + let before = &accent[..pos]; + let after = &accent[endpos..]; + let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ')); + return Ok((before.chars().filter(is_mora).count() as i8)); + } + if let Some(_) = accent.find("") { + return Ok(0); + } + } + } + Err(Error::NotFound) +} + +fn get_accents(page: &str) -> Result<(i8, Option), Error> { + if let Some((first, tail)) = page.split_once("") { + return Ok((get_first_accent(first)?, get_first_accent(tail).ok())); + } + Err(Error::NotFound) +} fn main() { + + let Some(key) = std::env::args().nth(1) else { + return; + }; /* for dict in MonokakidoDict::list().unwrap() { dbg!(dict.unwrap()); } */ let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap(); + let mut accents = vec![]; + let result = dict.keys.search_exact(&key); + match result { + Ok((_, pages)) => { + for id in pages{ + let page = dict.pages.get(id).unwrap(); + if let Ok(accent) = get_accents(page) { + accents.push(accent); + } + /* + let id = get_first_audio_id(page).unwrap(); + let audio = dict.audio.get(id).unwrap(); + let mut stdout = stdout().lock(); + stdout.write_all(audio).unwrap(); + */ + } + }, + Err(e) => { + println!("{:?}", e); + return; + }, + } + print!("{key}\t"); + accents.sort(); + accents.dedup(); + if accents.is_empty() { + print!("N/A"); + } else { + for (accent_main, accent_sub) in accents { + print!("{accent_main}"); + if let Some(accent_sub) = accent_sub { + if accent_main != accent_sub { + print!("/{accent_sub}"); + } + } + print!(" "); + } + } + println!() +/* let idx_list = [ 0, 1, @@ -48,7 +135,7 @@ fn main() { println!("Index: length order"); for idx in idx_list { - let (word, pages) = dict.keys.get_index_a(idx).unwrap(); + let (word, pages) = dict.keys.get_index_len(idx).unwrap(); println!("\n{}", word); for id in pages { println!("{}", dict.pages.get(id).unwrap()); @@ -57,7 +144,7 @@ fn main() { println!("Index: prefix order"); for idx in idx_list { - let (word, pages) = dict.keys.get_index_b(idx).unwrap(); + let (word, pages) = dict.keys.get_index_prefix(idx).unwrap(); println!("\n{}", word); for id in pages { println!("{}", dict.pages.get(id).unwrap()); @@ -66,7 +153,7 @@ fn main() { println!("Index: suffix order"); for idx in idx_list { - let (word, pages) = dict.keys.get_index_c(idx).unwrap(); + let (word, pages) = dict.keys.get_index_suffix(idx).unwrap(); println!("\n{}", word); for id in pages { println!("{}", dict.pages.get(id).unwrap()); @@ -81,7 +168,7 @@ fn main() { println!("{}", dict.pages.get(id).unwrap()); } } - + */ //let mut stdout = stdout().lock(); //stdout.write_all(audio).unwrap(); }