Fix bugs, add tests, and rework main: rename the key index accessors (get_index_a/b/c → get_index_len/get_index_prefix/get_index_suffix), add an exact-match binary search with hiragana→katakana normalization (with tests), and make main look up a key and print its pitch accents.
This commit is contained in:
parent
baf2bc4a6c
commit
dac55a38c9
4
Cargo.lock
generated
4
Cargo.lock
generated
|
@ -55,9 +55,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.46"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
|
||||
checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
|
33
src/audio.rs
33
src/audio.rs
|
@ -16,6 +16,8 @@ pub(crate) struct AudioIndex {
|
|||
}
|
||||
|
||||
mod abi {
|
||||
use std::mem::size_of;
|
||||
|
||||
use crate::{audio::AudioFormat, Error};
|
||||
|
||||
#[repr(C)]
|
||||
|
@ -68,19 +70,28 @@ mod abi {
|
|||
idx: vec![air(0), air(1), air(3), air(6), air(10)],
|
||||
ids: "\0a\0bb\0ccc\0dddd".to_owned(),
|
||||
};
|
||||
assert_eq!(audio_idx.get_id_at(0).unwrap(), "");
|
||||
assert_eq!(audio_idx.get_id_at(1).unwrap(), "a");
|
||||
assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb");
|
||||
assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex));
|
||||
assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc");
|
||||
assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex));
|
||||
|
||||
let diff = 8 + audio_idx.idx.len() * size_of::<AudioIdxRecord>();
|
||||
// Fix offsets now that they are known
|
||||
for air in audio_idx.idx.iter_mut() {
|
||||
air.id_str_offset += diff as u32;
|
||||
}
|
||||
|
||||
dbg!(&audio_idx);
|
||||
assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), "");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex));
|
||||
assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex));
|
||||
|
||||
audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned();
|
||||
assert_eq!(audio_idx.get_by_id("").unwrap(), air(0));
|
||||
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1));
|
||||
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3));
|
||||
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6));
|
||||
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10));
|
||||
let diff = diff as u32;
|
||||
assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0));
|
||||
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1));
|
||||
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3));
|
||||
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6));
|
||||
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10));
|
||||
assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound));
|
||||
}
|
||||
}
|
||||
|
|
109
src/key.rs
109
src/key.rs
|
@ -3,6 +3,7 @@ use std::{
|
|||
io::{Read, Seek},
|
||||
mem::size_of,
|
||||
str::from_utf8,
|
||||
cmp::Ordering, borrow::Cow,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
|
@ -77,9 +78,9 @@ use abi::{FileHeader, IndexHeader};
|
|||
|
||||
pub struct Keys {
|
||||
words: Vec<LE32>,
|
||||
index_a: Vec<LE32>,
|
||||
index_b: Vec<LE32>,
|
||||
index_c: Vec<LE32>,
|
||||
index_len: Vec<LE32>,
|
||||
index_prefix: Vec<LE32>,
|
||||
index_suffix: Vec<LE32>,
|
||||
index_d: Vec<LE32>,
|
||||
}
|
||||
|
||||
|
@ -144,9 +145,9 @@ impl Keys {
|
|||
|
||||
Ok(Keys {
|
||||
words,
|
||||
index_a,
|
||||
index_b,
|
||||
index_c,
|
||||
index_len: index_a,
|
||||
index_prefix: index_b,
|
||||
index_suffix: index_c,
|
||||
index_d,
|
||||
})
|
||||
}
|
||||
|
@ -163,6 +164,7 @@ impl Keys {
|
|||
|
||||
pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> {
|
||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||
// TODO: add comment. What is this guarding against?
|
||||
if words_bytes.len() < offset + 2 * size_of::<LE32>() {
|
||||
return Err(Error::InvalidIndex);
|
||||
}
|
||||
|
@ -174,6 +176,27 @@ impl Keys {
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
|
||||
let offset = self.index_prefix[idx + 1].us() + size_of::<LE32>() + 1;
|
||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||
if words_bytes.len() < offset + target.len() + 1 {
|
||||
|
||||
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
|
||||
}
|
||||
let found_tail = &words_bytes[offset..];
|
||||
let found = &found_tail[..target.len()];
|
||||
Ok(match found.cmp(target.as_bytes()) {
|
||||
Ordering::Equal => if found_tail[target.len()] == b'\0'
|
||||
{
|
||||
Ordering::Equal
|
||||
} else {
|
||||
Ordering::Greater
|
||||
},
|
||||
ord => ord,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
if idx >= self.count() {
|
||||
return Err(Error::NotFound);
|
||||
|
@ -184,21 +207,76 @@ impl Keys {
|
|||
Ok((word, pages))
|
||||
}
|
||||
|
||||
pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_a, idx)
|
||||
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_len, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_b, idx)
|
||||
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_prefix, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_c, idx)
|
||||
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_suffix, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_d, idx)
|
||||
}
|
||||
|
||||
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
|
||||
let target_key = &to_katakana(target_key);
|
||||
let mut high = self.count();
|
||||
let mut low = 0;
|
||||
|
||||
// TODO: Revise corner cases and add tests for this binary search
|
||||
while low <= high {
|
||||
let mid = low + (high - low) / 2;
|
||||
|
||||
let cmp = self.cmp_key(target_key, mid)?;
|
||||
|
||||
match cmp {
|
||||
Ordering::Less => low = mid + 1,
|
||||
Ordering::Greater => high = mid - 1,
|
||||
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
|
||||
}
|
||||
}
|
||||
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
}
|
||||
|
||||
fn to_katakana(input: &str) -> Cow<str> {
|
||||
let diff = 'ア' as u32 - 'あ' as u32;
|
||||
if let Some(pos) = input.find(|c| matches!(c, 'ぁ'..='ん')) {
|
||||
let mut output = input[..pos].to_owned();
|
||||
for c in input[pos..].chars() {
|
||||
if matches!(c, 'ぁ'..='ん') {
|
||||
output.push(char::from_u32(c as u32 + diff).unwrap());
|
||||
} else {
|
||||
output.push(c);
|
||||
}
|
||||
}
|
||||
return Cow::Owned(output);
|
||||
} else {
|
||||
return Cow::Borrowed(input);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_katakana() {
|
||||
assert_eq!(*to_katakana(""), *"");
|
||||
assert_eq!(*to_katakana("あ"), *"ア");
|
||||
assert_eq!(*to_katakana("ぁ"), *"ァ");
|
||||
assert_eq!(*to_katakana("ん"), *"ン");
|
||||
assert_eq!(*to_katakana("っ"), *"ッ");
|
||||
assert_eq!(*to_katakana("ア"), *"ア");
|
||||
assert_eq!(*to_katakana("ァ"), *"ァ");
|
||||
assert_eq!(*to_katakana("ン"), *"ン");
|
||||
assert_eq!(*to_katakana("ッ"), *"ッ");
|
||||
assert_eq!(*to_katakana("aアa"), *"aアa");
|
||||
assert_eq!(*to_katakana("aァa"), *"aァa");
|
||||
assert_eq!(*to_katakana("aンa"), *"aンa");
|
||||
assert_eq!(*to_katakana("aッa"), *"aッa");
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
|
@ -216,9 +294,13 @@ impl<'a> PageIter<'a> {
|
|||
let mut tail = pages;
|
||||
for _ in 0..count {
|
||||
match tail {
|
||||
&[1, _, ref t @ ..] => tail = t,
|
||||
&[2, _, _, ref t @ ..] => tail = t,
|
||||
&[4, _, _, _, ref t @ ..] => tail = t,
|
||||
_ => return Err(Error::InvalidIndex),
|
||||
e => {
|
||||
dbg!("hmm", &e[..100]);
|
||||
return Err(Error::InvalidIndex);
|
||||
},
|
||||
}
|
||||
}
|
||||
let span_len = pages.len() - tail.len();
|
||||
|
@ -237,6 +319,7 @@ impl<'a> Iterator for PageIter<'a> {
|
|||
// so unreachable is never reached. `self.count` is also checked to correspond,
|
||||
// so overflow never happens.
|
||||
let (id, tail) = match self.span {
|
||||
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
|
||||
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
|
||||
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
|
||||
&[] => return None,
|
||||
|
|
97
src/main.rs
97
src/main.rs
|
@ -1,13 +1,100 @@
|
|||
use monokakido::MonokakidoDict;
|
||||
use std::{io::{stdout, Write}, ops::Neg};
|
||||
|
||||
use monokakido::{MonokakidoDict, Error};
|
||||
|
||||
fn get_first_audio_id(page: &str) -> Result<&str, Error> {
|
||||
if let Some((_, sound_tail)) = page.split_once("<sound>") {
|
||||
if let Some((sound, _)) = sound_tail.split_once("</sound>") {
|
||||
if let Some((head_id, _)) = sound.split_once(".aac") {
|
||||
if let Some((_, id)) = head_id.split_once("href=\"") {
|
||||
return Ok(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn get_first_accent(page: &str) -> Result<i8, Error> {
|
||||
if let Some((_, accent_tail)) = page.split_once("<accent_text>") {
|
||||
if let Some((mut accent, _)) = accent_tail.split_once("</accent_text>") {
|
||||
if let Some((a, _)) = accent.split_once("<sound>") {
|
||||
accent = a;
|
||||
}
|
||||
if let Some(pos) = accent.find("<symbol_backslash>\</symbol_backslash>") {
|
||||
let endpos = pos + "<symbol_backslash>\</symbol_backslash>".len();
|
||||
let before = &accent[..pos];
|
||||
let after = &accent[endpos..];
|
||||
let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ'));
|
||||
return Ok((before.chars().filter(is_mora).count() as i8));
|
||||
}
|
||||
if let Some(_) = accent.find("<symbol_macron>━</symbol_macron>") {
|
||||
return Ok(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn get_accents(page: &str) -> Result<(i8, Option<i8>), Error> {
|
||||
if let Some((first, tail)) = page.split_once("</accent>") {
|
||||
return Ok((get_first_accent(first)?, get_first_accent(tail).ok()));
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
||||
let Some(key) = std::env::args().nth(1) else {
|
||||
return;
|
||||
};
|
||||
/*
|
||||
for dict in MonokakidoDict::list().unwrap() {
|
||||
dbg!(dict.unwrap());
|
||||
}
|
||||
*/
|
||||
let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap();
|
||||
let mut accents = vec![];
|
||||
let result = dict.keys.search_exact(&key);
|
||||
match result {
|
||||
Ok((_, pages)) => {
|
||||
for id in pages{
|
||||
let page = dict.pages.get(id).unwrap();
|
||||
if let Ok(accent) = get_accents(page) {
|
||||
accents.push(accent);
|
||||
}
|
||||
/*
|
||||
let id = get_first_audio_id(page).unwrap();
|
||||
let audio = dict.audio.get(id).unwrap();
|
||||
let mut stdout = stdout().lock();
|
||||
stdout.write_all(audio).unwrap();
|
||||
*/
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
println!("{:?}", e);
|
||||
return;
|
||||
},
|
||||
}
|
||||
print!("{key}\t");
|
||||
accents.sort();
|
||||
accents.dedup();
|
||||
if accents.is_empty() {
|
||||
print!("N/A");
|
||||
} else {
|
||||
for (accent_main, accent_sub) in accents {
|
||||
print!("{accent_main}");
|
||||
if let Some(accent_sub) = accent_sub {
|
||||
if accent_main != accent_sub {
|
||||
print!("/{accent_sub}");
|
||||
}
|
||||
}
|
||||
print!(" ");
|
||||
}
|
||||
}
|
||||
println!()
|
||||
|
||||
/*
|
||||
let idx_list = [
|
||||
0,
|
||||
1,
|
||||
|
@ -48,7 +135,7 @@ fn main() {
|
|||
|
||||
println!("Index: length order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_a(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_len(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -57,7 +144,7 @@ fn main() {
|
|||
|
||||
println!("Index: prefix order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_b(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_prefix(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -66,7 +153,7 @@ fn main() {
|
|||
|
||||
println!("Index: suffix order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_c(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_suffix(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -81,7 +168,7 @@ fn main() {
|
|||
println!("{}", dict.pages.get(id).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
//let mut stdout = stdout().lock();
|
||||
//stdout.write_all(audio).unwrap();
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue