Fixed bugs, added tests, modified main.

Pyry Kontio 2022-12-02 14:18:03 +09:00
parent baf2bc4a6c
commit dac55a38c9
4 changed files with 212 additions and 31 deletions

Cargo.lock (generated)

@@ -55,9 +55,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.46"
version = "1.0.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725"
dependencies = [
"unicode-ident",
]


@@ -16,6 +16,8 @@ pub(crate) struct AudioIndex {
}
mod abi {
use std::mem::size_of;
use crate::{audio::AudioFormat, Error};
#[repr(C)]
@@ -68,19 +70,28 @@ mod abi {
idx: vec![air(0), air(1), air(3), air(6), air(10)],
ids: "\0a\0bb\0ccc\0dddd".to_owned(),
};
assert_eq!(audio_idx.get_id_at(0).unwrap(), "");
assert_eq!(audio_idx.get_id_at(1).unwrap(), "a");
assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb");
assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex));
assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc");
assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex));
let diff = 8 + audio_idx.idx.len() * size_of::<AudioIdxRecord>();
// Fix offsets now that they are known
for air in audio_idx.idx.iter_mut() {
air.id_str_offset += diff as u32;
}
dbg!(&audio_idx);
assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), "");
assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a");
assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb");
assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex));
assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc");
assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex));
audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned();
assert_eq!(audio_idx.get_by_id("").unwrap(), air(0));
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1));
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3));
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6));
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10));
let diff = diff as u32;
assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0));
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1));
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3));
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6));
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10));
assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound));
}
}
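
The rewritten assertions above reflect that get_id_at now takes offsets measured from the start of the index data (the 8 bytes the test adds, then the record table, then the id string table) rather than offsets into ids alone. A minimal standalone sketch of that arithmetic; the record size below is a hypothetical stand-in, since size_of::<AudioIdxRecord>() is not spelled out in this hunk:

// Hypothetical record size; the test above uses size_of::<AudioIdxRecord>().
const RECORD_SIZE: usize = 8;
const HEADER_SIZE: usize = 8; // the literal `8 +` in the test

/// Translates a position inside the id string table into the absolute offset
/// that the updated get_id_at assertions probe with.
fn absolute_id_offset(record_count: usize, offset_in_ids: usize) -> usize {
    HEADER_SIZE + record_count * RECORD_SIZE + offset_in_ids
}

// With the five records from the test and this assumed record size,
// "bb" (byte 3 of `ids`) would sit at absolute_id_offset(5, 3) == 8 + 5 * 8 + 3 == 51.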


@@ -3,6 +3,7 @@ use std::{
io::{Read, Seek},
mem::size_of,
str::from_utf8,
cmp::Ordering, borrow::Cow,
};
use crate::{
@@ -77,9 +78,9 @@ use abi::{FileHeader, IndexHeader};
pub struct Keys {
words: Vec<LE32>,
index_a: Vec<LE32>,
index_b: Vec<LE32>,
index_c: Vec<LE32>,
index_len: Vec<LE32>,
index_prefix: Vec<LE32>,
index_suffix: Vec<LE32>,
index_d: Vec<LE32>,
}
@@ -144,9 +145,9 @@ impl Keys {
Ok(Keys {
words,
index_a,
index_b,
index_c,
index_len: index_a,
index_prefix: index_b,
index_suffix: index_c,
index_d,
})
}
@@ -163,6 +164,7 @@ impl Keys {
pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> {
let words_bytes = LE32::slice_as_bytes(&self.words);
// TODO: add comment. What is this guarding against?
if words_bytes.len() < offset + 2 * size_of::<LE32>() {
return Err(Error::InvalidIndex);
}
@@ -174,6 +176,27 @@
}
}
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
let offset = self.index_prefix[idx + 1].us() + size_of::<LE32>() + 1;
let words_bytes = LE32::slice_as_bytes(&self.words);
if words_bytes.len() < offset + target.len() + 1 {
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
}
let found_tail = &words_bytes[offset..];
let found = &found_tail[..target.len()];
Ok(match found.cmp(target.as_bytes()) {
Ordering::Equal => if found_tail[target.len()] == b'\0'
{
Ordering::Equal
} else {
Ordering::Greater
},
ord => ord,
})
}
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
if idx >= self.count() {
return Err(Error::NotFound);
@@ -184,21 +207,76 @@ impl Keys {
Ok((word, pages))
}
pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_a, idx)
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_len, idx)
}
pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_b, idx)
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_prefix, idx)
}
pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_c, idx)
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_suffix, idx)
}
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
self.get_inner(&self.index_d, idx)
}
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
let target_key = &to_katakana(target_key);
let mut high = self.count();
let mut low = 0;
// TODO: Revise corner cases and add tests for this binary search
while low <= high {
let mid = low + (high - low) / 2;
let cmp = self.cmp_key(target_key, mid)?;
match cmp {
Ordering::Less => low = mid + 1,
Ordering::Greater => high = mid - 1,
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
}
}
return Err(Error::NotFound);
}
}
fn to_katakana(input: &str) -> Cow<str> {
let diff = 'ア' as u32 - 'あ' as u32;
if let Some(pos) = input.find(|c| matches!(c, 'ぁ'..='ん')) {
let mut output = input[..pos].to_owned();
for c in input[pos..].chars() {
if matches!(c, 'ぁ'..='ん') {
output.push(char::from_u32(c as u32 + diff).unwrap());
} else {
output.push(c);
}
}
return Cow::Owned(output);
} else {
return Cow::Borrowed(input);
}
}
#[test]
fn test_to_katakana() {
assert_eq!(*to_katakana(""), *"");
assert_eq!(*to_katakana("あ"), *"ア");
assert_eq!(*to_katakana("ぁ"), *"ァ");
assert_eq!(*to_katakana("ん"), *"ン");
assert_eq!(*to_katakana("っ"), *"ッ");
assert_eq!(*to_katakana("aあa"), *"aアa");
assert_eq!(*to_katakana("aぁa"), *"aァa");
assert_eq!(*to_katakana("aんa"), *"aンa");
assert_eq!(*to_katakana("aっa"), *"aッa");
assert_eq!(*to_katakana("aアa"), *"aアa");
assert_eq!(*to_katakana("aァa"), *"aァa");
assert_eq!(*to_katakana("aンa"), *"aンa");
assert_eq!(*to_katakana("aッa"), *"aッa");
}
#[derive(Debug, Clone)]
@@ -216,9 +294,13 @@ impl<'a> PageIter<'a> {
let mut tail = pages;
for _ in 0..count {
match tail {
&[1, _, ref t @ ..] => tail = t,
&[2, _, _, ref t @ ..] => tail = t,
&[4, _, _, _, ref t @ ..] => tail = t,
_ => return Err(Error::InvalidIndex),
e => {
dbg!("hmm", &e[..100]);
return Err(Error::InvalidIndex);
},
}
}
let span_len = pages.len() - tail.len();
@@ -237,6 +319,7 @@ impl<'a> Iterator for PageIter<'a> {
// so unreachable is never reached. `self.count` is also checked to correspond,
// so overflow never happens.
let (id, tail) = match self.span {
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
&[] => return None,
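
The TODO above flags the binary-search bounds: with high starting at self.count() and the Greater arm doing high = mid - 1, the loop can probe idx == count() and can underflow when mid == 0. For reference, a hedged sketch, not the committed implementation, of the same loop written over a half-open range; cmp_at(i) stands in for self.cmp_key(target_key, i), so Less means the stored key sorts before the target:

use std::cmp::Ordering;

// Generic form of the lookup in `search_exact`, searching the half-open range [low, high).
fn binary_search_exact<E>(
    count: usize,
    mut cmp_at: impl FnMut(usize) -> Result<Ordering, E>,
) -> Result<Option<usize>, E> {
    let (mut low, mut high) = (0, count);
    while low < high {
        let mid = low + (high - low) / 2;
        match cmp_at(mid)? {
            Ordering::Less => low = mid + 1,
            Ordering::Greater => high = mid, // never `mid - 1`, so no underflow at 0
            Ordering::Equal => return Ok(Some(mid)),
        }
    }
    Ok(None) // `mid` never reaches `count`, and an empty index falls straight through
}

#[test]
fn binary_search_exact_corner_cases() {
    let keys = ["a", "bb", "ccc"];
    let find = |target: &str| {
        binary_search_exact(keys.len(), |i| Ok::<_, ()>(keys[i].cmp(target))).unwrap()
    };
    assert_eq!(find("a"), Some(0)); // first element
    assert_eq!(find("ccc"), Some(2)); // last element
    assert_eq!(find(""), None); // sorts before everything
    assert_eq!(find("zzz"), None); // sorts after everything
    assert_eq!(binary_search_exact(0, |_| Ok::<_, ()>(Ordering::Equal)).unwrap(), None); // empty index
}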


@@ -1,13 +1,100 @@
use monokakido::MonokakidoDict;
use std::{io::{stdout, Write}, ops::Neg};
use monokakido::{MonokakidoDict, Error};
fn get_first_audio_id(page: &str) -> Result<&str, Error> {
if let Some((_, sound_tail)) = page.split_once("<sound>") {
if let Some((sound, _)) = sound_tail.split_once("</sound>") {
if let Some((head_id, _)) = sound.split_once(".aac") {
if let Some((_, id)) = head_id.split_once("href=\"") {
return Ok(id);
}
}
}
}
Err(Error::NotFound)
}
fn get_first_accent(page: &str) -> Result<i8, Error> {
if let Some((_, accent_tail)) = page.split_once("<accent_text>") {
if let Some((mut accent, _)) = accent_tail.split_once("</accent_text>") {
if let Some((a, _)) = accent.split_once("<sound>") {
accent = a;
}
if let Some(pos) = accent.find("<symbol_backslash></symbol_backslash>") {
let endpos = pos + "<symbol_backslash></symbol_backslash>".len();
let before = &accent[..pos];
let after = &accent[endpos..];
let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ'));
return Ok((before.chars().filter(is_mora).count() as i8));
}
if let Some(_) = accent.find("<symbol_macron>━</symbol_macron>") {
return Ok(0);
}
}
}
Err(Error::NotFound)
}
fn get_accents(page: &str) -> Result<(i8, Option<i8>), Error> {
if let Some((first, tail)) = page.split_once("</accent>") {
return Ok((get_first_accent(first)?, get_first_accent(tail).ok()));
}
Err(Error::NotFound)
}
fn main() {
let Some(key) = std::env::args().nth(1) else {
return;
};
/*
for dict in MonokakidoDict::list().unwrap() {
dbg!(dict.unwrap());
}
*/
let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap();
let mut accents = vec![];
let result = dict.keys.search_exact(&key);
match result {
Ok((_, pages)) => {
for id in pages {
let page = dict.pages.get(id).unwrap();
if let Ok(accent) = get_accents(page) {
accents.push(accent);
}
/*
let id = get_first_audio_id(page).unwrap();
let audio = dict.audio.get(id).unwrap();
let mut stdout = stdout().lock();
stdout.write_all(audio).unwrap();
*/
}
},
Err(e) => {
println!("{:?}", e);
return;
},
}
print!("{key}\t");
accents.sort();
accents.dedup();
if accents.is_empty() {
print!("N/A");
} else {
for (accent_main, accent_sub) in accents {
print!("{accent_main}");
if let Some(accent_sub) = accent_sub {
if accent_main != accent_sub {
print!("/{accent_sub}");
}
}
print!(" ");
}
}
println!()
/*
let idx_list = [
0,
1,
@@ -48,7 +135,7 @@ fn main() {
println!("Index: length order");
for idx in idx_list {
let (word, pages) = dict.keys.get_index_a(idx).unwrap();
let (word, pages) = dict.keys.get_index_len(idx).unwrap();
println!("\n{}", word);
for id in pages {
println!("{}", dict.pages.get(id).unwrap());
@@ -57,7 +144,7 @@ fn main() {
println!("Index: prefix order");
for idx in idx_list {
let (word, pages) = dict.keys.get_index_b(idx).unwrap();
let (word, pages) = dict.keys.get_index_prefix(idx).unwrap();
println!("\n{}", word);
for id in pages {
println!("{}", dict.pages.get(id).unwrap());
@@ -66,7 +153,7 @@ fn main() {
println!("Index: suffix order");
for idx in idx_list {
let (word, pages) = dict.keys.get_index_c(idx).unwrap();
let (word, pages) = dict.keys.get_index_suffix(idx).unwrap();
println!("\n{}", word);
for id in pages {
println!("{}", dict.pages.get(id).unwrap());
@@ -81,7 +168,7 @@ fn main() {
println!("{}", dict.pages.get(id).unwrap());
}
}
*/
//let mut stdout = stdout().lock();
//stdout.write_all(audio).unwrap();
}
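
For reference, a hedged, self-contained check of the accent parsing above, run against hand-written fragments: the markup is only assumed to resemble real NHKACCENT2 pages, but the element names are exactly the ones get_first_accent matches on.

#[test]
fn parses_hand_written_accent_fragments() {
    // One <accent> block, downstep marker after the first mora, no second reading.
    let page = "<accent><accent_text>か<symbol_backslash></symbol_backslash>き</accent_text></accent>";
    assert_eq!(get_accents(page), Ok((1, None)));

    // Flat (heiban) accent marked with the macron element.
    let flat = "<accent><accent_text>かき<symbol_macron>━</symbol_macron></accent_text></accent>";
    assert_eq!(get_accents(flat), Ok((0, None)));
}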