Fix bugs, add tests, and rework main: rename the key index accessors (get_index_a/b/c → get_index_len/get_index_prefix/get_index_suffix), add an exact-match binary search with hiragana→katakana normalization (with tests), and make main look up a key and print its pitch accents.
This commit is contained in:
parent
baf2bc4a6c
commit
dac55a38c9
4
Cargo.lock
generated
4
Cargo.lock
generated
|
@ -55,9 +55,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.46"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
|
||||
checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
|
33
src/audio.rs
33
src/audio.rs
|
@ -16,6 +16,8 @@ pub(crate) struct AudioIndex {
|
|||
}
|
||||
|
||||
mod abi {
|
||||
use std::mem::size_of;
|
||||
|
||||
use crate::{audio::AudioFormat, Error};
|
||||
|
||||
#[repr(C)]
|
||||
|
@ -68,19 +70,28 @@ mod abi {
|
|||
idx: vec![air(0), air(1), air(3), air(6), air(10)],
|
||||
ids: "\0a\0bb\0ccc\0dddd".to_owned(),
|
||||
};
|
||||
assert_eq!(audio_idx.get_id_at(0).unwrap(), "");
|
||||
assert_eq!(audio_idx.get_id_at(1).unwrap(), "a");
|
||||
assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb");
|
||||
assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex));
|
||||
assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc");
|
||||
assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex));
|
||||
|
||||
let diff = 8 + audio_idx.idx.len() * size_of::<AudioIdxRecord>();
|
||||
// Fix offsets now that they are known
|
||||
for air in audio_idx.idx.iter_mut() {
|
||||
air.id_str_offset += diff as u32;
|
||||
}
|
||||
|
||||
dbg!(&audio_idx);
|
||||
assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), "");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex));
|
||||
assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc");
|
||||
assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex));
|
||||
|
||||
audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned();
|
||||
assert_eq!(audio_idx.get_by_id("").unwrap(), air(0));
|
||||
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1));
|
||||
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3));
|
||||
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6));
|
||||
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10));
|
||||
let diff = diff as u32;
|
||||
assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0));
|
||||
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1));
|
||||
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3));
|
||||
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6));
|
||||
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10));
|
||||
assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound));
|
||||
}
|
||||
}
|
||||
|
|
109
src/key.rs
109
src/key.rs
|
@ -3,6 +3,7 @@ use std::{
|
|||
io::{Read, Seek},
|
||||
mem::size_of,
|
||||
str::from_utf8,
|
||||
cmp::Ordering, borrow::Cow,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
|
@ -77,9 +78,9 @@ use abi::{FileHeader, IndexHeader};
|
|||
|
||||
pub struct Keys {
|
||||
words: Vec<LE32>,
|
||||
index_a: Vec<LE32>,
|
||||
index_b: Vec<LE32>,
|
||||
index_c: Vec<LE32>,
|
||||
index_len: Vec<LE32>,
|
||||
index_prefix: Vec<LE32>,
|
||||
index_suffix: Vec<LE32>,
|
||||
index_d: Vec<LE32>,
|
||||
}
|
||||
|
||||
|
@ -144,9 +145,9 @@ impl Keys {
|
|||
|
||||
Ok(Keys {
|
||||
words,
|
||||
index_a,
|
||||
index_b,
|
||||
index_c,
|
||||
index_len: index_a,
|
||||
index_prefix: index_b,
|
||||
index_suffix: index_c,
|
||||
index_d,
|
||||
})
|
||||
}
|
||||
|
@ -163,6 +164,7 @@ impl Keys {
|
|||
|
||||
pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> {
|
||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||
// TODO: add comment. What is this guarding against?
|
||||
if words_bytes.len() < offset + 2 * size_of::<LE32>() {
|
||||
return Err(Error::InvalidIndex);
|
||||
}
|
||||
|
@ -174,6 +176,27 @@ impl Keys {
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
|
||||
let offset = self.index_prefix[idx + 1].us() + size_of::<LE32>() + 1;
|
||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||
if words_bytes.len() < offset + target.len() + 1 {
|
||||
|
||||
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
|
||||
}
|
||||
let found_tail = &words_bytes[offset..];
|
||||
let found = &found_tail[..target.len()];
|
||||
Ok(match found.cmp(target.as_bytes()) {
|
||||
Ordering::Equal => if found_tail[target.len()] == b'\0'
|
||||
{
|
||||
Ordering::Equal
|
||||
} else {
|
||||
Ordering::Greater
|
||||
},
|
||||
ord => ord,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
if idx >= self.count() {
|
||||
return Err(Error::NotFound);
|
||||
|
@ -184,21 +207,76 @@ impl Keys {
|
|||
Ok((word, pages))
|
||||
}
|
||||
|
||||
pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_a, idx)
|
||||
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_len, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_b, idx)
|
||||
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_prefix, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_c, idx)
|
||||
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_suffix, idx)
|
||||
}
|
||||
|
||||
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||
self.get_inner(&self.index_d, idx)
|
||||
}
|
||||
|
||||
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
|
||||
let target_key = &to_katakana(target_key);
|
||||
let mut high = self.count();
|
||||
let mut low = 0;
|
||||
|
||||
// TODO: Revise corner cases and add tests for this binary search
|
||||
while low <= high {
|
||||
let mid = low + (high - low) / 2;
|
||||
|
||||
let cmp = self.cmp_key(target_key, mid)?;
|
||||
|
||||
match cmp {
|
||||
Ordering::Less => low = mid + 1,
|
||||
Ordering::Greater => high = mid - 1,
|
||||
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
|
||||
}
|
||||
}
|
||||
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
}
|
||||
|
||||
fn to_katakana(input: &str) -> Cow<str> {
|
||||
let diff = 'ア' as u32 - 'あ' as u32;
|
||||
if let Some(pos) = input.find(|c| matches!(c, 'ぁ'..='ん')) {
|
||||
let mut output = input[..pos].to_owned();
|
||||
for c in input[pos..].chars() {
|
||||
if matches!(c, 'ぁ'..='ん') {
|
||||
output.push(char::from_u32(c as u32 + diff).unwrap());
|
||||
} else {
|
||||
output.push(c);
|
||||
}
|
||||
}
|
||||
return Cow::Owned(output);
|
||||
} else {
|
||||
return Cow::Borrowed(input);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_katakana() {
|
||||
assert_eq!(*to_katakana(""), *"");
|
||||
assert_eq!(*to_katakana("あ"), *"ア");
|
||||
assert_eq!(*to_katakana("ぁ"), *"ァ");
|
||||
assert_eq!(*to_katakana("ん"), *"ン");
|
||||
assert_eq!(*to_katakana("っ"), *"ッ");
|
||||
assert_eq!(*to_katakana("ア"), *"ア");
|
||||
assert_eq!(*to_katakana("ァ"), *"ァ");
|
||||
assert_eq!(*to_katakana("ン"), *"ン");
|
||||
assert_eq!(*to_katakana("ッ"), *"ッ");
|
||||
assert_eq!(*to_katakana("aアa"), *"aアa");
|
||||
assert_eq!(*to_katakana("aァa"), *"aァa");
|
||||
assert_eq!(*to_katakana("aンa"), *"aンa");
|
||||
assert_eq!(*to_katakana("aッa"), *"aッa");
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
|
@ -216,9 +294,13 @@ impl<'a> PageIter<'a> {
|
|||
let mut tail = pages;
|
||||
for _ in 0..count {
|
||||
match tail {
|
||||
&[1, _, ref t @ ..] => tail = t,
|
||||
&[2, _, _, ref t @ ..] => tail = t,
|
||||
&[4, _, _, _, ref t @ ..] => tail = t,
|
||||
_ => return Err(Error::InvalidIndex),
|
||||
e => {
|
||||
dbg!("hmm", &e[..100]);
|
||||
return Err(Error::InvalidIndex);
|
||||
},
|
||||
}
|
||||
}
|
||||
let span_len = pages.len() - tail.len();
|
||||
|
@ -237,6 +319,7 @@ impl<'a> Iterator for PageIter<'a> {
|
|||
// so unreachable is never reached. `self.count` is also checked to correspond,
|
||||
// so overflow never happens.
|
||||
let (id, tail) = match self.span {
|
||||
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
|
||||
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
|
||||
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
|
||||
&[] => return None,
|
||||
|
|
97
src/main.rs
97
src/main.rs
|
@ -1,13 +1,100 @@
|
|||
use monokakido::MonokakidoDict;
|
||||
use std::{io::{stdout, Write}, ops::Neg};
|
||||
|
||||
use monokakido::{MonokakidoDict, Error};
|
||||
|
||||
fn get_first_audio_id(page: &str) -> Result<&str, Error> {
|
||||
if let Some((_, sound_tail)) = page.split_once("<sound>") {
|
||||
if let Some((sound, _)) = sound_tail.split_once("</sound>") {
|
||||
if let Some((head_id, _)) = sound.split_once(".aac") {
|
||||
if let Some((_, id)) = head_id.split_once("href=\"") {
|
||||
return Ok(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn get_first_accent(page: &str) -> Result<i8, Error> {
|
||||
if let Some((_, accent_tail)) = page.split_once("<accent_text>") {
|
||||
if let Some((mut accent, _)) = accent_tail.split_once("</accent_text>") {
|
||||
if let Some((a, _)) = accent.split_once("<sound>") {
|
||||
accent = a;
|
||||
}
|
||||
if let Some(pos) = accent.find("<symbol_backslash>\</symbol_backslash>") {
|
||||
let endpos = pos + "<symbol_backslash>\</symbol_backslash>".len();
|
||||
let before = &accent[..pos];
|
||||
let after = &accent[endpos..];
|
||||
let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ'));
|
||||
return Ok((before.chars().filter(is_mora).count() as i8));
|
||||
}
|
||||
if let Some(_) = accent.find("<symbol_macron>━</symbol_macron>") {
|
||||
return Ok(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn get_accents(page: &str) -> Result<(i8, Option<i8>), Error> {
|
||||
if let Some((first, tail)) = page.split_once("</accent>") {
|
||||
return Ok((get_first_accent(first)?, get_first_accent(tail).ok()));
|
||||
}
|
||||
Err(Error::NotFound)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
||||
let Some(key) = std::env::args().nth(1) else {
|
||||
return;
|
||||
};
|
||||
/*
|
||||
for dict in MonokakidoDict::list().unwrap() {
|
||||
dbg!(dict.unwrap());
|
||||
}
|
||||
*/
|
||||
let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap();
|
||||
let mut accents = vec![];
|
||||
let result = dict.keys.search_exact(&key);
|
||||
match result {
|
||||
Ok((_, pages)) => {
|
||||
for id in pages{
|
||||
let page = dict.pages.get(id).unwrap();
|
||||
if let Ok(accent) = get_accents(page) {
|
||||
accents.push(accent);
|
||||
}
|
||||
/*
|
||||
let id = get_first_audio_id(page).unwrap();
|
||||
let audio = dict.audio.get(id).unwrap();
|
||||
let mut stdout = stdout().lock();
|
||||
stdout.write_all(audio).unwrap();
|
||||
*/
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
println!("{:?}", e);
|
||||
return;
|
||||
},
|
||||
}
|
||||
print!("{key}\t");
|
||||
accents.sort();
|
||||
accents.dedup();
|
||||
if accents.is_empty() {
|
||||
print!("N/A");
|
||||
} else {
|
||||
for (accent_main, accent_sub) in accents {
|
||||
print!("{accent_main}");
|
||||
if let Some(accent_sub) = accent_sub {
|
||||
if accent_main != accent_sub {
|
||||
print!("/{accent_sub}");
|
||||
}
|
||||
}
|
||||
print!(" ");
|
||||
}
|
||||
}
|
||||
println!()
|
||||
|
||||
/*
|
||||
let idx_list = [
|
||||
0,
|
||||
1,
|
||||
|
@ -48,7 +135,7 @@ fn main() {
|
|||
|
||||
println!("Index: length order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_a(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_len(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -57,7 +144,7 @@ fn main() {
|
|||
|
||||
println!("Index: prefix order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_b(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_prefix(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -66,7 +153,7 @@ fn main() {
|
|||
|
||||
println!("Index: suffix order");
|
||||
for idx in idx_list {
|
||||
let (word, pages) = dict.keys.get_index_c(idx).unwrap();
|
||||
let (word, pages) = dict.keys.get_index_suffix(idx).unwrap();
|
||||
println!("\n{}", word);
|
||||
for id in pages {
|
||||
println!("{}", dict.pages.get(id).unwrap());
|
||||
|
@ -81,7 +168,7 @@ fn main() {
|
|||
println!("{}", dict.pages.get(id).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
//let mut stdout = stdout().lock();
|
||||
//stdout.write_all(audio).unwrap();
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue