Fixed bugs, added tests, modified main.
This commit is contained in:
parent
baf2bc4a6c
commit
dac55a38c9
4
Cargo.lock
generated
4
Cargo.lock
generated
|
@ -55,9 +55,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.46"
|
version = "1.0.47"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
|
checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
33
src/audio.rs
33
src/audio.rs
|
@ -16,6 +16,8 @@ pub(crate) struct AudioIndex {
|
||||||
}
|
}
|
||||||
|
|
||||||
mod abi {
|
mod abi {
|
||||||
|
use std::mem::size_of;
|
||||||
|
|
||||||
use crate::{audio::AudioFormat, Error};
|
use crate::{audio::AudioFormat, Error};
|
||||||
|
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
|
@ -68,19 +70,28 @@ mod abi {
|
||||||
idx: vec![air(0), air(1), air(3), air(6), air(10)],
|
idx: vec![air(0), air(1), air(3), air(6), air(10)],
|
||||||
ids: "\0a\0bb\0ccc\0dddd".to_owned(),
|
ids: "\0a\0bb\0ccc\0dddd".to_owned(),
|
||||||
};
|
};
|
||||||
assert_eq!(audio_idx.get_id_at(0).unwrap(), "");
|
|
||||||
assert_eq!(audio_idx.get_id_at(1).unwrap(), "a");
|
let diff = 8 + audio_idx.idx.len() * size_of::<AudioIdxRecord>();
|
||||||
assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb");
|
// Fix offsets now that they are known
|
||||||
assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex));
|
for air in audio_idx.idx.iter_mut() {
|
||||||
assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc");
|
air.id_str_offset += diff as u32;
|
||||||
assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex));
|
}
|
||||||
|
|
||||||
|
dbg!(&audio_idx);
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), "");
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a");
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb");
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex));
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc");
|
||||||
|
assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex));
|
||||||
|
|
||||||
audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned();
|
audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned();
|
||||||
assert_eq!(audio_idx.get_by_id("").unwrap(), air(0));
|
let diff = diff as u32;
|
||||||
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1));
|
assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0));
|
||||||
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3));
|
assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1));
|
||||||
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6));
|
assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3));
|
||||||
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10));
|
assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6));
|
||||||
|
assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10));
|
||||||
assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound));
|
assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
109
src/key.rs
109
src/key.rs
|
@ -3,6 +3,7 @@ use std::{
|
||||||
io::{Read, Seek},
|
io::{Read, Seek},
|
||||||
mem::size_of,
|
mem::size_of,
|
||||||
str::from_utf8,
|
str::from_utf8,
|
||||||
|
cmp::Ordering, borrow::Cow,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
|
@ -77,9 +78,9 @@ use abi::{FileHeader, IndexHeader};
|
||||||
|
|
||||||
pub struct Keys {
|
pub struct Keys {
|
||||||
words: Vec<LE32>,
|
words: Vec<LE32>,
|
||||||
index_a: Vec<LE32>,
|
index_len: Vec<LE32>,
|
||||||
index_b: Vec<LE32>,
|
index_prefix: Vec<LE32>,
|
||||||
index_c: Vec<LE32>,
|
index_suffix: Vec<LE32>,
|
||||||
index_d: Vec<LE32>,
|
index_d: Vec<LE32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -144,9 +145,9 @@ impl Keys {
|
||||||
|
|
||||||
Ok(Keys {
|
Ok(Keys {
|
||||||
words,
|
words,
|
||||||
index_a,
|
index_len: index_a,
|
||||||
index_b,
|
index_prefix: index_b,
|
||||||
index_c,
|
index_suffix: index_c,
|
||||||
index_d,
|
index_d,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -163,6 +164,7 @@ impl Keys {
|
||||||
|
|
||||||
pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> {
|
pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> {
|
||||||
let words_bytes = LE32::slice_as_bytes(&self.words);
|
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||||
|
// TODO: add comment. What is this guarding against?
|
||||||
if words_bytes.len() < offset + 2 * size_of::<LE32>() {
|
if words_bytes.len() < offset + 2 * size_of::<LE32>() {
|
||||||
return Err(Error::InvalidIndex);
|
return Err(Error::InvalidIndex);
|
||||||
}
|
}
|
||||||
|
@ -174,6 +176,27 @@ impl Keys {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn cmp_key(&self, target: &str, idx: usize) -> Result<Ordering, Error> {
|
||||||
|
let offset = self.index_prefix[idx + 1].us() + size_of::<LE32>() + 1;
|
||||||
|
let words_bytes = LE32::slice_as_bytes(&self.words);
|
||||||
|
if words_bytes.len() < offset + target.len() + 1 {
|
||||||
|
|
||||||
|
return Err(Error::InvalidIndex); // Maybe just return Ordering::Less instead?
|
||||||
|
}
|
||||||
|
let found_tail = &words_bytes[offset..];
|
||||||
|
let found = &found_tail[..target.len()];
|
||||||
|
Ok(match found.cmp(target.as_bytes()) {
|
||||||
|
Ordering::Equal => if found_tail[target.len()] == b'\0'
|
||||||
|
{
|
||||||
|
Ordering::Equal
|
||||||
|
} else {
|
||||||
|
Ordering::Greater
|
||||||
|
},
|
||||||
|
ord => ord,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
if idx >= self.count() {
|
if idx >= self.count() {
|
||||||
return Err(Error::NotFound);
|
return Err(Error::NotFound);
|
||||||
|
@ -184,21 +207,76 @@ impl Keys {
|
||||||
Ok((word, pages))
|
Ok((word, pages))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
pub fn get_index_len(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
self.get_inner(&self.index_a, idx)
|
self.get_inner(&self.index_len, idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
pub fn get_index_prefix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
self.get_inner(&self.index_b, idx)
|
self.get_inner(&self.index_prefix, idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
pub fn get_index_suffix(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
self.get_inner(&self.index_c, idx)
|
self.get_inner(&self.index_suffix, idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> {
|
||||||
self.get_inner(&self.index_d, idx)
|
self.get_inner(&self.index_d, idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn search_exact(&self, target_key: &str) -> Result<(usize, PageIter<'_>), Error> {
|
||||||
|
let target_key = &to_katakana(target_key);
|
||||||
|
let mut high = self.count();
|
||||||
|
let mut low = 0;
|
||||||
|
|
||||||
|
// TODO: Revise corner cases and add tests for this binary search
|
||||||
|
while low <= high {
|
||||||
|
let mid = low + (high - low) / 2;
|
||||||
|
|
||||||
|
let cmp = self.cmp_key(target_key, mid)?;
|
||||||
|
|
||||||
|
match cmp {
|
||||||
|
Ordering::Less => low = mid + 1,
|
||||||
|
Ordering::Greater => high = mid - 1,
|
||||||
|
Ordering::Equal => return Ok((mid, self.get_index_prefix(mid)?.1)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Err(Error::NotFound);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts every hiragana character of `input` to the corresponding katakana.
///
/// Hiragana `ぁ..=ゖ` (U+3041..=U+3096) maps onto katakana `ァ..=ヶ`
/// (U+30A1..=U+30F6) by a constant code-point offset; this includes `ゔ`, `ゕ`
/// and `ゖ`. All other characters are copied unchanged.
///
/// When `input` contains no hiragana it is returned as `Cow::Borrowed`, so no
/// allocation happens in that case.
fn to_katakana(input: &str) -> Cow<'_, str> {
    /// Distance between a hiragana code point and its katakana counterpart.
    const KANA_OFFSET: u32 = 'ァ' as u32 - 'ぁ' as u32;

    /// True for the hiragana characters that have a katakana counterpart.
    fn is_hiragana(c: char) -> bool {
        matches!(c, 'ぁ'..='ゖ')
    }

    let pos = match input.find(is_hiragana) {
        Some(pos) => pos,
        None => return Cow::Borrowed(input),
    };

    // Hiragana and katakana have the same UTF-8 width, so the output is
    // exactly as long as the input.
    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..pos]);
    output.extend(input[pos..].chars().map(|c| {
        if is_hiragana(c) {
            // The shifted value always lies inside the katakana block; fall
            // back to the original character rather than panic.
            char::from_u32(c as u32 + KANA_OFFSET).unwrap_or(c)
        } else {
            c
        }
    }));
    Cow::Owned(output)
}
|
||||||
|
|
||||||
|
/// Checks `to_katakana` on hiragana input, on input that is already katakana,
/// and on mixed input where conversion starts after an unconverted prefix.
#[test]
fn test_to_katakana() {
    // (input, expected output)
    let cases = [
        ("", ""),
        // Hiragana is converted, including the small forms and the range ends.
        ("あ", "ア"),
        ("ぁ", "ァ"),
        ("ん", "ン"),
        ("っ", "ッ"),
        // Katakana is left alone.
        ("ア", "ア"),
        ("ァ", "ァ"),
        ("ン", "ン"),
        ("ッ", "ッ"),
        ("aアa", "aアa"),
        ("aァa", "aァa"),
        ("aンa", "aンa"),
        ("aッa", "aッa"),
        // Conversion that starts in the middle of the string keeps the prefix.
        ("aあa", "aアa"),
        ("アあ", "アア"),
        ("aっんb", "aッンb"),
    ];
    for (input, expected) in cases {
        assert_eq!(&*to_katakana(input), expected, "input: {:?}", input);
    }

    // Input without hiragana must be handed back without allocating.
    assert!(matches!(to_katakana("aアa"), Cow::Borrowed(_)));
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
@ -216,9 +294,13 @@ impl<'a> PageIter<'a> {
|
||||||
let mut tail = pages;
|
let mut tail = pages;
|
||||||
for _ in 0..count {
|
for _ in 0..count {
|
||||||
match tail {
|
match tail {
|
||||||
|
&[1, _, ref t @ ..] => tail = t,
|
||||||
&[2, _, _, ref t @ ..] => tail = t,
|
&[2, _, _, ref t @ ..] => tail = t,
|
||||||
&[4, _, _, _, ref t @ ..] => tail = t,
|
&[4, _, _, _, ref t @ ..] => tail = t,
|
||||||
_ => return Err(Error::InvalidIndex),
|
e => {
|
||||||
|
dbg!("hmm", &e[..100]);
|
||||||
|
return Err(Error::InvalidIndex);
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let span_len = pages.len() - tail.len();
|
let span_len = pages.len() - tail.len();
|
||||||
|
@ -237,6 +319,7 @@ impl<'a> Iterator for PageIter<'a> {
|
||||||
// so unreachable is never reached. `self.count` is also checked to correspond,
|
// so unreachable is never reached. `self.count` is also checked to correspond,
|
||||||
// so overflow never happens.
|
// so overflow never happens.
|
||||||
let (id, tail) = match self.span {
|
let (id, tail) = match self.span {
|
||||||
|
&[1, hi, ref tail @ ..] => (u32::from_be_bytes([0, 0, 0, hi]), tail),
|
||||||
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
|
&[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail),
|
||||||
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
|
&[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail),
|
||||||
&[] => return None,
|
&[] => return None,
|
||||||
|
|
97
src/main.rs
97
src/main.rs
|
@ -1,13 +1,100 @@
|
||||||
use monokakido::MonokakidoDict;
|
use std::{io::{stdout, Write}, ops::Neg};
|
||||||
|
|
||||||
|
use monokakido::{MonokakidoDict, Error};
|
||||||
|
|
||||||
|
fn get_first_audio_id(page: &str) -> Result<&str, Error> {
|
||||||
|
if let Some((_, sound_tail)) = page.split_once("<sound>") {
|
||||||
|
if let Some((sound, _)) = sound_tail.split_once("</sound>") {
|
||||||
|
if let Some((head_id, _)) = sound.split_once(".aac") {
|
||||||
|
if let Some((_, id)) = head_id.split_once("href=\"") {
|
||||||
|
return Ok(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(Error::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_first_accent(page: &str) -> Result<i8, Error> {
|
||||||
|
if let Some((_, accent_tail)) = page.split_once("<accent_text>") {
|
||||||
|
if let Some((mut accent, _)) = accent_tail.split_once("</accent_text>") {
|
||||||
|
if let Some((a, _)) = accent.split_once("<sound>") {
|
||||||
|
accent = a;
|
||||||
|
}
|
||||||
|
if let Some(pos) = accent.find("<symbol_backslash>\</symbol_backslash>") {
|
||||||
|
let endpos = pos + "<symbol_backslash>\</symbol_backslash>".len();
|
||||||
|
let before = &accent[..pos];
|
||||||
|
let after = &accent[endpos..];
|
||||||
|
let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ'));
|
||||||
|
return Ok((before.chars().filter(is_mora).count() as i8));
|
||||||
|
}
|
||||||
|
if let Some(_) = accent.find("<symbol_macron>━</symbol_macron>") {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(Error::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_accents(page: &str) -> Result<(i8, Option<i8>), Error> {
|
||||||
|
if let Some((first, tail)) = page.split_once("</accent>") {
|
||||||
|
return Ok((get_first_accent(first)?, get_first_accent(tail).ok()));
|
||||||
|
}
|
||||||
|
Err(Error::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
|
||||||
|
let Some(key) = std::env::args().nth(1) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
/*
|
/*
|
||||||
for dict in MonokakidoDict::list().unwrap() {
|
for dict in MonokakidoDict::list().unwrap() {
|
||||||
dbg!(dict.unwrap());
|
dbg!(dict.unwrap());
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap();
|
let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap();
|
||||||
|
let mut accents = vec![];
|
||||||
|
let result = dict.keys.search_exact(&key);
|
||||||
|
match result {
|
||||||
|
Ok((_, pages)) => {
|
||||||
|
for id in pages{
|
||||||
|
let page = dict.pages.get(id).unwrap();
|
||||||
|
if let Ok(accent) = get_accents(page) {
|
||||||
|
accents.push(accent);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
let id = get_first_audio_id(page).unwrap();
|
||||||
|
let audio = dict.audio.get(id).unwrap();
|
||||||
|
let mut stdout = stdout().lock();
|
||||||
|
stdout.write_all(audio).unwrap();
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
println!("{:?}", e);
|
||||||
|
return;
|
||||||
|
},
|
||||||
|
}
|
||||||
|
print!("{key}\t");
|
||||||
|
accents.sort();
|
||||||
|
accents.dedup();
|
||||||
|
if accents.is_empty() {
|
||||||
|
print!("N/A");
|
||||||
|
} else {
|
||||||
|
for (accent_main, accent_sub) in accents {
|
||||||
|
print!("{accent_main}");
|
||||||
|
if let Some(accent_sub) = accent_sub {
|
||||||
|
if accent_main != accent_sub {
|
||||||
|
print!("/{accent_sub}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print!(" ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!()
|
||||||
|
|
||||||
|
/*
|
||||||
let idx_list = [
|
let idx_list = [
|
||||||
0,
|
0,
|
||||||
1,
|
1,
|
||||||
|
@ -48,7 +135,7 @@ fn main() {
|
||||||
|
|
||||||
println!("Index: length order");
|
println!("Index: length order");
|
||||||
for idx in idx_list {
|
for idx in idx_list {
|
||||||
let (word, pages) = dict.keys.get_index_a(idx).unwrap();
|
let (word, pages) = dict.keys.get_index_len(idx).unwrap();
|
||||||
println!("\n{}", word);
|
println!("\n{}", word);
|
||||||
for id in pages {
|
for id in pages {
|
||||||
println!("{}", dict.pages.get(id).unwrap());
|
println!("{}", dict.pages.get(id).unwrap());
|
||||||
|
@ -57,7 +144,7 @@ fn main() {
|
||||||
|
|
||||||
println!("Index: prefix order");
|
println!("Index: prefix order");
|
||||||
for idx in idx_list {
|
for idx in idx_list {
|
||||||
let (word, pages) = dict.keys.get_index_b(idx).unwrap();
|
let (word, pages) = dict.keys.get_index_prefix(idx).unwrap();
|
||||||
println!("\n{}", word);
|
println!("\n{}", word);
|
||||||
for id in pages {
|
for id in pages {
|
||||||
println!("{}", dict.pages.get(id).unwrap());
|
println!("{}", dict.pages.get(id).unwrap());
|
||||||
|
@ -66,7 +153,7 @@ fn main() {
|
||||||
|
|
||||||
println!("Index: suffix order");
|
println!("Index: suffix order");
|
||||||
for idx in idx_list {
|
for idx in idx_list {
|
||||||
let (word, pages) = dict.keys.get_index_c(idx).unwrap();
|
let (word, pages) = dict.keys.get_index_suffix(idx).unwrap();
|
||||||
println!("\n{}", word);
|
println!("\n{}", word);
|
||||||
for id in pages {
|
for id in pages {
|
||||||
println!("{}", dict.pages.get(id).unwrap());
|
println!("{}", dict.pages.get(id).unwrap());
|
||||||
|
@ -81,7 +168,7 @@ fn main() {
|
||||||
println!("{}", dict.pages.get(id).unwrap());
|
println!("{}", dict.pages.get(id).unwrap());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
//let mut stdout = stdout().lock();
|
//let mut stdout = stdout().lock();
|
||||||
//stdout.write_all(audio).unwrap();
|
//stdout.write_all(audio).unwrap();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue