diff --git a/Cargo.lock b/Cargo.lock index a695f01..ec5c4ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -51,6 +51,7 @@ version = "0.2.0" dependencies = [ "miniserde", "miniz_oxide", + "xmlparser", ] [[package]] @@ -93,3 +94,9 @@ name = "unicode-ident" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" + +[[package]] +name = "xmlparser" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" diff --git a/Cargo.toml b/Cargo.toml index 68388ea..5528b90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,4 @@ license = "MIT" [dependencies] miniz_oxide = { version = "0.6", default-features = false } miniserde = "0.1" +xmlparser = "0.13.5" diff --git a/src/bin/cli.rs b/src/bin/cli.rs index 4e9a12b..7920c05 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -1,184 +1,92 @@ -use std::{ - io::{stdout, Write}, - ops::Neg, -}; - use monokakido::{Error, MonokakidoDict}; -fn get_first_audio_id(page: &str) -> Result<&str, Error> { - if let Some((_, sound_tail)) = page.split_once("") { - if let Some((sound, _)) = sound_tail.split_once("") { - if let Some((head_id, _)) = sound.split_once(".aac") { - if let Some((_, id)) = head_id.split_once("href=\"") { - return Ok(id); +fn print_help() { + println!("Monokakido CLI. Supported subcommands:"); + println!("list - lists all dictionaries installed in the standard path"); + println!("list_items {{dict}} {{keyword}} - lists all items"); + println!("list_audio {{dict}} {{keyword}} - lists all audio files"); + println!("help - this help"); +} + +fn list_items(dict_name: &str, keyword: &str) -> Result<(), Error> { + let mut dict = MonokakidoDict::open(dict_name)?; + let (_, items) = dict.keys.search_exact(keyword)?; + + for id in items { + let item = dict.pages.get_item(id)?; + println!("{item}"); + } + Ok(()) +} + +fn list_pages(dict_name: &str, keyword: &str) -> Result<(), Error> { + let mut dict = MonokakidoDict::open(dict_name)?; + let (_, items) = dict.keys.search_exact(keyword)?; + + for id in items { + let page = dict.pages.get_page(id)?; + println!("{page}"); + } + Ok(()) +} + +fn list_audio(dict_name: &str, keyword: &str) -> Result<(), Error> { + let mut dict = MonokakidoDict::open(dict_name)?; + let (_, items) = dict.keys.search_exact(keyword)?; + + for id in items { + for audio in dict.pages.get_item_audio(id)? { + if let Some((_, audio)) = audio?.split_once("href=\"") { + if let Some((id, _)) = audio.split_once('"') { + println!("{id}"); } } } } - Err(Error::NotFound) + Ok(()) } -fn get_first_accent(page: &str) -> Result { - if let Some((_, accent_tail)) = page.split_once("") { - if let Some((mut accent, _)) = accent_tail.split_once("") { - if let Some((a, _)) = accent.split_once("") { - accent = a; - } - if let Some(pos) = accent.find("") { - let endpos = pos + "".len(); - let before = &accent[..pos]; - let after = &accent[endpos..]; - let is_mora = |&c: &char| { - (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') - && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ')) - }; - return Ok((before.chars().filter(is_mora).count() as i8)); - } - if let Some(_) = accent.find("") { - return Ok(0); - } - } +fn list_dicts() -> Result<(), Error> { + for dict in MonokakidoDict::list()? { + println!("{}", dict?); } - Err(Error::NotFound) -} - -fn get_accents(page: &str) -> Result<(i8, Option), Error> { - if let Some((first, tail)) = page.split_once("") { - return Ok((get_first_accent(first)?, get_first_accent(tail).ok())); - } - Err(Error::NotFound) + Ok(()) } fn main() { - let Some(key) = std::env::args().nth(1) else { - return; + let mut args = std::env::args(); + let res = match args.nth(1).as_deref() { + Some("list_audio") => { + if let (Some(dict_name), Some(keyword)) = (args.next(), args.next()) { + list_audio(&dict_name, &keyword) + } else { + Err(Error::InvalidArg) + } + } + Some("list_items") => { + if let (Some(dict_name), Some(keyword)) = (args.next(), args.next()) { + list_items(&dict_name, &keyword) + } else { + Err(Error::InvalidArg) + } + } + Some("list_pages") => { + if let (Some(dict_name), Some(keyword)) = (args.next(), args.next()) { + list_pages(&dict_name, &keyword) + } else { + Err(Error::InvalidArg) + } + } + Some("list") => list_dicts(), + None | Some("help") => { + print_help(); + Ok(()) + } + _ => Err(Error::InvalidArg), }; - for dict in MonokakidoDict::list().unwrap() { - dbg!(dict.unwrap()); + if let Err(e) = res { + eprintln!("Error: {e:?}"); + std::process::exit(1) } - - let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap(); - // let mut accents = vec![]; - let result = dict.keys.search_exact(&key); - - match result { - Ok((_, pages)) => { - for id in pages { - let page = dict.pages.get(id.page).unwrap(); - println!("{page}"); - /* - if let Ok(accent) = get_accents(page) { - accents.push(accent); - } */ - /* - let id = get_first_audio_id(page).unwrap(); - let audio = dict.audio.get(id).unwrap(); - let mut stdout = stdout().lock(); - stdout.write_all(audio).unwrap(); - */ - } - } - Err(e) => { - println!("{:?}", e); - return; - } - } - /* - print!("{key}\t"); - accents.sort(); - accents.dedup(); - if accents.is_empty() { - print!("N/A"); - } else { - for (accent_main, accent_sub) in accents { - print!("{accent_main}"); - if let Some(accent_sub) = accent_sub { - if accent_main != accent_sub { - print!("/{accent_sub}"); - } - } - print!(" "); - } - } */ - - /* - let idx_list = [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 46200, - 46201, - 46202, - 46203, - 46204, - 46205, - 46206, - 46207, - 46208, - 46209, - 46210, - 46211, - 70000, - dict.keys.count() - 1, - ]; - - println!("Index: length order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_len(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } - - println!("Index: prefix order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_prefix(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } - - println!("Index: suffix order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_suffix(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } - - println!("Index: ?"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_d(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } - */ - let mut audio_rsc = dict.audio.unwrap(); - let audio = audio_rsc.get("jee").unwrap(); - let mut stdout = stdout().lock(); - stdout.write_all(audio).unwrap(); } diff --git a/src/bin/explode.rs b/src/bin/explode.rs index 6fb2706..ea3cd7d 100644 --- a/src/bin/explode.rs +++ b/src/bin/explode.rs @@ -18,7 +18,7 @@ fn write_index(dict: &MonokakidoDict, index: &KeyIndex, tsv_fname: &str) -> Resu for PageItemId { page, item } in pages { write!(&mut index_tsv, "\t{page:0>10}")?; if item > 0 { - write!(&mut index_tsv, ":{item:0>3}")?; + write!(&mut index_tsv, "-{item:0>3}")?; } } index_tsv.write_all(b"\n")?; @@ -37,7 +37,7 @@ fn explode() -> Result<(), Error> { create_dir_all(&pages_dir)?; let mut path = String::from(&pages_dir); for idx in dict.pages.idx_iter()? { - let (id, page) = dict.pages.get_by_idx(idx)?; + let (id, page) = dict.pages.page_by_idx(idx)?; write!(&mut path, "{id:0>10}.xml")?; let mut file = File::create(&path)?; path.truncate(pages_dir.len()); diff --git a/src/error.rs b/src/error.rs index cede223..61809cf 100644 --- a/src/error.rs +++ b/src/error.rs @@ -24,6 +24,7 @@ pub enum Error { InvalidArg, FmtError, IndexDoesntExist, + XmlError, } impl From for Error { @@ -43,3 +44,9 @@ impl From for Error { Error::FmtError } } + +impl From for Error { + fn from(_: xmlparser::Error) -> Self { + Error::XmlError + } +} diff --git a/src/key.rs b/src/key.rs index 2909e4b..4aaa585 100644 --- a/src/key.rs +++ b/src/key.rs @@ -342,6 +342,7 @@ impl<'a> Iterator for PageIter<'a> { } } +#[derive(Debug, Clone, Copy)] pub struct PageItemId { pub page: u32, pub item: u8, diff --git a/src/lib.rs b/src/lib.rs index 6afd41c..f81d773 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,4 +10,4 @@ pub use audio::Audio; pub use dict::MonokakidoDict; pub use error::Error; pub use key::{KeyIndex, Keys, PageItemId}; -pub use pages::Pages; +pub use pages::{Pages, XmlParser}; diff --git a/src/pages.rs b/src/pages.rs index 2ebe60c..0ca6999 100644 --- a/src/pages.rs +++ b/src/pages.rs @@ -1,6 +1,6 @@ use std::{ops::Range, path::PathBuf}; -use crate::{dict::Paths, resource::Rsc, Error}; +use crate::{dict::Paths, resource::Rsc, Error, PageItemId}; const RSC_NAME: &str = "contents"; @@ -9,6 +9,75 @@ pub struct Pages { res: Option, } +pub struct XmlParser<'a> { + xml: &'a str, + tokens: xmlparser::Tokenizer<'a>, + target_level: Option, + tag_stack: Vec<(&'a str, usize)>, +} + +impl<'a> XmlParser<'a> { + pub fn from(xml: &'a str) -> Self { + Self { + xml, + tokens: xmlparser::Tokenizer::from(xml), + target_level: None, + tag_stack: Vec::new(), + } + } + + pub fn next_fragment_by( + &mut self, + elem_cond: impl Fn(&str) -> bool, + attr_cond: impl Fn(&str, &str) -> bool, + ) -> Result, Error> { + use xmlparser::{ + ElementEnd::{Close, Empty}, + Token::{Attribute, ElementEnd, ElementStart}, + }; + + for token in &mut self.tokens { + let mut popped = None; + let token = token?; + match token { + ElementStart { local, span, .. } => { + self.tag_stack.push((local.as_str(), span.start())); + if elem_cond(&local) && self.target_level.is_none() { + self.target_level = Some(self.tag_stack.len()); + } + } + Attribute { local, value, .. } => { + if attr_cond(&local, &value) && self.target_level.is_none() { + self.target_level = Some(self.tag_stack.len()); + } + } + ElementEnd { + end: Close(_, tag), + span, + } => { + if Some(&*tag) == self.tag_stack.last().map(|(t, _)| *t) { + popped = self.tag_stack.pop().map(|(_, start)| (start, span.end())); + } else { + return Err(Error::XmlError); + } + } + ElementEnd { end: Empty, span } => { + popped = self.tag_stack.pop().map(|(_, start)| (start, span.end())); + } + _ => continue, + } + if let Some((start, end)) = popped { + if Some(self.tag_stack.len()) < self.target_level { + self.target_level = None; + return Ok(Some(&self.xml[start..end])); + } + } + } + // No body fragment or item fragment with suitable ID found + Ok(None) + } +} + impl Pages { pub fn new(paths: &Paths) -> Result { Ok(Pages { @@ -24,13 +93,43 @@ impl Pages { Ok(()) } - pub fn get(&mut self, id: u32) -> Result<&str, Error> { + pub fn get_page(&mut self, id: PageItemId) -> Result<&str, Error> { self.init()?; let Some(res) = self.res.as_mut() else { unreachable!() }; - std::str::from_utf8(res.get(id)?).map_err(|_| Error::Utf8Error) + let xml = std::str::from_utf8(res.get(id.page)?).map_err(|_| Error::Utf8Error)?; + Ok(xml) } - pub fn get_by_idx(&mut self, idx: usize) -> Result<(u32, &str), Error> { + pub fn get_item(&mut self, id: PageItemId) -> Result<&str, Error> { + let xml = self.get_page(id)?; + let mut parser = XmlParser::from(xml); + if id.item == 0 { + parser.next_fragment_by(|tag| tag == "body", |_, _| false) + } else { + parser.next_fragment_by( + |_| false, + |name, value| { + if name == "id" { + if let Some((page, item)) = value.split_once('-') { + if page.parse() == Ok(id.page) && item.parse() == Ok(id.item) { + return true; + } + } + } + false + }, + ) + }? + .ok_or(Error::XmlError) + } + + pub fn get_item_audio(&mut self, id: PageItemId) -> Result { + let xml = self.get_item(id)?; + let parser = XmlParser::from(xml); + Ok(AudioIter { parser }) + } + + pub fn page_by_idx(&mut self, idx: usize) -> Result<(u32, &str), Error> { self.init()?; let Some(res) = self.res.as_mut() else { unreachable!() }; let (id, page) = res.get_by_idx(idx)?; @@ -43,3 +142,20 @@ impl Pages { Ok(0..res.len()) } } + +pub struct AudioIter<'a> { + parser: XmlParser<'a>, +} + +impl<'a> Iterator for AudioIter<'a> { + type Item = Result<&'a str, Error>; + + fn next(&mut self) -> Option { + self.parser + .next_fragment_by( + |_| false, + |name, value| name == "href" && value.ends_with(".aac"), + ) + .transpose() + } +}