diff --git a/README.md b/README.md index d156826..ea6881c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,24 @@ # monokakido.rs A Rust library for parsing and interpreting the [Monokakido](https://www.monokakido.jp/en/dictionaries/app/) dictionary format. Aiming for full test coverage and efficient implementation with minimal dependencies. + +## TODO: +- Refactor code for generic "rsc" and "nrsc" support +- Audio using "rsc" (CCCAD, WISDOM3) +- Audio using "nrsc" (DAIJISEN2, NHKACCENT2, OALD10, OLDAE, OLEX, OLT, RHEJ, SMK8) +- Multiple contents (WISDOM3, OLEX) +- Document the rsc, nrsc and keystore formats +- Split main.rs into "dict exploder" and "dict cli" + +## Planned to support: +- WISDOM3 +- SMK8 +- RHEJ +- OLT +- OLEX +- OLDAE +- OCD +- OALD10 +- NHKACCENT2 +- DAIJISEN2 +- CCCAD diff --git a/src/abi.rs b/src/abi_utils.rs similarity index 100% rename from src/abi.rs rename to src/abi_utils.rs diff --git a/src/audio.rs b/src/audio.rs index 1e8f272..1e34b8a 100644 --- a/src/audio.rs +++ b/src/audio.rs @@ -1,226 +1,85 @@ -use core::{mem::size_of, ops::Not}; -use std::{ - ffi::OsStr, - fs::{self, File}, - io::{Read, Seek, SeekFrom}, +use std::{path::PathBuf, ops::Range}; + +use crate::{ + dict::Paths, + resource::{Nrsc, Rsc}, + Error, }; -use miniz_oxide::inflate::core as zlib; - -use crate::{abi::TransmuteSafe, decompress, dict::Paths, ContentsFile, Error}; - -#[derive(Debug, Clone)] -pub(crate) struct AudioIndex { - idx: Vec, - ids: String, // contains null bytes as substring separators -} - -mod abi { - use std::mem::size_of; - - use crate::{audio::AudioFormat, Error}; - - #[repr(C)] - #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] - pub(crate) struct AudioIdxRecord { - format: u16, - fileseq: u16, - id_str_offset: u32, - file_offset: u32, - len: u32, - } - - impl AudioIdxRecord { - pub fn id_str_offset(&self) -> usize { - u32::from_le(self.id_str_offset) as usize - } - - pub(super) fn format(&self) -> Result { - match u16::from_le(self.format) { - 0 
=> Ok(AudioFormat::Acc), - 1 => Ok(AudioFormat::ZlibAcc), - _ => Err(Error::InvalidAudioFormat), - } - } - - pub fn fileseq(&self) -> usize { - u16::from_le(self.fileseq) as usize - } - - pub fn file_offset(&self) -> u64 { - u32::from_le(self.file_offset) as u64 - } - - pub fn len(&self) -> usize { - u32::from_le(self.len) as usize - } - } - - #[test] - fn test_audio_index() { - use crate::audio::AudioIndex; - let air = |id_str_offset| AudioIdxRecord { - format: 0, - fileseq: 0, - id_str_offset, - file_offset: 0, - len: 0, - }; - let mut audio_idx = AudioIndex { - idx: vec![air(0), air(1), air(3), air(6), air(10)], - ids: "\0a\0bb\0ccc\0dddd".to_owned(), - }; - - let diff = 8 + audio_idx.idx.len() * size_of::(); - // Fix offsets now that they are known - for air in audio_idx.idx.iter_mut() { - air.id_str_offset += diff as u32; - } - - dbg!(&audio_idx); - assert_eq!(audio_idx.get_id_at(diff + 0).unwrap(), ""); - assert_eq!(audio_idx.get_id_at(diff + 1).unwrap(), "a"); - assert_eq!(audio_idx.get_id_at(diff + 3).unwrap(), "bb"); - assert_eq!(audio_idx.get_id_at(diff + 4), Err(Error::InvalidIndex)); - assert_eq!(audio_idx.get_id_at(diff + 6).unwrap(), "ccc"); - assert_eq!(audio_idx.get_id_at(diff + 10), Err(Error::InvalidIndex)); - - audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned(); - let diff = diff as u32; - assert_eq!(audio_idx.get_by_id("").unwrap(), air(diff + 0)); - assert_eq!(audio_idx.get_by_id("a").unwrap(), air(diff + 1)); - assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(diff + 3)); - assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(diff + 6)); - assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(diff + 10)); - assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound)); - } -} - -pub(crate) use abi::AudioIdxRecord; - -enum AudioFormat { - Acc, - ZlibAcc, -} - -unsafe impl TransmuteSafe for AudioIdxRecord {} - -impl AudioIndex { - pub(crate) fn new(paths: &Paths) -> Result { - let mut file = File::open(paths.audio_idx_path()).map_err(|_| 
Error::FopenError)?; - let mut len = [0; 8]; - file.read_exact(&mut len).map_err(|_| Error::IOError)?; - let len = u32::from_le_bytes(len[4..8].try_into().unwrap()) as usize; - let file_size = file.metadata().map_err(|_| Error::IOError)?.len() as usize; - let idx_expected_size = size_of::() * len + 8; - let mut idx = vec![AudioIdxRecord::default(); len]; - let mut ids = String::with_capacity(file_size - idx_expected_size); - file.read_exact(AudioIdxRecord::slice_as_bytes_mut(idx.as_mut_slice())) - .map_err(|_| Error::IOError)?; - file.read_to_string(&mut ids).map_err(|_| Error::IOError)?; - Ok(Self { idx, ids }) - } - - fn get_id_at(&self, offset: usize) -> Result<&str, Error> { - let offset = offset - (size_of::() * self.idx.len() + 8); - if offset > 0 && &self.ids[offset - 1..offset] != "\0" { - return Err(Error::InvalidIndex); - } - let tail = &self.ids[offset..]; - let len = tail.find('\0').ok_or(Error::InvalidIndex)?; - Ok(&tail[..len]) - } - - pub fn get_by_id(&self, id: &str) -> Result { - let mut idx_err = Ok(()); - let i = self - .idx - .binary_search_by_key(&id, |idx| match self.get_id_at(idx.id_str_offset()) { - Ok(ok) => ok, - Err(err) => { - idx_err = Err(err); - "" - } - }) - .map_err(|_| Error::NotFound)?; - idx_err?; - - Ok(self.idx[i]) - } -} +const RSC_NAME: &str = "audio"; pub struct Audio { - index: AudioIndex, - audio: Vec, - read_buf: Vec, - decomp_buf: Vec, - zlib_state: zlib::DecompressorOxide, + path: PathBuf, + res: Option, +} + +enum AudioResource { + Rsc(Rsc), + Nrsc(Nrsc), } impl Audio { - fn parse_fname(fname: &OsStr) -> Option { - let fname = fname.to_str()?; - if fname.ends_with(".nrsc").not() { - return None; - } - u32::from_str_radix(&fname[..5], 10).ok() - } - - pub(crate) fn new(paths: &Paths) -> Result { - let mut audio = Vec::new(); - for entry in fs::read_dir(&paths.audio_path()).map_err(|_| Error::IOError)? 
{ - let entry = entry.map_err(|_| Error::IOError)?; - let seqnum = Audio::parse_fname(&entry.file_name()); - if let Some(seqnum) = seqnum { - audio.push(ContentsFile { - seqnum, - len: entry.metadata().map_err(|_| Error::IOError)?.len() as usize, - offset: 0, - file: File::open(entry.path()).map_err(|_| Error::IOError)?, - }); - } - } - audio.sort_by_key(|f| f.seqnum); - if Some(audio.len()) != audio.last().map(|a| a.seqnum as usize + 1) { - return Err(Error::NoContentFilesFound); - } - let index = AudioIndex::new(&paths)?; - Ok(Audio { - index, - audio, - read_buf: Vec::new(), - decomp_buf: Vec::new(), - zlib_state: zlib::DecompressorOxide::new(), + pub fn new(paths: &Paths) -> Result, Error> { + let mut path = paths.contents_path(); + path.push(RSC_NAME); + Ok(if path.exists() { + Some(Audio { path, res: None }) + } else { + None }) } - fn get_by_idx(&mut self, idx: AudioIdxRecord) -> Result<&[u8], Error> { - let file = &mut self.audio[idx.fileseq() as usize]; - - file.file - .seek(SeekFrom::Start(idx.file_offset())) - .map_err(|_| Error::IOError)?; - if self.read_buf.len() < idx.len() { - self.read_buf.resize(idx.len(), 0); - } - file.file - .read_exact(&mut self.read_buf[..idx.len()]) - .map_err(|_| Error::IOError)?; - - match idx.format()? { - AudioFormat::Acc => Ok(&self.read_buf[..idx.len()]), - AudioFormat::ZlibAcc => { - let n_out = decompress( - &mut self.zlib_state, - &self.read_buf[..idx.len()], - &mut self.decomp_buf, - )?; - Ok(&self.decomp_buf[..n_out]) - } + pub fn init(&mut self) -> Result<(), Error> { + if self.res.is_none() { + self.path.push("index.nidx"); + let nrsc_index_exists = self.path.exists(); + self.path.pop(); + self.res = Some(if nrsc_index_exists { + AudioResource::Nrsc(Nrsc::new(&self.path)?) + } else { + AudioResource::Rsc(Rsc::new(&self.path, RSC_NAME)?) + }); } + Ok(()) } pub fn get(&mut self, id: &str) -> Result<&[u8], Error> { - self.get_by_idx(self.index.get_by_id(id)?) 
+ self.init()?; + let Some(res) = self.res.as_mut() else { unreachable!() }; + match res { + AudioResource::Rsc(rsc) => { + rsc.get(u32::from_str_radix(id, 10).map_err(|_| Error::InvalidIndex)?) + } + AudioResource::Nrsc(nrsc) => nrsc.get(id), + } + } + + pub fn get_by_idx(&mut self, idx: usize) -> Result<(AudioId, &[u8]), Error> { + self.init()?; + let Some(res) = self.res.as_mut() else { unreachable!() }; + Ok(match res { + AudioResource::Rsc(rsc) => { + let (id, page) = rsc.get_by_idx(idx)?; + (AudioId::Num(id), page) + }, + AudioResource::Nrsc(nrsc) => { + let (id, page) = nrsc.get_by_idx(idx)?; + (AudioId::Str(id), page) + }, + }) + } + + pub fn idx_iter(&mut self) -> Result, Error> { + self.init()?; + let Some(res) = self.res.as_ref() else { unreachable!() }; + Ok(0..match res { + AudioResource::Rsc(rsc) => rsc.len(), + AudioResource::Nrsc(nrsc) => nrsc.len(), + }) } } + +pub enum AudioId<'a> { + Str(&'a str), + Num(u32) +} diff --git a/src/main.rs b/src/bin/cli.rs similarity index 55% rename from src/main.rs rename to src/bin/cli.rs index e185327..560e184 100644 --- a/src/main.rs +++ b/src/bin/cli.rs @@ -1,6 +1,9 @@ -use std::{io::{stdout, Write}, ops::Neg}; +use std::{ + io::{stdout, Write}, + ops::Neg, +}; -use monokakido::{MonokakidoDict, Error}; +use monokakido::{Error, MonokakidoDict}; fn get_first_audio_id(page: &str) -> Result<&str, Error> { if let Some((_, sound_tail)) = page.split_once("") { @@ -19,13 +22,16 @@ fn get_first_accent(page: &str) -> Result { if let Some((_, accent_tail)) = page.split_once("") { if let Some((mut accent, _)) = accent_tail.split_once("") { if let Some((a, _)) = accent.split_once("") { - accent = a; + accent = a; } if let Some(pos) = accent.find("") { let endpos = pos + "".len(); let before = &accent[..pos]; let after = &accent[endpos..]; - let is_mora = |&c: &char| (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') && !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ')); + let is_mora = |&c: &char| { + (matches!(c, 'ぁ'..='ん' | 'ァ'..='ン' | 'ー') + 
&& !matches!(c, 'ゃ'..='ょ' | 'ャ'..='ョ')) + }; return Ok((before.chars().filter(is_mora).count() as i8)); } if let Some(_) = accent.find("") { @@ -44,25 +50,27 @@ fn get_accents(page: &str) -> Result<(i8, Option), Error> { } fn main() { - let Some(key) = std::env::args().nth(1) else { return; }; - /* - for dict in MonokakidoDict::list().unwrap() { - dbg!(dict.unwrap()); - } - */ + + for dict in MonokakidoDict::list().unwrap() { + dbg!(dict.unwrap()); + } + let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap(); - let mut accents = vec![]; + // let mut accents = vec![]; let result = dict.keys.search_exact(&key); + match result { Ok((_, pages)) => { - for id in pages{ + for id in pages { let page = dict.pages.get(id).unwrap(); + println!("{page}"); + /* if let Ok(accent) = get_accents(page) { accents.push(accent); - } + } */ /* let id = get_first_audio_id(page).unwrap(); let audio = dict.audio.get(id).unwrap(); @@ -70,12 +78,13 @@ fn main() { stdout.write_all(audio).unwrap(); */ } - }, + } Err(e) => { println!("{:?}", e); return; - }, + } } + /* print!("{key}\t"); accents.sort(); accents.dedup(); @@ -91,84 +100,84 @@ fn main() { } print!(" "); } - } + } */ println!() -/* - let idx_list = [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 46200, - 46201, - 46202, - 46203, - 46204, - 46205, - 46206, - 46207, - 46208, - 46209, - 46210, - 46211, - 70000, - dict.keys.count() - 1, - ]; + /* + let idx_list = [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 46200, + 46201, + 46202, + 46203, + 46204, + 46205, + 46206, + 46207, + 46208, + 46209, + 46210, + 46211, + 70000, + dict.keys.count() - 1, + ]; - println!("Index: length order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_len(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } + println!("Index: 
length order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_len(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } - println!("Index: prefix order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_prefix(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } + println!("Index: prefix order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_prefix(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } - println!("Index: suffix order"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_suffix(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } + println!("Index: suffix order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_suffix(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } - println!("Index: ?"); - for idx in idx_list { - let (word, pages) = dict.keys.get_index_d(idx).unwrap(); - println!("\n{}", word); - for id in pages { - println!("{}", dict.pages.get(id).unwrap()); - } - } - */ + println!("Index: ?"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_d(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } + */ //let mut stdout = stdout().lock(); //stdout.write_all(audio).unwrap(); } diff --git a/src/bin/explode.rs b/src/bin/explode.rs new file mode 100644 index 0000000..fa6e67d --- /dev/null +++ b/src/bin/explode.rs @@ -0,0 +1,49 @@ +use std::{ + fmt::Write as _, + fs::{create_dir_all, File}, + io::Write, + path::Path, +}; + +use monokakido::{Error, MonokakidoDict}; + +fn explode() -> Result<(), Error> { + let arg = std::env::args().nth(1).ok_or(Error::InvalidArg)?; + + let 
mut dict = if Path::new(&arg).exists() { + MonokakidoDict::open_with_path(Path::new(&arg)) + } else { + MonokakidoDict::open(&arg) + }?; + let pages_dir = "./pages/"; + create_dir_all(pages_dir)?; + let mut path = String::from(pages_dir); + for idx in dict.pages.idx_iter()? { + let (id, page) = dict.pages.get_by_idx(idx)?; + write!(&mut path, "{id:0>10}.xml")?; + let mut file = File::create(&path)?; + path.truncate(pages_dir.len()); + file.write_all(page.as_bytes())?; + } + + if let Some(audio) = &mut dict.audio { + let audio_dir = "./audio/"; + create_dir_all(audio_dir)?; + let mut path = String::from(audio_dir); + for idx in audio.idx_iter()? { + let (id, page) = dict.pages.get_by_idx(idx)?; /* NOTE(review): idx comes from audio.idx_iter() but the data is read from dict.pages, so page text ends up in the .aac files; presumably this should be audio.get_by_idx(idx) with the returned bytes written directly, which needs AudioId to be formattable - confirm */ + write!(&mut path, "{id:0>10}.aac")?; + let mut file = File::create(&path)?; + path.truncate(audio_dir.len()); /* reset to this loop's own directory prefix, not pages_dir */ + file.write_all(page.as_bytes())?; + } + } + Ok(()) +} + +fn main() { + if let Err(err) = explode() { + eprintln!("{err:?}"); + return; + }; +} diff --git a/src/dict.rs b/src/dict.rs index b2fdb28..061e8aa 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -2,7 +2,6 @@ use miniserde::{json, Deserialize}; use std::{ ffi::OsStr, fs, - ops::Not, path::{Path, PathBuf}, }; @@ -11,7 +10,7 @@ use crate::{audio::Audio, key::Keys, pages::Pages, Error}; pub struct MonokakidoDict { paths: Paths, pub pages: Pages, - pub audio: Audio, + pub audio: Option