diff --git a/Cargo.lock b/Cargo.lock index a7ffb2a..d08b525 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,94 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "itoa" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" + +[[package]] +name = "mini-internal" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a63337614a1d280fdb2880599af563c99e9f388757f8d6515d785d85d14fb01" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "miniserde" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4313e4a66a442473e181963daf8c1e9def85c2d9fb0bb2ae59444260b28285" +dependencies = [ + "itoa", + "mini-internal", + "ryu", +] + +[[package]] +name = "miniz_oxide" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +dependencies = [ + "adler", +] + [[package]] name = "monokakido" version = "0.1.0" +dependencies = [ + "miniserde", + "miniz_oxide", +] + +[[package]] +name = "proc-macro2" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "syn" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" diff --git a/Cargo.toml b/Cargo.toml index 1fa9f96..8f99a9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,6 @@ name = "monokakido" version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] +miniz_oxide = { version = "0.6", default-features = false } +miniserde = "0.1" diff --git a/src/abi.rs b/src/abi.rs new file mode 100644 index 0000000..596c95e --- /dev/null +++ b/src/abi.rs @@ -0,0 +1,85 @@ +use core::{ + mem::{align_of, size_of}, + slice, +}; + +use crate::Error; + +#[repr(transparent)] +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct LE32(u32); + +impl LE32 { + pub fn read(self) -> u32 { + u32::from_le(self.0) + } + + pub fn us(self) -> usize { + self.read() as usize + } + + pub fn from(slice: &[u8]) -> Result<(Self, &[u8]), Error> { + if slice.len() < size_of::() { + return Err(Error::BufferTooSmall); + } + let (le32, tail) = slice.split_at(size_of::()); + Ok((LE32(u32::from_ne_bytes(le32.try_into().unwrap())), tail)) + } +} + +impl From for LE32 { + fn from(value: u32) -> Self { + Self(u32::from_le(value)) + } +} + +unsafe impl TransmuteSafe for LE32 {} + +pub(crate) unsafe trait TransmuteSafe: Default + Clone { + fn from_buf(buf: &[u8]) -> Result<(&Self, &[u8]), Error> { + if buf.len() < size_of::() { + return 
Err(Error::Transmute); + } + if buf.as_ptr() as usize % align_of::() != 0 { + return Err(Error::Transmute); + } + let (me, tail) = buf.split_at(size_of::()); + let me = unsafe { &*(me.as_ptr() as *const Self) }; + Ok((me, tail)) + } + + fn slice_from_buf(buf: &[u8], n: usize) -> Result<(&[Self], &[u8]), Error> { + if buf.len() < n * size_of::() { + return Err(Error::Transmute); + } + if buf.as_ptr() as usize % align_of::() != 0 { + return Err(Error::Transmute); + } + let tail = &buf[n * size_of::()..]; + let us: &[Self] = unsafe { slice::from_raw_parts(buf.as_ptr() as *const Self, n) }; + Ok((us, tail)) + } + + fn slice_as_bytes_mut(slice: &mut [Self]) -> &mut [u8] { + unsafe { + slice::from_raw_parts_mut( + slice.as_mut_ptr() as *mut u8, + slice.len() * size_of::(), + ) + } + } + + fn slice_as_bytes(slice: &[Self]) -> &[u8] { + unsafe { + slice::from_raw_parts(slice.as_ptr() as *const u8, slice.len() * size_of::()) + } + } + + fn as_bytes_mut(&mut self) -> &mut [u8] { + Self::slice_as_bytes_mut(slice::from_mut(self)) + } + + fn as_bytes(&self) -> &[u8] { + Self::slice_as_bytes(slice::from_ref(self)) + } +} diff --git a/src/audio.rs b/src/audio.rs new file mode 100644 index 0000000..9becc7a --- /dev/null +++ b/src/audio.rs @@ -0,0 +1,215 @@ +use core::{mem::size_of, ops::Not}; +use std::{ + ffi::OsStr, + fs::{self, File}, + io::{Read, Seek, SeekFrom}, +}; + +use miniz_oxide::inflate::core as zlib; + +use crate::{abi::TransmuteSafe, decompress, dict::Paths, ContentsFile, Error}; + +#[derive(Debug, Clone)] +pub(crate) struct AudioIndex { + idx: Vec, + ids: String, // contains null bytes as substring separators +} + +mod abi { + use crate::{audio::AudioFormat, Error}; + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] + pub(crate) struct AudioIdxRecord { + format: u16, + fileseq: u16, + id_str_offset: u32, + file_offset: u32, + len: u32, + } + + impl AudioIdxRecord { + pub fn id_str_offset(&self) -> usize { + u32::from_le(self.id_str_offset) as 
usize + } + + pub(super) fn format(&self) -> Result { + match u16::from_le(self.format) { + 0 => Ok(AudioFormat::Acc), + 1 => Ok(AudioFormat::ZlibAcc), + _ => Err(Error::InvalidAudioFormat), + } + } + + pub fn fileseq(&self) -> usize { + u16::from_le(self.fileseq) as usize + } + + pub fn file_offset(&self) -> u64 { + u32::from_le(self.file_offset) as u64 + } + + pub fn len(&self) -> usize { + u32::from_le(self.len) as usize + } + } + + #[test] + fn test_audio_index() { + use crate::audio::AudioIndex; + let air = |id_str_offset| AudioIdxRecord { + format: 0, + fileseq: 0, + id_str_offset, + file_offset: 0, + len: 0, + }; + let mut audio_idx = AudioIndex { + idx: vec![air(0), air(1), air(3), air(6), air(10)], + ids: "\0a\0bb\0ccc\0dddd".to_owned(), + }; + assert_eq!(audio_idx.get_id_at(0).unwrap(), ""); + assert_eq!(audio_idx.get_id_at(1).unwrap(), "a"); + assert_eq!(audio_idx.get_id_at(3).unwrap(), "bb"); + assert_eq!(audio_idx.get_id_at(4), Err(Error::InvalidIndex)); + assert_eq!(audio_idx.get_id_at(6).unwrap(), "ccc"); + assert_eq!(audio_idx.get_id_at(10), Err(Error::InvalidIndex)); + + audio_idx.ids = "\0a\0bb\0ccc\0dddd\0".to_owned(); + assert_eq!(audio_idx.get_by_id("").unwrap(), air(0)); + assert_eq!(audio_idx.get_by_id("a").unwrap(), air(1)); + assert_eq!(audio_idx.get_by_id("bb").unwrap(), air(3)); + assert_eq!(audio_idx.get_by_id("ccc").unwrap(), air(6)); + assert_eq!(audio_idx.get_by_id("dddd").unwrap(), air(10)); + assert_eq!(audio_idx.get_by_id("ddd"), Err(Error::NotFound)); + } +} + +pub(crate) use abi::AudioIdxRecord; + +enum AudioFormat { + Acc, + ZlibAcc, +} + +unsafe impl TransmuteSafe for AudioIdxRecord {} + +impl AudioIndex { + pub(crate) fn new(paths: &Paths) -> Result { + let mut file = File::open(paths.audio_idx_path()).map_err(|_| Error::FopenError)?; + let mut len = [0; 8]; + file.read_exact(&mut len).map_err(|_| Error::IOError)?; + let len = u32::from_le_bytes(len[4..8].try_into().unwrap()) as usize; + let file_size = 
file.metadata().map_err(|_| Error::IOError)?.len() as usize; + let idx_expected_size = size_of::() * len + 8; + let mut idx = vec![AudioIdxRecord::default(); len]; + let mut ids = String::with_capacity(file_size - idx_expected_size); + file.read_exact(AudioIdxRecord::slice_as_bytes_mut(idx.as_mut_slice())) + .map_err(|_| Error::IOError)?; + file.read_to_string(&mut ids).map_err(|_| Error::IOError)?; + Ok(Self { idx, ids }) + } + + fn get_id_at(&self, offset: usize) -> Result<&str, Error> { + let offset = offset - (size_of::() * self.idx.len() + 8); + if offset > 0 && &self.ids[offset - 1..offset] != "\0" { + return Err(Error::InvalidIndex); + } + let tail = &self.ids[offset..]; + let len = tail.find('\0').ok_or(Error::InvalidIndex)?; + Ok(&tail[..len]) + } + + pub fn get_by_id(&self, id: &str) -> Result { + let mut idx_err = Ok(()); + let i = self + .idx + .binary_search_by_key(&id, |idx| match self.get_id_at(idx.id_str_offset()) { + Ok(ok) => ok, + Err(err) => { + idx_err = Err(err); + "" + } + }) + .map_err(|_| Error::NotFound)?; + idx_err?; + + Ok(self.idx[i]) + } +} + +pub struct Audio { + index: AudioIndex, + audio: Vec, + read_buf: Vec, + decomp_buf: Vec, + zlib_state: zlib::DecompressorOxide, +} + +impl Audio { + fn parse_fname(fname: &OsStr) -> Option { + let fname = fname.to_str()?; + if fname.ends_with(".nrsc").not() { + return None; + } + u32::from_str_radix(&fname[..5], 10).ok() + } + + pub(crate) fn new(paths: &Paths) -> Result { + let mut audio = Vec::new(); + for entry in fs::read_dir(&paths.audio_path()).map_err(|_| Error::IOError)? 
{ + let entry = entry.map_err(|_| Error::IOError)?; + let seqnum = Audio::parse_fname(&entry.file_name()); + if let Some(seqnum) = seqnum { + audio.push(ContentsFile { + seqnum, + len: entry.metadata().map_err(|_| Error::IOError)?.len() as usize, + offset: 0, + file: File::open(entry.path()).map_err(|_| Error::IOError)?, + }); + } + } + audio.sort_by_key(|f| f.seqnum); + if Some(audio.len()) != audio.last().map(|a| a.seqnum as usize + 1) { + return Err(Error::NoContentFilesFound); + } + let index = AudioIndex::new(&paths)?; + Ok(Audio { + index, + audio, + read_buf: Vec::new(), + decomp_buf: Vec::new(), + zlib_state: zlib::DecompressorOxide::new(), + }) + } + + fn get_by_idx(&mut self, idx: AudioIdxRecord) -> Result<&[u8], Error> { + let file = &mut self.audio[idx.fileseq() as usize]; + + file.file + .seek(SeekFrom::Start(idx.file_offset())) + .map_err(|_| Error::IOError)?; + if self.read_buf.len() < idx.len() { + self.read_buf.resize(idx.len(), 0); + } + file.file + .read_exact(&mut self.read_buf[..idx.len()]) + .map_err(|_| Error::IOError)?; + + match idx.format()? { + AudioFormat::Acc => Ok(&self.read_buf[..idx.len()]), + AudioFormat::ZlibAcc => { + let n_out = decompress( + &mut self.zlib_state, + &self.read_buf[..idx.len()], + &mut self.decomp_buf, + )?; + Ok(&self.decomp_buf[..n_out]) + } + } + } + + pub fn get(&mut self, id: &str) -> Result<&[u8], Error> { + self.get_by_idx(self.index.get_by_id(id)?) 
+ } +} diff --git a/src/dict.rs b/src/dict.rs new file mode 100644 index 0000000..b2fdb28 --- /dev/null +++ b/src/dict.rs @@ -0,0 +1,155 @@ +use miniserde::{json, Deserialize}; +use std::{ + ffi::OsStr, + fs, + ops::Not, + path::{Path, PathBuf}, +}; + +use crate::{audio::Audio, key::Keys, pages::Pages, Error}; + +pub struct MonokakidoDict { + paths: Paths, + pub pages: Pages, + pub audio: Audio, + pub keys: Keys, +} + +#[derive(Deserialize, Debug)] +struct DictJson { + #[serde(rename = "DSProductContents")] + contents: Vec, +} + +#[derive(Deserialize, Debug)] +struct DSProductContents { + #[serde(rename = "DSContentDirectory")] + dir: String, +} + +pub(crate) struct Paths { + base_path: PathBuf, + name: String, + contents_dir: String, +} + +impl Paths { + fn std_list_path() -> PathBuf { + PathBuf::from( + "/Library/Application Support/AppStoreContent/jp.monokakido.Dictionaries/Products/", + ) + } + + fn std_dict_path(name: &str) -> PathBuf { + let mut path = Paths::std_list_path(); + path.push(format!("jp.monokakido.Dictionaries.{name}")); + path + } + + fn json_path(path: &Path, name: &str) -> PathBuf { + let mut pb = PathBuf::from(path); + pb.push("Contents"); + pb.push(format!("{name}.json")); + pb + } + + pub(crate) fn contents_path(&self) -> PathBuf { + let mut pb = PathBuf::from(&self.base_path); + pb.push("Contents"); + pb.push(&self.contents_dir); + pb.push("contents"); + pb + } + + pub(crate) fn audio_path(&self) -> PathBuf { + let mut pb = PathBuf::from(&self.base_path); + pb.push("Contents"); + pb.push(&self.contents_dir); + pb.push("audio"); + pb + } + + pub(crate) fn contents_idx_path(&self) -> PathBuf { + let mut pb = self.contents_path(); + pb.push("contents.idx"); + pb + } + + pub(crate) fn contents_map_path(&self) -> PathBuf { + let mut pb = self.contents_path(); + pb.push("contents.map"); + pb + } + + pub(crate) fn audio_idx_path(&self) -> PathBuf { + let mut pb = self.audio_path(); + pb.push("index.nidx"); + pb + } + + pub(crate) fn 
key_path(&self) -> PathBuf { + let mut pb = PathBuf::from(&self.base_path); + pb.push("Contents"); + pb.push(&self.contents_dir); + pb.push("key"); + pb + } + + pub(crate) fn headword_key_path(&self) -> PathBuf { + let mut pb = self.key_path(); + pb.push("headword.keystore"); + pb + } +} + +fn parse_dict_name(fname: &OsStr) -> Option<&str> { + let fname = fname.to_str()?; + if fname.starts_with("jp.monokakido.Dictionaries.").not() { + return None; + } + Some(&fname[27..]) +} + +impl MonokakidoDict { + pub fn list() -> Result>, Error> { + let iter = fs::read_dir(&Paths::std_list_path()).map_err(|_| Error::IOError)?; + Ok(iter.filter_map(|entry| { + entry + .map_err(|_| Error::IOError) + .map(|e| parse_dict_name(&e.file_name()).map(ToOwned::to_owned)) + .transpose() + })) + } + + pub fn open(name: &str) -> Result { + let std_path = Paths::std_dict_path(name); + Self::open_with_path(&std_path, name) + } + + pub fn name(&self) -> &str { + &self.paths.name + } + + pub fn open_with_path(path: impl Into, name: &str) -> Result { + let base_path = path.into(); + let json_path = Paths::json_path(&base_path, name); + let json = fs::read_to_string(json_path).map_err(|_| Error::NoDictJsonFound)?; + let mut json: DictJson = json::from_str(&json).map_err(|_| Error::InvalidDictJson)?; + let contents = json.contents.pop().ok_or(Error::InvalidDictJson)?; + let paths = Paths { + base_path, + name: name.to_owned(), + contents_dir: contents.dir, + }; + let pages = Pages::new(&paths)?; + let audio = Audio::new(&paths)?; + let keys = Keys::new(&paths)?; + + Ok(MonokakidoDict { + paths, + pages, + audio, + keys, + }) + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..ec93675 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,35 @@ +use std::{io::Error as IoError, str::Utf8Error}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Error { + Transmute, + Validate, + FopenError, + FstatError, + MmapError, + ZlibError, + Utf8Error, + RecordTooLarge, + 
IncorrectStreamLength, + BufferTooSmall, + IndexMismach, + NotFound, + NoDictJsonFound, + InvalidDictJson, + IOError, + NoContentFilesFound, + InvalidIndex, + InvalidAudioFormat, +} + +impl From for Error { + fn from(_: IoError) -> Self { + Error::IOError + } +} + +impl From for Error { + fn from(_: Utf8Error) -> Self { + Error::Utf8Error + } +} diff --git a/src/key.rs b/src/key.rs new file mode 100644 index 0000000..7fbc037 --- /dev/null +++ b/src/key.rs @@ -0,0 +1,249 @@ +use std::{ + fs::File, + io::{Read, Seek}, + mem::size_of, + str::from_utf8, +}; + +use crate::{ + abi::{TransmuteSafe, LE32}, + dict::Paths, + Error, +}; + +mod abi { + use super::*; + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default)] + pub(super) struct FileHeader { + magic1: LE32, + magic2: LE32, + pub words_offset: LE32, + pub idx_offset: LE32, + magic3: LE32, + magic4: LE32, + magic5: LE32, + magic6: LE32, + } + + impl FileHeader { + pub(super) fn validate(&self) -> Result<(), Error> { + if self.magic1.read() == 0x20000 + && self.magic2.read() == 0 + && self.magic3.read() == 0 + && self.magic4.read() == 0 + && self.magic5.read() == 0 + && self.magic6.read() == 0 + && self.words_offset.us() < self.idx_offset.us() + { + Ok(()) + } else { + Err(Error::Validate) + } + } + } + + unsafe impl TransmuteSafe for FileHeader {} + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default)] + pub(super) struct IndexHeader { + magic1: LE32, + pub index_a_offset: LE32, + pub index_b_offset: LE32, + pub index_c_offset: LE32, + pub index_d_offset: LE32, + } + + impl IndexHeader { + pub(super) fn validate(&self, file_end: usize) -> Result<(), Error> { + if self.magic1.read() == 0x04 + && self.index_a_offset.us() < self.index_b_offset.us() + && self.index_b_offset.us() < self.index_c_offset.us() + && self.index_c_offset.us() < self.index_d_offset.us() + && self.index_d_offset.us() < file_end + { + Ok(()) + } else { + Err(Error::Validate) + } + } + } + + unsafe impl TransmuteSafe for IndexHeader {} +} +use 
abi::{FileHeader, IndexHeader}; + +pub struct Keys { + words: Vec, + index_a: Vec, + index_b: Vec, + index_c: Vec, + index_d: Vec, +} + +impl Keys { + fn read_vec(file: &mut File, start: usize, end: usize) -> Result, Error> { + let size = (end - start + size_of::() - 1) / size_of::(); + let mut buf = vec![LE32::default(); size]; + file.read_exact(LE32::slice_as_bytes_mut(&mut buf))?; + Ok(buf) + } + + fn check_vec_len(buf: &Vec) -> Result<(), Error> { + if buf.get(0).ok_or(Error::InvalidIndex)?.us() + 1 != buf.len() { + return Err(Error::InvalidIndex); + } + Ok(()) + } + + pub(crate) fn new(paths: &Paths) -> Result { + let mut file = File::open(paths.headword_key_path())?; + let file_size = file.metadata()?.len() as usize; + let mut hdr = FileHeader::default(); + file.read_exact(hdr.as_bytes_mut())?; + hdr.validate()?; + + file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?; + let words = Self::read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?; + + if words.get(0).ok_or(Error::InvalidIndex)?.us() + 1 >= words.len() { + return Err(Error::InvalidIndex); + } + + let file_end = file_size - hdr.idx_offset.us(); + let mut ihdr = IndexHeader::default(); + file.seek(std::io::SeekFrom::Start(hdr.idx_offset.read() as u64))?; + file.read_exact(ihdr.as_bytes_mut())?; + ihdr.validate(file_end)?; + + let index_a = Self::read_vec( + &mut file, + ihdr.index_a_offset.us(), + ihdr.index_b_offset.us(), + )?; + Self::check_vec_len(&index_a)?; + + let index_b = Self::read_vec( + &mut file, + ihdr.index_b_offset.us(), + ihdr.index_c_offset.us(), + )?; + Self::check_vec_len(&index_b)?; + + let index_c = Self::read_vec( + &mut file, + ihdr.index_c_offset.us(), + ihdr.index_d_offset.us(), + )?; + Self::check_vec_len(&index_c)?; + + let index_d = Self::read_vec(&mut file, ihdr.index_d_offset.us(), file_end)?; + Self::check_vec_len(&index_d)?; + + Ok(Keys { + words, + index_a, + index_b, + index_c, + index_d, + }) + } + + pub fn count(&self) -> usize { + // 
USE INVARIANT A + self.words[0].us() + } + + fn get_page_iter(&self, pages_offset: usize) -> Result { + let pages = &LE32::slice_as_bytes(&self.words)[pages_offset..]; + PageIter::new(pages) + } + + pub(crate) fn get_word_span(&self, offset: usize) -> Result<(&str, usize), Error> { + let words_bytes = LE32::slice_as_bytes(&self.words); + if words_bytes.len() < offset + 2 * size_of::() { + return Err(Error::InvalidIndex); + } + let (pages_offset, word_bytes) = LE32::from(&words_bytes[offset..])?; + if let Some(word) = word_bytes[1..].split(|b| *b == b'\0').next() { + Ok((from_utf8(word)?, pages_offset.us())) + } else { + Err(Error::InvalidIndex) + } + } + + fn get_inner(&self, index: &[LE32], idx: usize) -> Result<(&str, PageIter<'_>), Error> { + if idx >= self.count() { + return Err(Error::NotFound); + } + let word_offset = index[idx + 1].us(); + let (word, pages_offset) = self.get_word_span(word_offset)?; + let pages = self.get_page_iter(pages_offset)?; + Ok((word, pages)) + } + + pub fn get_index_a(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_a, idx) + } + + pub fn get_index_b(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_b, idx) + } + + pub fn get_index_c(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_c, idx) + } + + pub fn get_index_d(&self, idx: usize) -> Result<(&str, PageIter<'_>), Error> { + self.get_inner(&self.index_d, idx) + } +} + +#[derive(Debug, Clone)] +pub struct PageIter<'a> { + count: u16, + span: &'a [u8], +} + +impl<'a> PageIter<'a> { + fn new(pages: &'a [u8]) -> Result { + let (count, pages) = pages.split_at(2); + let count = u16::from_le_bytes(count.try_into().unwrap()); + + // CHECK INVARIANT B: loop through `count` times and check that the shape is of expected + let mut tail = pages; + for _ in 0..count { + match tail { + &[2, _, _, ref t @ ..] => tail = t, + &[4, _, _, _, ref t @ ..] 
=> tail = t, + _ => return Err(Error::InvalidIndex), + } + } + let span_len = pages.len() - tail.len(); + Ok(PageIter { + span: &pages[..span_len], + count, + }) + } +} + +impl<'a> Iterator for PageIter<'a> { + type Item = u32; + + fn next(&mut self) -> Option { + // USE INVARIANT B: `self.span` is checked to conform to this shape, + // so unreachable is never reached. `self.count` is also checked to correspond, + // so overflow never happens. + let (id, tail) = match self.span { + &[2, hi, lo, ref tail @ ..] => (u32::from_be_bytes([0, 0, hi, lo]), tail), + &[4, hi, mid, lo, ref tail @ ..] => (u32::from_be_bytes([0, hi, mid, lo]), tail), + &[] => return None, + _ => unreachable!(), + }; + self.count -= 1; + self.span = tail; + Some(id) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..2ecaf0a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,61 @@ +use std::fs; + +use miniz_oxide::inflate::{core as zlib, TINFLStatus as ZStatus}; + +mod abi; +mod audio; +mod dict; +mod error; +mod key; +mod pages; + +pub use dict::MonokakidoDict; +pub use error::Error; +pub use pages::Pages; +pub use audio::Audio; +pub use key::Keys; + +fn decompress( + zlib_state: &mut zlib::DecompressorOxide, + in_buf: &[u8], + out_buf: &mut Vec, +) -> Result { + use zlib::inflate_flags as flg; + use ZStatus::{Done, HasMoreOutput}; + + let flags = flg::TINFL_FLAG_PARSE_ZLIB_HEADER | flg::TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + let mut n_in_total = 0; + let mut n_out_total = 0; + zlib_state.init(); + loop { + let (status, n_in, n_out) = zlib::decompress( + zlib_state, + &in_buf[n_in_total..], + out_buf, + n_out_total, + flags, + ); + n_out_total += n_out; + n_in_total += n_in; + match status { + HasMoreOutput => { + out_buf.resize(out_buf.len() * 2 + 1, 0); + continue; + } + Done => break, + _ => return Err(Error::ZlibError), + } + } + if n_in_total != in_buf.len() { + return Err(Error::IncorrectStreamLength); + } + Ok(n_out_total) +} + +#[derive(Debug)] +struct 
ContentsFile { + seqnum: u32, + len: usize, + offset: usize, + file: fs::File, +} diff --git a/src/main.rs b/src/main.rs index e7a11a9..53b4414 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,87 @@ +use monokakido::MonokakidoDict; + fn main() { - println!("Hello, world!"); + /* + for dict in MonokakidoDict::list().unwrap() { + dbg!(dict.unwrap()); + } + */ + let mut dict = MonokakidoDict::open("NHKACCENT2").unwrap(); + + let idx_list = [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 46200, + 46201, + 46202, + 46203, + 46204, + 46205, + 46206, + 46207, + 46208, + 46209, + 46210, + 46211, + 70000, + dict.keys.count() - 1, + ]; + + println!("Index: length order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_a(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } + + println!("Index: prefix order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_b(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } + + println!("Index: suffix order"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_c(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } + + println!("Index: ?"); + for idx in idx_list { + let (word, pages) = dict.keys.get_index_d(idx).unwrap(); + println!("\n{}", word); + for id in pages { + println!("{}", dict.pages.get(id).unwrap()); + } + } + + //let mut stdout = stdout().lock(); + //stdout.write_all(audio).unwrap(); } diff --git a/src/pages.rs b/src/pages.rs new file mode 100644 index 0000000..5498dbd --- /dev/null +++ b/src/pages.rs @@ -0,0 +1,444 @@ +use core::{cmp::min, mem::size_of, ops::Not}; +use miniz_oxide::inflate::core as zlib; +use std::{ + ffi::OsStr, + fs::{self, File}, + io::{Read, Seek, SeekFrom}, +}; + +use crate::{ + 
abi::{TransmuteSafe, LE32}, + decompress, + dict::Paths, + ContentsFile, Error, +}; + +mod abi { + use crate::abi::LE32; + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default)] + pub(crate) struct TextIdxRecord { + pub dic_item_id: LE32, + pub map_idx: LE32, + } + + #[repr(C)] + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)] + pub(crate) struct TextMapRecord { + pub zoffset: LE32, + pub ioffset: LE32, + } + + #[test] + fn test_get_by_id() { + use crate::{pages::PageIndex, Error}; + + fn idx(id: u32, idx: u32) -> TextIdxRecord { + TextIdxRecord { + dic_item_id: id.into(), + map_idx: idx.into(), + } + } + fn map(z: u32, i: u32) -> TextMapRecord { + TextMapRecord { + zoffset: z.into(), + ioffset: i.into(), + } + } + + assert_eq!( + PageIndex { + idx: vec![], + map: vec![], + } + .get_by_id(500), + Err(Error::NotFound) + ); + + assert_eq!( + PageIndex { + idx: vec![idx(1, 0)], + map: vec![map(0, 0)], + } + .get_by_id(500), + Err(Error::NotFound) + ); + + assert_eq!( + PageIndex { + idx: vec![idx(1, 0), idx(2, 1)], + map: vec![map(0, 0), map(0, 10)], + } + .get_by_id(500), + Err(Error::NotFound) + ); + + assert_eq!( + PageIndex { + idx: vec![idx(1, 0), idx(2, 1), idx(1000, 2)], + map: vec![map(0, 0), map(0, 10), map(0, 20)], + } + .get_by_id(500), + Err(Error::NotFound) + ); + + assert_eq!( + PageIndex { + idx: vec![idx(1, 0), idx(2, 1), idx(500, 2), idx(1000, 3)], + map: vec![map(0, 0), map(0, 10), map(0, 20), map(10, 0)], + } + .get_by_id(500), + Ok(map(0, 20)) + ); + + assert_eq!( + PageIndex { + idx: vec![ + idx(1, 0), + idx(2, 1), + idx(499, 2), + idx(500, 3), + idx(501, 4), + idx(1000, 5) + ], + map: vec![ + map(0, 0), + map(0, 10), + map(0, 20), + map(10, 0), + map(10, 0), + map(10, 0) + ], + } + .get_by_id(500), + Ok(map(10, 0)) + ); + } +} +pub(crate) use abi::{TextIdxRecord, TextMapRecord}; + +#[derive(Debug, Clone)] +pub(crate) struct PageIndex { + idx: Vec, + map: Vec, +} + +unsafe impl TransmuteSafe for TextMapRecord {} +unsafe impl 
TransmuteSafe for TextIdxRecord {} + +impl PageIndex { + pub(crate) fn new(paths: &Paths) -> Result { + let mut idx_file = File::open(paths.contents_idx_path())?; + let mut map_file = File::open(paths.contents_map_path())?; + let mut len = [0; 4]; + idx_file.read_exact(&mut len)?; + let len = u32::from_le_bytes(len) as usize; + idx_file.seek(SeekFrom::Start(8))?; + map_file.seek(SeekFrom::Start(8))?; + let idx_size = idx_file.metadata().map_err(|_| Error::IOError)?.len(); + let map_size = map_file.metadata().map_err(|_| Error::IOError)?.len(); + let idx_expected_size = (size_of::() * len + 8) as u64; + let map_expected_size = (size_of::() * len + 8) as u64; + if idx_size != idx_expected_size || map_size != map_expected_size { + return Err(Error::IncorrectStreamLength); + } + let mut idx = vec![TextIdxRecord::default(); len]; + let mut map = vec![TextMapRecord::default(); len]; + idx_file + .read_exact(TextIdxRecord::slice_as_bytes_mut(idx.as_mut_slice())) + .map_err(|_| Error::IOError)?; + map_file + .read_exact(TextMapRecord::slice_as_bytes_mut(map.as_mut_slice())) + .map_err(|_| Error::IOError)?; + Ok(PageIndex { idx, map }) + } + + fn get_idx_by_id(&self, id: u32) -> Option { + if self.idx.is_empty() { + return None; + } + // Let's guess first, since usually the IDs are completely predictable, without gaps. 
+ let idx_list = self.idx.as_slice(); + let idx = min(id as usize, idx_list.len() - 1); + let guess = idx_list[idx].dic_item_id.read(); + if id == guess { + return Some(idx); + } + let idx = min(id.saturating_sub(1) as usize, idx_list.len() - 1); + let guess = idx_list[idx].dic_item_id.read(); + if id == guess { + return Some(idx); + } + return idx_list + .binary_search_by_key(&id, |r| r.dic_item_id.read()) + .ok(); + } + + pub fn get_by_id(&self, id: u32) -> Result { + if let Some(idx) = self.get_idx_by_id(id) { + let record = self.map[self.idx[idx].map_idx.us()]; + Ok(record) + } else { + Err(Error::NotFound) + } + } +} + +pub struct Pages { + index: PageIndex, + contents: Vec, + zlib_buf: Vec, + zlib_state: zlib::DecompressorOxide, + contents_buf: Vec, + current_offset: usize, + current_len: usize, +} + +impl Pages { + fn parse_fname(fname: &OsStr) -> Option { + let fname = fname.to_str()?; + if (fname.starts_with("contents-") && fname.ends_with(".rsc")).not() { + return None; + } + u32::from_str_radix(&fname[9..13], 10).ok() + } + + pub(crate) fn new(paths: &Paths) -> Result { + let mut contents = Vec::new(); + for entry in fs::read_dir(&paths.contents_path()).map_err(|_| Error::IOError)? 
{ + let entry = entry.map_err(|_| Error::IOError)?; + let seqnum = Pages::parse_fname(&entry.file_name()); + if let Some(seqnum) = seqnum { + contents.push(ContentsFile { + seqnum, + len: entry.metadata().map_err(|_| Error::IOError)?.len() as usize, + offset: 0, + file: File::open(entry.path()).map_err(|_| Error::IOError)?, + }); + } + } + contents.sort_by_key(|f| f.seqnum); + let mut offset = 0; + for (i, cf) in contents.iter_mut().enumerate() { + if cf.seqnum != i as u32 + 1 { + return Err(Error::NoContentFilesFound); + } + cf.offset = offset; + offset += cf.len; + } + let index = PageIndex::new(&paths)?; + Ok(Pages { + index, + contents, + zlib_buf: Vec::new(), + zlib_state: zlib::DecompressorOxide::new(), + contents_buf: Vec::new(), + current_offset: 0, + current_len: 0, + }) + } + + fn load_contents(&mut self, zoffset: usize) -> Result<(), Error> { + let (file, file_offset) = file_offset(&mut self.contents, zoffset)?; + + let mut len = [0_u8; 4]; + file.seek(SeekFrom::Start(file_offset)) + .map_err(|_| Error::IOError)?; + file.read_exact(&mut len).map_err(|_| Error::IOError)?; + let len = u32::from_le_bytes(len) as usize; + if self.zlib_buf.len() < len { + self.zlib_buf.resize(len, 0); + } + file.read_exact(&mut self.zlib_buf[..len]) + .map_err(|_| Error::IOError)?; + + let n_out = decompress( + &mut self.zlib_state, + &self.zlib_buf[..len], + &mut self.contents_buf, + )?; + + self.current_len = n_out; + self.current_offset = zoffset; + + Ok(()) + } + + pub fn get(&mut self, id: u32) -> Result<&str, Error> { + self.get_by_idx(self.index.get_by_id(id)?) + } + + fn get_by_idx(&mut self, idx: TextMapRecord) -> Result<&str, Error> { + if self.contents_buf.is_empty() || idx.zoffset.us() != self.current_offset { + self.load_contents(idx.zoffset.us())?; + } + + let contents = &self.contents_buf[idx.ioffset.us()..self.current_len]; + let (len, contents_tail) = LE32::from(contents)?; + Ok(std::str::from_utf8(&contents_tail[..len.us()]).map_err(|_| Error::Utf8Error)?) 
+ } +} + +fn file_offset(contents: &mut [ContentsFile], offset: usize) -> Result<(&mut File, u64), Error> { + let file_idx = contents + .binary_search_by(|cf| cmp_range(offset, cf.offset..cf.offset + cf.len).reverse()) + .map_err(|_| Error::InvalidIndex)?; + let cf = &mut contents[file_idx]; + let file = &mut cf.file; + let file_offset = (offset - cf.offset) as u64; + Ok((file, file_offset)) +} + +#[test] +fn test_file_offset() { + use std::os::unix::prelude::AsRawFd; + + assert_eq!(file_offset(&mut [], 0).err(), Some(Error::InvalidIndex)); + + let mock_file = || { + let f = File::open("/dev/zero").unwrap(); + let fd = f.as_raw_fd(); + (f, fd) + }; + let (f1, f1_fd) = mock_file(); + let one_file = &mut vec![ContentsFile { + seqnum: 1, + len: 100, + offset: 0, + file: f1, + }]; + + let result = file_offset(one_file, 101); + assert_eq!(result.err(), Some(Error::InvalidIndex)); + + let result = file_offset(one_file, 100); + assert_eq!(result.err(), Some(Error::InvalidIndex)); + + let result = file_offset(one_file, 0); + assert_eq!(result.as_ref().map(|f| f.0.as_raw_fd()), Ok(f1_fd)); + assert_eq!(result.as_ref().map(|f| f.1), Ok(0)); + + let result = file_offset(one_file, 99); + assert_eq!(result.as_ref().map(|f| f.0.as_raw_fd()), Ok(f1_fd)); + assert_eq!(result.as_ref().map(|f| f.1), Ok(99)); + + let (f1, f1_fd) = mock_file(); + let (f2, f2_fd) = mock_file(); + let two_files = &mut vec![ + ContentsFile { + seqnum: 1, + len: 100, + offset: 0, + file: f1, + }, + ContentsFile { + seqnum: 2, + len: 200, + offset: 100, + file: f2, + }, + ]; + + let result = file_offset(two_files, 301); + assert_eq!(result.err(), Some(Error::InvalidIndex)); + + let result = file_offset(two_files, 300); + assert_eq!(result.err(), Some(Error::InvalidIndex)); + + let result = file_offset(two_files, 0); + assert_eq!(result.as_ref().map(|f| f.0.as_raw_fd()), Ok(f1_fd)); + assert_eq!(result.as_ref().map(|f| f.1), Ok(0)); + + let result = file_offset(two_files, 99); + 
/// Checks that `file_offset` resolves global offsets to the right file and in-file offset
/// for zero, one, two and three contents files, and rejects offsets past the end.
///
/// The test identifies files by raw file descriptor and opens `/dev/zero` as a stand-in,
/// so it is compiled on Unix targets only.
#[cfg(unix)]
#[test]
fn test_file_offset() {
    use std::os::unix::prelude::AsRawFd;

    /// Lays out one mock file per entry of `lens` back to back (starting at offset 0) and
    /// runs every case against that layout.
    ///
    /// A case is `(global offset, expected)`, where `expected` is
    /// `Some((index of the file, offset inside that file))`, or `None` when the lookup
    /// must fail with `Error::InvalidIndex`.
    fn check(lens: &[usize], cases: &[(usize, Option<(usize, u64)>)]) {
        let mut contents = Vec::with_capacity(lens.len());
        let mut start = 0;
        for (i, &len) in lens.iter().enumerate() {
            contents.push(ContentsFile {
                seqnum: i as u32 + 1,
                len,
                offset: start,
                // Only the identity of the descriptor matters; nothing is read from it.
                file: File::open("/dev/zero").unwrap(),
            });
            start += len;
        }
        let fds: Vec<_> = contents.iter().map(|cf| cf.file.as_raw_fd()).collect();

        for &(offset, expected) in cases {
            let result = file_offset(&mut contents, offset);
            match expected {
                Some((file_idx, within)) => {
                    assert_eq!(
                        result.as_ref().map(|f| f.0.as_raw_fd()),
                        Ok(fds[file_idx]),
                        "wrong file for offset {}",
                        offset
                    );
                    assert_eq!(
                        result.as_ref().map(|f| f.1),
                        Ok(within),
                        "wrong in-file offset for offset {}",
                        offset
                    );
                }
                None => assert_eq!(
                    result.err(),
                    Some(Error::InvalidIndex),
                    "offset {} should be rejected",
                    offset
                ),
            }
        }
    }

    // No files at all: every offset is invalid.
    assert_eq!(file_offset(&mut [], 0).err(), Some(Error::InvalidIndex));

    // One file covering 0..100.
    check(
        &[100],
        &[
            (101, None),
            (100, None),
            (0, Some((0, 0))),
            (99, Some((0, 99))),
        ],
    );

    // Two files covering 0..100 and 100..300.
    check(
        &[100, 200],
        &[
            (301, None),
            (300, None),
            (0, Some((0, 0))),
            (99, Some((0, 99))),
            (100, Some((1, 0))),
            (299, Some((1, 199))),
        ],
    );

    // Three files covering 0..100, 100..300 and 300..400.
    check(
        &[100, 200, 100],
        &[
            (401, None),
            (400, None),
            (0, Some((0, 0))),
            (99, Some((0, 99))),
            (100, Some((1, 0))),
            (299, Some((1, 199))),
            (300, Some((2, 0))),
            (399, Some((2, 99))),
        ],
    );
}
/// Orders `num` relative to the half-open interval `range` (`start..end`).
///
/// Returns `Ordering::Less` when `num` is below `range.start`, `Ordering::Greater` when
/// `num` is at or past `range.end`, and `Ordering::Equal` when `num` lies inside the range.
/// An empty range contains nothing, so `num == start == end` yields `Ordering::Greater`.
fn cmp_range(num: usize, range: core::ops::Range<usize>) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    if num < range.start {
        Ordering::Less
    } else if num >= range.end {
        // `end` is exclusive, so `num == end` already counts as past the range.
        Ordering::Greater
    } else {
        Ordering::Equal
    }
}

/// Pins the boundary behaviour of `cmp_range`, including empty ranges.
#[test]
fn test_cmp_to_range() {
    use core::cmp::Ordering;
    assert_eq!(cmp_range(0, 0..0), Ordering::Greater);
    assert_eq!(cmp_range(0, 0..1), Ordering::Equal);
    assert_eq!(cmp_range(0, 0..100), Ordering::Equal);
    assert_eq!(cmp_range(1, 0..100), Ordering::Equal);
    assert_eq!(cmp_range(99, 0..100), Ordering::Equal);
    assert_eq!(cmp_range(100, 0..100), Ordering::Greater);
    assert_eq!(cmp_range(101, 0..100), Ordering::Greater);
    assert_eq!(cmp_range(0, 1..100), Ordering::Less);
    assert_eq!(cmp_range(99, 100..100), Ordering::Less);
    assert_eq!(cmp_range(100, 100..100), Ordering::Greater);
}