Start implementing headline support.
parent 4b70b0708d
commit 09915cabc5

README.md (32 changed lines)
@@ -3,12 +3,38 @@
 A Rust library for parsing and interpreting the [Monokakido](https://www.monokakido.jp/en/dictionaries/app/) dictionary format. Aiming for full test coverage and efficient implementation with minimal dependencies.

 ## TODO:
-- Refactor code for generic "rsc" and "nrsc" support
+- Add headline support
+- Refactor as a workspace to separate the dependencies of the library and the binaries
+- Move to mmap-based indexes
+- Add graphics support
+- Add TTY detection to CLI (prevent binary output to shell)
+- Add proper argument parser lib to CLI
+- Refine CLI according to the plan below
+- Document the rsc, nrsc and keystore and headline formats
+
+### Test:
 - Audio using "rsc" (CCCAD, WISDOM3)
 - Audio using "nrsc" (DAIJISEN2, NHKACCENT2, OALD10, OLDAE, OLEX, OLT, RHEJ, SMK8)
 - Multiple contents (WISDOM3, OLEX)
-- Document the rsc, nrsc and keystore formats
-- Split main.rs into "dict exploder" and "dict cli"
+
+## CLI (Planned)
+
+### Tab-separated output formats:
+- keyword
+- headline
+- iid (item id)
+- pid (page id)
+- aid (audio id)
+- gid (graphics id)
+
+### \n\n separated output formats:
+- item
+- page
+
+### binary output formats:
+- audio
+- graphics
+

 ## Planned to support:
 - WISDOM3
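The tab-separated output is only a plan at this point. As a rough sketch of the intended shape (the struct and the values below are made up for illustration; nothing in the crate produces this yet):

```rust
// Hypothetical sketch of the planned tab-separated listing; illustrative only.
struct ListedItem<'a> {
    keyword: &'a str,
    headline: &'a str,
    iid: u32, // item id
    pid: u32, // page id
}

fn main() {
    let items = [ListedItem { keyword: "辞書", headline: "じ‐しょ【辞書】", iid: 1, pid: 42 }];
    for it in &items {
        // One record per line, columns separated by tabs.
        println!("{}\t{}\t{}\t{}", it.keyword, it.headline, it.iid, it.pid);
    }
}
```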
src/abi_utils.rs
@@ -2,6 +2,7 @@ use core::{
     mem::{align_of, size_of},
     slice,
 };
+use std::{fs::File, io::Read};

 use crate::Error;

@@ -34,6 +35,7 @@ impl From<u32> for LE32 {
 }

 unsafe impl TransmuteSafe for LE32 {}
+unsafe impl TransmuteSafe for u8 {}

 pub(crate) unsafe trait TransmuteSafe: Default + Clone {
     fn from_buf(buf: &[u8]) -> Result<(&Self, &[u8]), Error> {
@@ -83,3 +85,14 @@ pub(crate) unsafe trait TransmuteSafe: Default + Clone {
         Self::slice_as_bytes(slice::from_ref(self))
     }
 }
+
+pub(crate) fn read_vec<T: TransmuteSafe>(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<T>>, Error> {
+    if start == 0 || end == 0 {
+        return Ok(None);
+    }
+    // Replace this with div_ceil once it stabilizes
+    let size = (end - start + size_of::<T>() - 1) / size_of::<T>();
+    let mut buf = vec![T::default(); size];
+    file.read_exact(T::slice_as_bytes_mut(&mut buf))?;
+    Ok(Some(buf))
+}
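The new `read_vec` generalizes the `LE32`-only helper removed from `src/key.rs` below: any `TransmuteSafe` record type can be read from a byte range of the file into a `Vec<T>`, with `Ok(None)` signalling a missing section (offset 0). A minimal usage sketch; the `Rec` type and the error handling here are hypothetical, the real call sites are in `key.rs` and `headline.rs`:

```rust
// Sketch only: a #[repr(C)] record type that implements TransmuteSafe, read
// from the bytes in start..end. read_vec does not seek; the caller positions
// the file cursor first, as key.rs and headline.rs do.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
struct Rec {
    id: LE32,
    offset: LE32,
}
unsafe impl TransmuteSafe for Rec {}

fn load_recs(file: &mut File, start: usize, end: usize) -> Result<Vec<Rec>, Error> {
    match read_vec::<Rec>(file, start, end)? {
        Some(recs) => Ok(recs),
        None => Err(Error::InvalidIndex), // section absent (offset of 0)
    }
}
```

The `div_ceil` comment carried over from the old helper refers to integer `div_ceil`, which has since been stabilized, so the size computation could eventually become `(end - start).div_ceil(size_of::<T>())`.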
src/main.rs
@@ -7,6 +7,7 @@ fn print_help() {
     println!("list - lists all dictionaries installed in the standard path");
     println!("list_items {{dict}} {{keyword}} - lists all items");
     println!("list_audio {{dict}} {{keyword}} - lists all audio files");
+    println!("get_audio {{dict}} {{id}} - writes an audio file to stdout");
     println!("help - this help");
 }

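`get_audio` writes raw bytes to stdout, which is what the new "Add TTY detection to CLI (prevent binary output to shell)" item in the README is about. One possible shape for that check, using `std::io::IsTerminal` from the standard library; this is not part of this commit, just a sketch of the TODO:

```rust
use std::io::{self, IsTerminal, Write};

// Sketch of the TTY guard mentioned in the README TODO (not in this commit):
// refuse to dump binary data when stdout is an interactive terminal.
fn write_binary_to_stdout(bytes: &[u8]) -> io::Result<()> {
    let stdout = io::stdout();
    if stdout.is_terminal() {
        eprintln!("refusing to write binary data to a terminal; redirect stdout to a file");
        return Ok(());
    }
    stdout.lock().write_all(bytes)
}
```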
src/dict.rs (16 changed lines)
@@ -67,11 +67,25 @@ impl Paths {
         pb
     }

-    pub(crate) fn headword_key_path(&self) -> PathBuf {
+    pub(crate) fn key_headword_path(&self) -> PathBuf {
         let mut pb = self.key_path();
         pb.push("headword.keystore");
         pb
     }
+
+    pub(crate) fn headline_path(&self) -> PathBuf {
+        let mut pb = PathBuf::from(&self.base_path);
+        pb.push("Contents");
+        pb.push(&self.contents_dir);
+        pb.push("headline");
+        pb
+    }
+
+    pub(crate) fn headline_long_path(&self) -> PathBuf {
+        let mut pb = self.headline_path();
+        pb.push("headline.headlinestore");
+        pb
+    }
 }

 fn parse_dict_name(fname: &OsStr) -> Option<&str> {
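The two new helpers mirror `key_headword_path` but point at the `headline` directory under `Contents`. For a hypothetical dictionary (the base path and contents directory names below are made up), `headline_long_path()` resolves like this:

```rust
use std::path::PathBuf;

fn main() {
    // Hypothetical install; real values come from the dictionary on disk.
    let base_path = "/path/to/NHKACCENT2";
    let contents_dir = "NHKACCENT2_Contents";

    let mut pb = PathBuf::from(base_path);
    pb.push("Contents");
    pb.push(contents_dir);
    pb.push("headline");
    pb.push("headline.headlinestore");

    // Prints: /path/to/NHKACCENT2/Contents/NHKACCENT2_Contents/headline/headline.headlinestore
    println!("{}", pb.display());
}
```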
src/headline.rs (93 lines, new file)
@@ -0,0 +1,93 @@
+use std::{
+    fs::File,
+    io::{Read, Seek},
+};
+
+use crate::{
+    abi_utils::{TransmuteSafe, LE32, read_vec},
+    dict::Paths,
+    Error, PageItemId,
+};
+
+mod abi {
+    use super::*;
+
+    #[repr(C)]
+    #[derive(Debug, Clone, Copy, Default)]
+    pub(super) struct FileHeader {
+        magic1: LE32,
+        magic2: LE32,
+        pub len: LE32,
+        pub rec_offset: LE32,
+        pub words_offset: LE32,
+        rec_bytes: LE32,
+        magic4: LE32,
+        magic5: LE32,
+    }
+
+    impl FileHeader {
+        pub(super) fn validate(&self) -> Result<(), Error> {
+            if self.magic1.read() == 0
+                && self.magic2.read() == 0x2
+                && self.rec_bytes.read() == 0x18
+                && self.magic4.read() == 0
+                && self.magic5.read() == 0
+            {
+                Ok(())
+            } else {
+                Err(Error::KeyFileHeaderValidate)
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug, Clone, Copy, Default)]
+    pub(super) struct Offset {
+        pub page_id: LE32,
+        pub item_id: u8,
+        pub item_type: u8,
+        magic1: u16,
+        pub offset: LE32,
+        magic2: LE32,
+        magic3: LE32,
+        magic4: LE32,
+    }
+
+    unsafe impl TransmuteSafe for FileHeader {}
+    unsafe impl TransmuteSafe for Offset {}
+}
+use abi::{FileHeader, Offset};
+
+pub struct Headlines {
+    recs: Vec<Offset>,
+    words: Vec<u8>,
+}
+
+impl Headlines {
+    pub fn new(paths: &Paths) -> Result<Headlines, Error> {
+        let mut file = File::open(paths.headline_long_path())?;
+        let file_size = file.metadata()?.len() as usize;
+        let mut hdr = FileHeader::default();
+        file.read_exact(hdr.as_bytes_mut())?;
+        hdr.validate()?;
+
+        file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
+        let offsets: Option<Vec<Offset>> = read_vec(&mut file, hdr.rec_offset.us(), hdr.words_offset.us())?;
+        let Some(recs) = offsets else { return Err(Error::InvalidIndex); };
+
+        let words: Option<Vec<u8>> = read_vec(&mut file, hdr.words_offset.us(), file_size)?;
+        let Some(words) = words else { return Err(Error::InvalidIndex); };
+
+        Ok(Headlines {
+            recs,
+            words,
+        })
+    }
+
+    pub fn get(&self, id: PageItemId) -> Result<String, Error> {
+        let rec = self.recs.binary_search_by(|rec|
+            rec.page_id.read().cmp(&id.page).then(rec.item_id.cmp(&id.item))
+        ).map_err(|_| Error::InvalidIndex)?;
+        todo!();
+    }
+}
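The `rec_bytes == 0x18` check matches the 24-byte `#[repr(C)]` layout of `Offset` (4 + 1 + 1 + 2 + 4 + 4 + 4 + 4). `Headlines::get` then locates a record by binary search on `(page_id, item_id)`, chaining the two comparisons with `Ordering::then`; what to do with the found record is still `todo!()`, though presumably `offset` indexes into `words`. A standalone sketch of that ordering, with plain tuples standing in for the crate's `Offset` and `PageItemId` types:

```rust
// Standalone sketch of the (page_id, item_id) lookup used by Headlines::get;
// the tuple type here is a stand-in, not the crate's abi::Offset.
fn main() {
    // Records must be sorted by (page_id, item_id) for binary search to work.
    let recs: &[(u32, u8)] = &[(1, 0), (1, 1), (2, 0), (5, 3)];
    let target = (2u32, 0u8);

    let idx = recs
        .binary_search_by(|&(page, item)| {
            // Compare by page first, then by item within the page, exactly like
            // rec.page_id.read().cmp(&id.page).then(rec.item_id.cmp(&id.item)).
            page.cmp(&target.0).then(item.cmp(&target.1))
        })
        .expect("record not found");
    assert_eq!(idx, 2);
}
```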
src/key.rs (33 changed lines)
@@ -8,7 +8,7 @@ use std::{
 };

 use crate::{
-    abi_utils::{TransmuteSafe, LE32},
+    abi_utils::{TransmuteSafe, LE32, read_vec},
     dict::Paths,
     Error,
 };
@@ -59,7 +59,7 @@ mod abi {
 }

 impl IndexHeader {
-    pub(super) fn validate(&self, file_end: usize) -> Result<(), Error> {
+    pub(super) fn validate(&self, idx_end: usize) -> Result<(), Error> {
         let a = self.index_a_offset.us();
         let b = self.index_b_offset.us();
         let c = self.index_c_offset.us();
@@ -69,7 +69,7 @@ mod abi {
             && check_order(a, b)
             && check_order(b, c)
             && check_order(c, d)
-            && check_order(d, file_end)
+            && check_order(d, idx_end)
         {
             Ok(())
         } else {
@@ -111,17 +111,6 @@ impl KeyIndex {
 }

 impl Keys {
-    fn read_vec(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<LE32>>, Error> {
-        if start == 0 || end == 0 {
-            return Ok(None);
-        }
-        // Replace this with div_ceil once it stabilizes
-        let size = (end - start + size_of::<LE32>() - 1) / size_of::<LE32>();
-        let mut buf = vec![LE32::default(); size];
-        file.read_exact(LE32::slice_as_bytes_mut(&mut buf))?;
-        Ok(Some(buf))
-    }
-
     fn check_vec_len(buf: &Option<Vec<LE32>>) -> Result<(), Error> {
         let Some(buf) = buf else { return Ok(()) };
         if buf.get(0).ok_or(Error::InvalidIndex)?.us() + 1 != buf.len() {
@@ -131,44 +120,44 @@ impl Keys {
     }

     pub fn new(paths: &Paths) -> Result<Keys, Error> {
-        let mut file = File::open(paths.headword_key_path())?;
+        let mut file = File::open(paths.key_headword_path())?;
         let file_size = file.metadata()?.len() as usize;
         let mut hdr = FileHeader::default();
         file.read_exact(hdr.as_bytes_mut())?;
         hdr.validate()?;

         file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
-        let words = Self::read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
+        let words = read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
         let Some(words) = words else { return Err(Error::InvalidIndex); };

-        let file_end = file_size - hdr.idx_offset.us();
+        let idx_end = file_size - hdr.idx_offset.us();
         let mut ihdr = IndexHeader::default();
         file.seek(std::io::SeekFrom::Start(hdr.idx_offset.read() as u64))?;
         file.read_exact(ihdr.as_bytes_mut())?;
-        ihdr.validate(file_end)?;
+        ihdr.validate(idx_end)?;

-        let index_a = Self::read_vec(
+        let index_a = read_vec(
             &mut file,
             ihdr.index_a_offset.us(),
             ihdr.index_b_offset.us(),
         )?;
         Self::check_vec_len(&index_a)?;

-        let index_b = Self::read_vec(
+        let index_b = read_vec(
             &mut file,
             ihdr.index_b_offset.us(),
             ihdr.index_c_offset.us(),
         )?;
         Self::check_vec_len(&index_b)?;

-        let index_c = Self::read_vec(
+        let index_c = read_vec(
             &mut file,
             ihdr.index_c_offset.us(),
             ihdr.index_d_offset.us(),
         )?;
         Self::check_vec_len(&index_c)?;

-        let index_d = Self::read_vec(&mut file, ihdr.index_d_offset.us(), file_end)?;
+        let index_d = read_vec(&mut file, ihdr.index_d_offset.us(), idx_end)?;
         Self::check_vec_len(&index_d)?;

         Ok(Keys {
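With `read_vec` moved to `abi_utils`, `Keys` keeps only `check_vec_len`, which enforces `buf[0] + 1 == buf.len()`; that suggests each optional index section stores its own entry count in the first `LE32`. A toy illustration of the invariant (plain `u32` standing in for `LE32`):

```rust
// Toy illustration of the length-prefixed index layout that check_vec_len
// expects: element 0 holds the number of entries that follow it.
fn check_len(buf: &[u32]) -> bool {
    match buf.first() {
        Some(&n) => (n as usize) + 1 == buf.len(),
        None => false,
    }
}

fn main() {
    assert!(check_len(&[3, 10, 20, 30])); // 3 entries follow the count
    assert!(!check_len(&[3, 10, 20]));    // count does not match
}
```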
src/lib.rs
@@ -5,9 +5,11 @@ mod error;
 mod key;
 mod pages;
 mod resource;
+mod headline;

 pub use audio::Audio;
 pub use dict::MonokakidoDict;
 pub use error::Error;
 pub use key::{KeyIndex, Keys, PageItemId};
 pub use pages::{Pages, XmlParser};
+pub use headline::{Headlines};