Start implementing headline support.
This commit is contained in:
parent
4b70b0708d
commit
09915cabc5
32
README.md
32
README.md
|
@ -3,12 +3,38 @@
|
|||
A Rust library for parsing and interpreting the [Monokakido](https://www.monokakido.jp/en/dictionaries/app/) dictionary format. Aiming for full test coverage and an efficient implementation with minimal dependencies.
|
||||
|
||||
## TODO:
|
||||
- Refactor code for generic "rsc" and "nrsc" support
|
||||
- Add headline support
|
||||
- Refactor as a workspace to separate the dependencies of the library and the binaries
|
||||
- Move to mmap-based indexes
|
||||
- Add graphics support
|
||||
- Add TTY detection to CLI (prevent binary output to shell)
|
||||
- Add proper argument parser lib to CLI
|
||||
- Refine CLI according to the plan below
|
||||
- Document the rsc, nrsc, keystore and headline formats
|
||||
### Test:
|
||||
- Audio using "rsc" (CCCAD, WISDOM3)
|
||||
- Audio using "nrsc" (DAIJISEN2, NHKACCENT2, OALD10, OLDAE, OLEX, OLT, RHEJ, SMK8)
|
||||
- Multiple contents (WISDOM3, OLEX)
|
||||
- Document the rsc, nrsc and keystore formats
|
||||
- Split main.rs into "dict exploder" and "dict cli"
|
||||
|
||||
|
||||
## CLI (Planned)
|
||||
|
||||
### Tab-separated output formats:
|
||||
- keyword
|
||||
- headline
|
||||
- iid (item id)
|
||||
- pid (page id)
|
||||
- aid (audio id)
|
||||
- gid (graphics id)
|
||||
|
||||
### \n\n separated output formats:
|
||||
- item
|
||||
- page
|
||||
|
||||
### binary output formats:
|
||||
- audio
|
||||
- graphics
|
||||
|
||||
|
||||
## Planned to support:
|
||||
- WISDOM3
|
||||
|
|
|
@ -2,6 +2,7 @@ use core::{
|
|||
mem::{align_of, size_of},
|
||||
slice,
|
||||
};
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use crate::Error;
|
||||
|
||||
|
@ -34,6 +35,7 @@ impl From<u32> for LE32 {
|
|||
}
|
||||
|
||||
unsafe impl TransmuteSafe for LE32 {}
|
||||
unsafe impl TransmuteSafe for u8 {}
|
||||
|
||||
pub(crate) unsafe trait TransmuteSafe: Default + Clone {
|
||||
fn from_buf(buf: &[u8]) -> Result<(&Self, &[u8]), Error> {
|
||||
|
@ -83,3 +85,14 @@ pub(crate) unsafe trait TransmuteSafe: Default + Clone {
|
|||
Self::slice_as_bytes(slice::from_ref(self))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_vec<T: TransmuteSafe>(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<T>>, Error> {
|
||||
if start == 0 || end == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
// Replace this with div_ceil once it stabilizes
|
||||
let size = (end - start + size_of::<T>() - 1) / size_of::<T>();
|
||||
let mut buf = vec![T::default(); size];
|
||||
file.read_exact(T::slice_as_bytes_mut(&mut buf))?;
|
||||
Ok(Some(buf))
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ fn print_help() {
|
|||
println!("list - lists all dictionaries installed in the standard path");
|
||||
println!("list_items {{dict}} {{keyword}} - lists all items");
|
||||
println!("list_audio {{dict}} {{keyword}} - lists all audio files");
|
||||
println!("get_audio {{dict}} {{id}} - writes an audio file to stdout");
|
||||
println!("help - this help");
|
||||
}
|
||||
|
||||
|
|
16
src/dict.rs
16
src/dict.rs
|
@ -67,11 +67,25 @@ impl Paths {
|
|||
pb
|
||||
}
|
||||
|
||||
pub(crate) fn headword_key_path(&self) -> PathBuf {
|
||||
pub(crate) fn key_headword_path(&self) -> PathBuf {
|
||||
let mut pb = self.key_path();
|
||||
pb.push("headword.keystore");
|
||||
pb
|
||||
}
|
||||
|
||||
pub(crate) fn headline_path(&self) -> PathBuf {
|
||||
let mut pb = PathBuf::from(&self.base_path);
|
||||
pb.push("Contents");
|
||||
pb.push(&self.contents_dir);
|
||||
pb.push("headline");
|
||||
pb
|
||||
}
|
||||
|
||||
pub(crate) fn headline_long_path(&self) -> PathBuf {
|
||||
let mut pb = self.headline_path();
|
||||
pb.push("headline.headlinestore");
|
||||
pb
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_dict_name(fname: &OsStr) -> Option<&str> {
|
||||
|
|
93
src/headline.rs
Normal file
93
src/headline.rs
Normal file
|
@ -0,0 +1,93 @@
|
|||
use std::{
|
||||
fs::File,
|
||||
io::{Read, Seek},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
abi_utils::{TransmuteSafe, LE32, read_vec},
|
||||
dict::Paths,
|
||||
Error, PageItemId,
|
||||
};
|
||||
|
||||
mod abi {
|
||||
use super::*;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(super) struct FileHeader {
|
||||
magic1: LE32,
|
||||
magic2: LE32,
|
||||
pub len: LE32,
|
||||
pub rec_offset: LE32,
|
||||
pub words_offset: LE32,
|
||||
rec_bytes: LE32,
|
||||
magic4: LE32,
|
||||
magic5: LE32,
|
||||
}
|
||||
|
||||
impl FileHeader {
|
||||
pub(super) fn validate(&self) -> Result<(), Error> {
|
||||
if self.magic1.read() == 0
|
||||
&& self.magic2.read() == 0x2
|
||||
&& self.rec_bytes.read() == 0x18
|
||||
&& self.magic4.read() == 0
|
||||
&& self.magic5.read() == 0
|
||||
{
|
||||
Ok(())
|
||||
} else {
|
||||
Err(Error::KeyFileHeaderValidate)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(super) struct Offset {
|
||||
pub page_id: LE32,
|
||||
pub item_id: u8,
|
||||
pub item_type: u8,
|
||||
magic1: u16,
|
||||
pub offset: LE32,
|
||||
magic2: LE32,
|
||||
magic3: LE32,
|
||||
magic4: LE32,
|
||||
}
|
||||
|
||||
unsafe impl TransmuteSafe for FileHeader {}
|
||||
unsafe impl TransmuteSafe for Offset {}
|
||||
}
|
||||
use abi::{FileHeader, Offset};
|
||||
|
||||
pub struct Headlines {
|
||||
recs: Vec<Offset>,
|
||||
words: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Headlines {
|
||||
pub fn new(paths: &Paths) -> Result<Headlines, Error> {
|
||||
let mut file = File::open(paths.headline_long_path())?;
|
||||
let file_size = file.metadata()?.len() as usize;
|
||||
let mut hdr = FileHeader::default();
|
||||
file.read_exact(hdr.as_bytes_mut())?;
|
||||
hdr.validate()?;
|
||||
|
||||
file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
|
||||
let offsets: Option<Vec<Offset>> = read_vec(&mut file, hdr.rec_offset.us(), hdr.words_offset.us())?;
|
||||
let Some(recs) = offsets else { return Err(Error::InvalidIndex); };
|
||||
|
||||
let words: Option<Vec<u8>> = read_vec(&mut file, hdr.words_offset.us(), file_size)?;
|
||||
let Some(words) = words else { return Err(Error::InvalidIndex); };
|
||||
|
||||
Ok(Headlines {
|
||||
recs,
|
||||
words,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get(&self, id: PageItemId) -> Result<String, Error> {
|
||||
let rec = self.recs.binary_search_by(|rec|
|
||||
rec.page_id.read().cmp(&id.page).then(rec.item_id.cmp(&id.item))
|
||||
).map_err(|_| Error::InvalidIndex)?;
|
||||
todo!();
|
||||
}
|
||||
}
|
33
src/key.rs
33
src/key.rs
|
@ -8,7 +8,7 @@ use std::{
|
|||
};
|
||||
|
||||
use crate::{
|
||||
abi_utils::{TransmuteSafe, LE32},
|
||||
abi_utils::{TransmuteSafe, LE32, read_vec},
|
||||
dict::Paths,
|
||||
Error,
|
||||
};
|
||||
|
@ -59,7 +59,7 @@ mod abi {
|
|||
}
|
||||
|
||||
impl IndexHeader {
|
||||
pub(super) fn validate(&self, file_end: usize) -> Result<(), Error> {
|
||||
pub(super) fn validate(&self, idx_end: usize) -> Result<(), Error> {
|
||||
let a = self.index_a_offset.us();
|
||||
let b = self.index_b_offset.us();
|
||||
let c = self.index_c_offset.us();
|
||||
|
@ -69,7 +69,7 @@ mod abi {
|
|||
&& check_order(a, b)
|
||||
&& check_order(b, c)
|
||||
&& check_order(c, d)
|
||||
&& check_order(d, file_end)
|
||||
&& check_order(d, idx_end)
|
||||
{
|
||||
Ok(())
|
||||
} else {
|
||||
|
@ -111,17 +111,6 @@ impl KeyIndex {
|
|||
}
|
||||
|
||||
impl Keys {
|
||||
fn read_vec(file: &mut File, start: usize, end: usize) -> Result<Option<Vec<LE32>>, Error> {
|
||||
if start == 0 || end == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
// Replace this with div_ceil once it stabilizes
|
||||
let size = (end - start + size_of::<LE32>() - 1) / size_of::<LE32>();
|
||||
let mut buf = vec![LE32::default(); size];
|
||||
file.read_exact(LE32::slice_as_bytes_mut(&mut buf))?;
|
||||
Ok(Some(buf))
|
||||
}
|
||||
|
||||
fn check_vec_len(buf: &Option<Vec<LE32>>) -> Result<(), Error> {
|
||||
let Some(buf) = buf else { return Ok(()) };
|
||||
if buf.get(0).ok_or(Error::InvalidIndex)?.us() + 1 != buf.len() {
|
||||
|
@ -131,44 +120,44 @@ impl Keys {
|
|||
}
|
||||
|
||||
pub fn new(paths: &Paths) -> Result<Keys, Error> {
|
||||
let mut file = File::open(paths.headword_key_path())?;
|
||||
let mut file = File::open(paths.key_headword_path())?;
|
||||
let file_size = file.metadata()?.len() as usize;
|
||||
let mut hdr = FileHeader::default();
|
||||
file.read_exact(hdr.as_bytes_mut())?;
|
||||
hdr.validate()?;
|
||||
|
||||
file.seek(std::io::SeekFrom::Start(hdr.words_offset.read() as u64))?;
|
||||
let words = Self::read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
|
||||
let words = read_vec(&mut file, hdr.words_offset.us(), hdr.idx_offset.us())?;
|
||||
let Some(words) = words else { return Err(Error::InvalidIndex); };
|
||||
|
||||
let file_end = file_size - hdr.idx_offset.us();
|
||||
let idx_end = file_size - hdr.idx_offset.us();
|
||||
let mut ihdr = IndexHeader::default();
|
||||
file.seek(std::io::SeekFrom::Start(hdr.idx_offset.read() as u64))?;
|
||||
file.read_exact(ihdr.as_bytes_mut())?;
|
||||
ihdr.validate(file_end)?;
|
||||
ihdr.validate(idx_end)?;
|
||||
|
||||
let index_a = Self::read_vec(
|
||||
let index_a = read_vec(
|
||||
&mut file,
|
||||
ihdr.index_a_offset.us(),
|
||||
ihdr.index_b_offset.us(),
|
||||
)?;
|
||||
Self::check_vec_len(&index_a)?;
|
||||
|
||||
let index_b = Self::read_vec(
|
||||
let index_b = read_vec(
|
||||
&mut file,
|
||||
ihdr.index_b_offset.us(),
|
||||
ihdr.index_c_offset.us(),
|
||||
)?;
|
||||
Self::check_vec_len(&index_b)?;
|
||||
|
||||
let index_c = Self::read_vec(
|
||||
let index_c = read_vec(
|
||||
&mut file,
|
||||
ihdr.index_c_offset.us(),
|
||||
ihdr.index_d_offset.us(),
|
||||
)?;
|
||||
Self::check_vec_len(&index_c)?;
|
||||
|
||||
let index_d = Self::read_vec(&mut file, ihdr.index_d_offset.us(), file_end)?;
|
||||
let index_d = read_vec(&mut file, ihdr.index_d_offset.us(), idx_end)?;
|
||||
Self::check_vec_len(&index_d)?;
|
||||
|
||||
Ok(Keys {
|
||||
|
|
|
@ -5,9 +5,11 @@ mod error;
|
|||
mod key;
|
||||
mod pages;
|
||||
mod resource;
|
||||
mod headline;
|
||||
|
||||
pub use audio::Audio;
|
||||
pub use dict::MonokakidoDict;
|
||||
pub use error::Error;
|
||||
pub use key::{KeyIndex, Keys, PageItemId};
|
||||
pub use pages::{Pages, XmlParser};
|
||||
pub use headline::{Headlines};
|
||||
|
|
Loading…
Reference in a new issue