Add export support for the MDict dictionary format

This commit is contained in:
stephenmk 2023-07-08 16:49:03 -05:00
parent e4a2e75d82
commit 4c837cd72d
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
53 changed files with 2227 additions and 269 deletions

1
.gitignore vendored
View file

@ -1,6 +1,7 @@
webcache/ webcache/
output/ output/
notes/ notes/
monokakido/
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

View file

@ -49,7 +49,8 @@ compiling the scraped data into compact dictionary file formats.
# Usage # Usage
``` ```
usage: jitenbot [-h] [-p PAGE_DIR] [-i IMAGE_DIR] usage: jitenbot [-h] [-p PAGE_DIR] [-m MEDIA_DIR] [-i MDICT_ICON]
[--no-yomichan-export] [--no-mdict-export]
{jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2} {jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2}
Convert Japanese dictionary files to new formats. Convert Japanese dictionary files to new formats.
@ -62,9 +63,15 @@ options:
-h, --help show this help message and exit -h, --help show this help message and exit
-p PAGE_DIR, --page-dir PAGE_DIR -p PAGE_DIR, --page-dir PAGE_DIR
path to directory containing XML page files path to directory containing XML page files
-i IMAGE_DIR, --image-dir IMAGE_DIR -m MEDIA_DIR, --media-dir MEDIA_DIR
path to directory containing image folders (gaiji, path to directory containing media folders (gaiji,
graphics, etc.) graphics, audio, etc.)
-i MDICT_ICON, --mdict-icon MDICT_ICON
path to icon file to be used with MDict
--no-yomichan-export skip export of dictionary data to Yomichan format
--no-mdict-export skip export of dictionary data to MDict format
See README.md for details regarding media directory structures
``` ```
### Online Targets ### Online Targets
Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/). Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/).
@ -75,8 +82,55 @@ HTTP request headers (user agent string, etc.) may be customized by editing the
[user config directory](https://pypi.org/project/platformdirs/). [user config directory](https://pypi.org/project/platformdirs/).
### Offline Targets ### Offline Targets
Page data and image data must be procured by the user Page data and media data must be [procured by the user](https://github.com/golddranks/monokakido/)
and passed to jitenbot via the appropriate command line flags. and passed to jitenbot via the appropriate command line flags.
<details>
<summary>smk8 media directory</summary>
Since Yomichan does not support audio files from imported
dictionaries, the `audio/` directory may be omitted to reduce the
size of the output ZIP file if desired.
```
media
├── Audio.png
├── audio
│   ├── 00001.aac
│   ├── 00002.aac
│   ├── 00003.aac
│   │  ...
│   └── 82682.aac
└── gaiji
    ├── 1d110.svg
    ├── 1d15d.svg
    ├── 1d15e.svg
    │  ...
    └── xbunnoa.svg
```
</details>
<details>
<summary>daijirin2 media directory</summary>
The `graphics/` directory may be omitted to save space if desired.
```
media
├── gaiji
│   ├── 1D10B.svg
│   ├── 1D110.svg
│   ├── 1D12A.svg
│   │  ...
│   └── vectorOB.svg
└── graphics
    ├── 3djr_0002.png
    ├── 3djr_0004.png
    ├── 3djr_0005.png
    │  ...
    └── 4djr_yahazu.png
```
</details>
# Attribution # Attribution
`Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1). `Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1).

View file

@ -1,11 +1,11 @@
### Todo ### Todo
- [x] Add factory classes to reduce the amount of class import statements
- [x] Support exporting to MDict (.MDX) dictionary format
- [ ] Add test suite - [ ] Add test suite
- [ ] Add documentation (docstrings, etc.) - [ ] Add documentation (docstrings, etc.)
- [ ] Validate JSON schema of Yomichan terms during export - [ ] Validate JSON schema of Yomichan terms during export
- [ ] Add factory classes to reduce the amount of class import statements
- [ ] Add build scripts for producing program binaries - [ ] Add build scripts for producing program binaries
- [ ] Support exporting to MDict (.MDX) dictionary format
- [ ] Validate scraped webpages after downloading - [ ] Validate scraped webpages after downloading
- [ ] Log non-fatal failures to a log file instead of raising exceptions - [ ] Log non-fatal failures to a log file instead of raising exceptions
- [ ] Support more dictionary websites - [ ] Support more dictionary websites

View file

@ -5,7 +5,8 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper import bot.scraper as Scraper
from bot.entries.factory import new_entry from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_exporter from bot.yomichan.exporters.factory import new_yomi_exporter
from bot.mdict.exporters.factory import new_mdict_exporter
class Crawler(ABC): class Crawler(ABC):
@ -38,9 +39,13 @@ class Crawler(ABC):
self._entries.append(entry) self._entries.append(entry)
print() print()
def make_yomichan_dictionary(self, image_dir): def make_yomichan_dictionary(self, media_dir):
exporter = new_exporter(self._target) exporter = new_yomi_exporter(self._target)
exporter.export(self._entries, image_dir) exporter.export(self._entries, media_dir)
def make_mdict_dictionary(self, media_dir, icon_file):
exporter = new_mdict_exporter(self._target)
exporter.export(self._entries, media_dir, icon_file)
def _parse_page_id(self, page_link): def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link) m = re.search(self._page_id_pattern, page_link)
@ -142,10 +147,8 @@ class _MonokakidoCrawler(Crawler):
class Smk8Crawler(_MonokakidoCrawler): class Smk8Crawler(_MonokakidoCrawler):
def __init__(self, target): pass
super().__init__(target)
class Daijirin2Crawler(_MonokakidoCrawler): class Daijirin2Crawler(_MonokakidoCrawler):
def __init__(self, target): pass
super().__init__(target)

View file

@ -99,15 +99,15 @@ def load_daijirin2_kana_abbreviations():
@cache @cache
def load_smk8_yomichan_name_conversion(): def load_yomichan_name_conversion(target):
file_name = os.path.join("smk8", "yomichan_name_conversion.json") file_name = os.path.join(target.value, "yomichan_name_conversion.json")
data = __load_json(file_name) data = __load_json(file_name)
return data return data
@cache @cache
def load_daijirin2_yomichan_name_conversion(): def load_mdict_name_conversion(target):
file_name = os.path.join("daijirin2", "yomichan_name_conversion.json") file_name = os.path.join(target.value, "mdict_name_conversion.json")
data = __load_json(file_name) data = __load_json(file_name)
return data return data

View file

@ -1,4 +1,3 @@
import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions import bot.entries.expressions as Expressions
@ -10,19 +9,17 @@ from bot.entries.daijirin2_preprocess import preprocess_page
class _BaseDaijirin2Entry(Entry): class _BaseDaijirin2Entry(Entry):
ID_TO_ENTRY = {} def __init__(self, target, entry_id):
SUBENTRY_ID_TO_ENTRY_ID = {} super().__init__(target, entry_id)
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.children = [] self.children = []
self.phrases = [] self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations() self._kana_abbreviations = load_daijirin2_kana_abbreviations()
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page): def set_page(self, page):
page = self.__decompose_subentries(page) page = self.__decompose_subentries(page)
self._page = page self._page = page
@ -57,14 +54,7 @@ class _BaseDaijirin2Entry(Entry):
else: else:
self._part_of_speech_tags.append(pos) self._part_of_speech_tags.append(pos)
def get_headwords(self): def _get_regular_headwords(self, soup):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def _set_regular_headwords(self, soup):
self._fill_alts(soup) self._fill_alts(soup)
reading = soup.find("見出仮名").text reading = soup.find("見出仮名").text
expressions = [] expressions = []
@ -78,10 +68,11 @@ class _BaseDaijirin2Entry(Entry):
expressions = Expressions.expand_abbreviation_list(expressions) expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0: if len(expressions) == 0:
expressions.append(reading) expressions.append(reading)
self._headwords = {reading: expressions} headwords = {reading: expressions}
return headwords
def _set_variant_headwords(self): def _add_variant_expressions(self, headwords):
for expressions in self._headwords.values(): for expressions in headwords.values():
Expressions.add_variant_kanji(expressions) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)
@ -101,7 +92,7 @@ class _BaseDaijirin2Entry(Entry):
tag_soup.name = "項目" tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id) subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode() page = tag_soup.decode()
subentry.set_page(page) subentry.set_page(page)
subentry_list.append(subentry) subentry_list.append(subentry)
@ -122,6 +113,8 @@ class _BaseDaijirin2Entry(Entry):
@staticmethod @staticmethod
def _delete_unused_nodes(soup): def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [ unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成", "漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG" "表外字マーク", "表外字マーク", "ルビG"
@ -144,25 +137,26 @@ class _BaseDaijirin2Entry(Entry):
class Daijirin2Entry(_BaseDaijirin2Entry): class Daijirin2Entry(_BaseDaijirin2Entry):
def __init__(self, page_id): def __init__(self, target, page_id):
entry_id = (page_id, 0) entry_id = (page_id, 0)
super().__init__(entry_id) super().__init__(target, entry_id)
def set_page(self, page): def set_page(self, page):
page = preprocess_page(page) page = preprocess_page(page)
super().set_page(page) super().set_page(page)
def _set_headwords(self): def _get_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
self._delete_unused_nodes(soup) self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None: if soup.find("漢字見出") is not None:
self._set_kanji_headwords(soup) headwords = self._get_kanji_headwords(soup)
elif soup.find("略語G") is not None: elif soup.find("略語G") is not None:
self._set_acronym_headwords(soup) headwords = self._get_acronym_headwords(soup)
else: else:
self._set_regular_headwords(soup) headwords = self._get_regular_headwords(soup)
return headwords
def _set_kanji_headwords(self, soup): def _get_kanji_headwords(self, soup):
readings = [] readings = []
for el in soup.find_all("漢字音"): for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text) hira = Expressions.kata_to_hira(el.text)
@ -172,11 +166,12 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expressions = [] expressions = []
for el in soup.find_all("漢字見出"): for el in soup.find_all("漢字見出"):
expressions.append(el.text) expressions.append(el.text)
self._headwords = {} headwords = {}
for reading in readings: for reading in readings:
self._headwords[reading] = expressions headwords[reading] = expressions
return headwords
def _set_acronym_headwords(self, soup): def _get_acronym_headwords(self, soup):
expressions = [] expressions = []
for el in soup.find_all("略語"): for el in soup.find_all("略語"):
expression_parts = [] expression_parts = []
@ -184,29 +179,24 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expression_parts.append(part.text) expression_parts.append(part.text)
expression = "".join(expression_parts) expression = "".join(expression_parts)
expressions.append(expression) expressions.append(expression)
self._headwords = {"": expressions} headwords = {"": expressions}
return headwords
class Daijirin2ChildEntry(_BaseDaijirin2Entry): class Daijirin2ChildEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id): def _get_headwords(self):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
self._delete_unused_nodes(soup) self._delete_unused_nodes(soup)
self._set_regular_headwords(soup) headwords = self._get_regular_headwords(soup)
return headwords
class Daijirin2PhraseEntry(_BaseDaijirin2Entry): class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
self.__phrase_readings = load_daijirin2_phrase_readings()
def get_part_of_speech_tags(self): def get_part_of_speech_tags(self):
# phrases do not contain these tags # phrases do not contain these tags
return [] return []
def _set_headwords(self): def _get_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
headwords = {} headwords = {}
expressions = self._find_expressions(soup) expressions = self._find_expressions(soup)
@ -217,7 +207,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
headwords[reading].append(expression) headwords[reading].append(expression)
else: else:
headwords[reading] = [expression] headwords[reading] = [expression]
self._headwords = headwords return headwords
def _find_expressions(self, soup): def _find_expressions(self, soup):
self._delete_unused_nodes(soup) self._delete_unused_nodes(soup)
@ -231,7 +221,8 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
return expressions return expressions
def _find_readings(self): def _find_readings(self):
text = self.__phrase_readings[self.entry_id] phrase_readings = load_daijirin2_phrase_readings()
text = phrase_readings[self.entry_id]
alternatives = Expressions.expand_daijirin_alternatives(text) alternatives = Expressions.expand_daijirin_alternatives(text)
readings = [] readings = []
for alt in alternatives: for alt in alternatives:

View file

@ -2,12 +2,24 @@ from abc import ABC, abstractmethod
class Entry(ABC): class Entry(ABC):
def __init__(self, entry_id): ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, target, entry_id):
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.target = target
self.entry_id = entry_id self.entry_id = entry_id
self._page = None self._page = None
self._headwords = None self._headwords = None
self._part_of_speech_tags = None self._part_of_speech_tags = None
@abstractmethod
def get_global_identifier(self):
pass
@abstractmethod @abstractmethod
def set_page(self, page): def set_page(self, page):
pass pass
@ -16,14 +28,34 @@ class Entry(ABC):
def get_page_soup(self): def get_page_soup(self):
pass pass
@abstractmethod
def get_headwords(self): def get_headwords(self):
if self._headwords is not None:
return self._headwords
headwords = self._get_headwords()
self._add_variant_expressions(headwords)
self._headwords = headwords
return headwords
@abstractmethod
def _get_headwords(self):
pass
@abstractmethod
def _add_variant_expressions(self, headwords):
pass pass
@abstractmethod @abstractmethod
def get_part_of_speech_tags(self): def get_part_of_speech_tags(self):
pass pass
def get_parent(self):
if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
else:
parent = None
return parent
def get_first_expression(self): def get_first_expression(self):
headwords = self.get_headwords() headwords = self.get_headwords()
expressions = next(iter(headwords.values())) expressions = next(iter(headwords.values()))

View file

@ -15,4 +15,4 @@ def new_entry(target, page_id):
Targets.SMK8: Smk8Entry, Targets.SMK8: Smk8Entry,
Targets.DAIJIRIN2: Daijirin2Entry, Targets.DAIJIRIN2: Daijirin2Entry,
} }
return entry_map[target](page_id) return entry_map[target](target, page_id)

View file

@ -1,4 +1,5 @@
import re import re
from abc import abstractmethod
from datetime import datetime, date from datetime import datetime, date
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -7,18 +8,17 @@ import bot.entries.expressions as Expressions
class _JitenonEntry(Entry): class _JitenonEntry(Entry):
ID_TO_ENTRY = {} def __init__(self, target, entry_id):
super().__init__(target, entry_id)
def __init__(self, entry_id): self.expression = ""
super().__init__(entry_id) self.yomikata = ""
if entry_id not in self.ID_TO_ENTRY: self.definition = ""
self.ID_TO_ENTRY[entry_id] = self self.other_forms = []
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.modified_date = date(1970, 1, 1) self.modified_date = date(1970, 1, 1)
self.attribution = "" self.attribution = ""
for column in self._COLUMNS.values():
setattr(self, column[0], column[1]) def get_global_identifier(self):
return f"@{self.target.value}-{format(self.entry_id, '06')}"
def set_page(self, page): def set_page(self, page):
soup = BeautifulSoup(page, features="html5lib") soup = BeautifulSoup(page, features="html5lib")
@ -39,36 +39,33 @@ class _JitenonEntry(Entry):
soup = BeautifulSoup(self._page, "html5lib") soup = BeautifulSoup(self._page, "html5lib")
return soup return soup
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_part_of_speech_tags(self): def get_part_of_speech_tags(self):
# Jitenon doesn't have any # Jitenon doesn't have any
return [] return []
def _set_headwords(self): def _get_headwords(self):
headwords = {} headwords = {}
for yomikata in self._yomikatas(): for reading in self._get_readings():
headwords[yomikata] = [self.expression] headwords[reading] = [self.expression]
ikei_headwords = self._ikei_headwords() other_form_headwords = self._other_form_headwords()
for reading, expressions in ikei_headwords.items(): for reading, expressions in other_form_headwords.items():
if reading not in headwords: if reading not in headwords:
headwords[reading] = [] headwords[reading] = []
for expression in expressions: for expression in expressions:
if expression not in headwords[reading]: if expression not in headwords[reading]:
headwords[reading].append(expression) headwords[reading].append(expression)
self._headwords = headwords return headwords
@abstractmethod
def _get_column_map(self):
pass
def __set_modified_date(self, page): def __set_modified_date(self, page):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page) m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
if m is None: if m is None:
return return
date = datetime.strptime(m.group(1), '%Y-%m-%d').date() modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date self.modified_date = modified_date
def __set_attribution(self, soup): def __set_attribution(self, soup):
attribution = soup.find(class_="copyright") attribution = soup.find(class_="copyright")
@ -78,7 +75,8 @@ class _JitenonEntry(Entry):
self.attribution = "" self.attribution = ""
def __set_column(self, colname, colval): def __set_column(self, colname, colval):
attr_name = self._COLUMNS[colname][0] column_map = self._get_column_map()
attr_name = column_map[colname]
attr_value = getattr(self, attr_name) attr_value = getattr(self, attr_name)
if isinstance(attr_value, str): if isinstance(attr_value, str):
setattr(self, attr_name, colval) setattr(self, attr_name, colval)
@ -88,7 +86,7 @@ class _JitenonEntry(Entry):
else: else:
attr_value.append(colval) attr_value.append(colval)
def _yomikatas(self): def _get_readings(self):
yomikata = self.yomikata yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata) m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m: if m:
@ -109,20 +107,20 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""] return [""]
def _ikei_headwords(self): def _other_form_headwords(self):
ikei_headwords = {} other_form_headwords = {}
for val in self.ikei: for val in self.other_forms:
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val) m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if not m: if not m:
print(f"Invalid 異形 format: {val}\n{self}\n") print(f"Invalid 異形 format: {val}\n{self}\n")
continue continue
expression = m.group(1) expression = m.group(1)
reading = m.group(2) reading = m.group(2)
if reading not in ikei_headwords: if reading not in other_form_headwords:
ikei_headwords[reading] = [] other_form_headwords[reading] = []
if expression not in ikei_headwords[reading]: if expression not in other_form_headwords[reading]:
ikei_headwords[reading].append(expression) other_form_headwords[reading].append(expression)
return ikei_headwords return other_form_headwords
@staticmethod @staticmethod
def __clean_text(text): def __clean_text(text):
@ -133,9 +131,10 @@ class _JitenonEntry(Entry):
return text return text
def __str__(self): def __str__(self):
column_map = self._get_column_map()
colvals = [str(self.entry_id)] colvals = [str(self.entry_id)]
for attr in self._COLUMNS.values(): for attr_name in column_map.values():
attr_val = getattr(self, attr[0]) attr_val = getattr(self, attr_name)
if isinstance(attr_val, str): if isinstance(attr_val, str):
colvals.append(attr_val) colvals.append(attr_val)
elif isinstance(attr_val, list): elif isinstance(attr_val, list):
@ -144,83 +143,100 @@ class _JitenonEntry(Entry):
class JitenonYojiEntry(_JitenonEntry): class JitenonYojiEntry(_JitenonEntry):
_COLUMNS = { def __init__(self, target, entry_id):
"四字熟語": ["expression", ""], super().__init__(target, entry_id)
"読み方": ["yomikata", ""], self.origin = ""
"意味": ["imi", ""], self.kanken_level = ""
"出典": ["shutten", ""], self.category = ""
"漢検級": ["kankenkyuu", ""], self.related_expressions = []
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []], def _get_column_map(self):
"類義語": ["ruigigo", []], return {
"四字熟語": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"漢検級": "kanken_level",
"場面用途": "category",
"類義語": "related_expressions",
} }
def __init__(self, entry_id): def _add_variant_expressions(self, headwords):
super().__init__(entry_id) for expressions in headwords.values():
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions) Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry): class JitenonKotowazaEntry(_JitenonEntry):
_COLUMNS = { def __init__(self, target, entry_id):
"言葉": ["expression", ""], super().__init__(target, entry_id)
"読み方": ["yomikata", ""], self.origin = ""
"意味": ["imi", ""], self.example = ""
"出典": ["shutten", ""], self.related_expressions = []
"例文": ["reibun", ""],
"異形": ["ikei", []], def _get_column_map(self):
"類句": ["ruiku", []], return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"例文": "example",
"類句": "related_expressions",
} }
def __init__(self, entry_id): def _get_headwords(self):
super().__init__(entry_id)
def _set_headwords(self):
if self.expression == "金棒引き・鉄棒引き": if self.expression == "金棒引き・鉄棒引き":
self._headwords = { headwords = {
"かなぼうひき": ["金棒引き", "鉄棒引き"] "かなぼうひき": ["金棒引き", "鉄棒引き"]
} }
else: else:
super()._set_headwords() headwords = super()._get_headwords()
return headwords
def _set_variant_headwords(self): def _add_variant_expressions(self, headwords):
for expressions in self._headwords.values(): for expressions in headwords.values():
Expressions.add_variant_kanji(expressions) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry): class JitenonKokugoEntry(_JitenonEntry):
_COLUMNS = { def __init__(self, target, entry_id):
"言葉": ["expression", ""], super().__init__(target, entry_id)
"読み方": ["yomikata", ""], self.example = ""
"意味": ["imi", ""], self.alt_expression = ""
"例文": ["reibun", ""], self.antonym = ""
"別表記": ["betsuhyouki", ""], self.attachments = ""
"対義語": ["taigigo", ""], self.compounds = ""
"活用": ["katsuyou", ""], self.related_words = ""
"用例": ["yourei", ""],
"類語": ["ruigo", ""], def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"例文": "example",
"別表記": "alt_expression",
"対義語": "antonym",
"活用": "attachments",
"用例": "compounds",
"類語": "related_words",
} }
def __init__(self, entry_id): def _get_headwords(self):
super().__init__(entry_id)
def _set_headwords(self):
headwords = {} headwords = {}
for reading in self.yomikata.split(""): for reading in self.yomikata.split(""):
if reading not in headwords: if reading not in headwords:
headwords[reading] = [] headwords[reading] = []
for expression in self.expression.split(""): for expression in self.expression.split(""):
headwords[reading].append(expression) headwords[reading].append(expression)
if self.betsuhyouki.strip() != "": if self.alt_expression.strip() != "":
for expression in self.betsuhyouki.split(""): for expression in self.alt_expression.split(""):
headwords[reading].append(expression) headwords[reading].append(expression)
self._headwords = headwords return headwords
def _set_variant_headwords(self): def _add_variant_expressions(self, headwords):
for expressions in self._headwords.values(): for expressions in headwords.values():
Expressions.add_variant_kanji(expressions) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)

View file

@ -1,4 +1,3 @@
import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions import bot.entries.expressions as Expressions
@ -9,19 +8,17 @@ from bot.entries.smk8_preprocess import preprocess_page
class _BaseSmk8Entry(Entry): class _BaseSmk8Entry(Entry):
ID_TO_ENTRY = {} def __init__(self, target, entry_id):
SUBENTRY_ID_TO_ENTRY_ID = {} super().__init__(target, entry_id)
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.children = [] self.children = []
self.phrases = [] self.phrases = []
self.kanjis = [] self.kanjis = []
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page): def set_page(self, page):
page = self.__decompose_subentries(page) page = self.__decompose_subentries(page)
self._page = page self._page = page
@ -30,13 +27,6 @@ class _BaseSmk8Entry(Entry):
soup = BeautifulSoup(self._page, "xml") soup = BeautifulSoup(self._page, "xml")
return soup return soup
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_part_of_speech_tags(self): def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None: if self._part_of_speech_tags is not None:
return self._part_of_speech_tags return self._part_of_speech_tags
@ -50,8 +40,8 @@ class _BaseSmk8Entry(Entry):
self._part_of_speech_tags.append(tag.text) self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags return self._part_of_speech_tags
def _set_variant_headwords(self): def _add_variant_expressions(self, headwords):
for expressions in self._headwords.values(): for expressions in headwords.values():
Expressions.add_variant_kanji(expressions) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)
@ -87,7 +77,7 @@ class _BaseSmk8Entry(Entry):
tag_soup.name = "項目" tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id) subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode() page = tag_soup.decode()
subentry.set_page(page) subentry.set_page(page)
subentry_list.append(subentry) subentry_list.append(subentry)
@ -106,6 +96,16 @@ class _BaseSmk8Entry(Entry):
else: else:
raise Exception(f"Invalid entry ID: {id_string}") raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod @staticmethod
def _clean_expression(expression): def _clean_expression(expression):
for x in ["", "", "", "", "", " "]: for x in ["", "", "", "", "", " "]:
@ -114,24 +114,24 @@ class _BaseSmk8Entry(Entry):
@staticmethod @staticmethod
def _fill_alts(soup): def _fill_alts(soup):
for e in soup.find_all(["親見出仮名", "親見出表記"]): for el in soup.find_all(["親見出仮名", "親見出表記"]):
e.string = e.attrs["alt"] el.string = el.attrs["alt"]
for gaiji in soup.find_all("外字"): for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"] gaiji.string = gaiji.img.attrs["alt"]
class Smk8Entry(_BaseSmk8Entry): class Smk8Entry(_BaseSmk8Entry):
def __init__(self, page_id): def __init__(self, target, page_id):
entry_id = (page_id, 0) entry_id = (page_id, 0)
super().__init__(entry_id) super().__init__(target, entry_id)
def set_page(self, page): def set_page(self, page):
page = preprocess_page(page) page = preprocess_page(page)
super().set_page(page) super().set_page(page)
def _set_headwords(self): def _get_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記") self._delete_unused_nodes(soup)
self._fill_alts(soup) self._fill_alts(soup)
reading = self._find_reading(soup) reading = self._find_reading(soup)
expressions = [] expressions = []
@ -140,16 +140,14 @@ class Smk8Entry(_BaseSmk8Entry):
for expression in self._find_expressions(soup): for expression in self._find_expressions(soup):
if expression not in expressions: if expression not in expressions:
expressions.append(expression) expressions.append(expression)
self._headwords = {reading: expressions} headwords = {reading: expressions}
return headwords
class Smk8ChildEntry(_BaseSmk8Entry): class Smk8ChildEntry(_BaseSmk8Entry):
def __init__(self, entry_id): def _get_headwords(self):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記") self._delete_unused_nodes(soup)
self._fill_alts(soup) self._fill_alts(soup)
reading = self._find_reading(soup) reading = self._find_reading(soup)
expressions = [] expressions = []
@ -158,19 +156,20 @@ class Smk8ChildEntry(_BaseSmk8Entry):
for expression in self._find_expressions(soup): for expression in self._find_expressions(soup):
if expression not in expressions: if expression not in expressions:
expressions.append(expression) expressions.append(expression)
self._headwords = {reading: expressions} headwords = {reading: expressions}
return headwords
class Smk8PhraseEntry(_BaseSmk8Entry): class Smk8PhraseEntry(_BaseSmk8Entry):
def __init__(self, entry_id): def __init__(self, target, entry_id):
super().__init__(entry_id) super().__init__(target, entry_id)
self.__phrase_readings = load_smk8_phrase_readings() self.__phrase_readings = load_smk8_phrase_readings()
def get_part_of_speech_tags(self): def get_part_of_speech_tags(self):
# phrases do not contain these tags # phrases do not contain these tags
return [] return []
def _set_headwords(self): def _get_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
headwords = {} headwords = {}
expressions = self._find_expressions(soup) expressions = self._find_expressions(soup)
@ -181,10 +180,10 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
headwords[reading].append(expression) headwords[reading].append(expression)
else: else:
headwords[reading] = [expression] headwords[reading] = [expression]
self._headwords = headwords return headwords
def _find_expressions(self, soup): def _find_expressions(self, soup):
Soup.delete_soup_nodes(soup, "ルビG") self._delete_unused_nodes(soup)
self._fill_alts(soup) self._fill_alts(soup)
text = soup.find("標準表記").text text = soup.find("標準表記").text
text = self._clean_expression(text) text = self._clean_expression(text)
@ -206,15 +205,14 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
class Smk8KanjiEntry(_BaseSmk8Entry): class Smk8KanjiEntry(_BaseSmk8Entry):
def __init__(self, entry_id): def _get_headwords(self):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup() soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup) self._fill_alts(soup)
reading = self.__get_parent_reading() reading = self.__get_parent_reading()
expressions = self._find_expressions(soup) expressions = self._find_expressions(soup)
self._headwords = {reading: expressions} headwords = {reading: expressions}
return headwords
def __get_parent_reading(self): def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]

View file

@ -15,6 +15,7 @@ def preprocess_page(page):
page = __strip_page(page) page = __strip_page(page)
page = __replace_glyph_codes(page) page = __replace_glyph_codes(page)
page = __format_hyougai_marks(page) page = __format_hyougai_marks(page)
page = __remove_pronunciation_parentheses(page)
return page return page
@ -64,6 +65,7 @@ def __format_hyougai_marks(page):
for x in ["\n", "\t", " "]: for x in ["\n", "\t", " "]:
text = text.replace(x, "") text = text.replace(x, "")
text = re.sub(r"〈([^〈]+)〉", r"\1", text) text = re.sub(r"〈([^〈]+)〉", r"\1", text)
page = re.sub(r"〈([^〈]+)〉", r"\1␃", page) page = re.sub(r"〈([^〈]+)〉", r"\1␃", page)
for mark in re.findall(r"《.", text): for mark in re.findall(r"《.", text):
if mark[1] == "": if mark[1] == "":
@ -79,13 +81,29 @@ def __format_hyougai_marks(page):
page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})", page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
r"\1<表外字>\2</表外字>", r"\1<表外字>\2</表外字>",
page, count=1) page, count=1)
page = page.replace("", "") page = page.replace("", "")
page = page.replace("", "") page = page.replace("", "")
soup = BeautifulSoup(page, features="xml") soup = BeautifulSoup(page, features="xml")
for el in soup.find_all("表外音訓"): for el in soup.find_all("表外音訓"):
if el.text == "": if el.text == "":
el.append(el.next_sibling) el.append(el.next_sibling)
mark_xml = "<表外音訓マーク>︽</表外音訓マーク>"
mark_soup = BeautifulSoup(mark_xml, "xml")
el.append(mark_soup.表外音訓マーク)
for el in soup.find_all("表外字"): for el in soup.find_all("表外字"):
if el.text == "": if el.text == "":
el.append(el.next_sibling) el.append(el.next_sibling)
mark_xml = "<表外字マーク>︿</表外字マーク>"
mark_soup = BeautifulSoup(mark_xml, "xml")
el.append(mark_soup.表外字マーク)
return soup.decode() return soup.decode()
def __remove_pronunciation_parentheses(page):
    """Return *page* after two literal substitutions around 表音表記 tags.

    The first substitution targets the text at the opening tag and the
    second the text at the closing tag; they are applied in that order.
    """
    # NOTE(review): in this view each search string is identical to its
    # replacement, which would make the call a no-op -- confirm the literals
    # against the repository copy before relying on this function.
    after_opening = page.replace("<表音表記>", "<表音表記>")
    return after_opening.replace("</表音表記>", "</表音表記>")

View file

@ -0,0 +1,204 @@
# pylint: disable=too-few-public-methods
import subprocess
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir
from bot.targets import Targets
from bot.mdict.terms.factory import new_terminator
class Exporter(ABC):
    """Abstract base for exporters that build MDict files for one target.

    An export stages the term text, the media files and a description file
    in a build directory, then runs the external ``mdict`` command to pack
    them into ``.mdx`` and ``.mdd`` files in the output directory.
    Subclasses provide the revision and attribution strings.
    """

    def __init__(self, target):
        """Store *target* and create the terminator that renders its terms."""
        self._target = target
        self._terminator = new_terminator(target)
        # The paths below are filled in by the helpers that create them.
        self._build_dir = None
        self._build_media_dir = None
        self._description_file = None
        self._out_dir = None

    def export(self, entries, media_dir, icon_file):
        """Write the .mdx, .mdd and icon files for *entries*.

        *media_dir* may be None (an empty media directory is then created);
        *icon_file* may be None (a premade icon is then used if one exists).
        The build directory is removed once every step has succeeded.
        """
        self._init_build_media_dir(media_dir)
        self._init_description_file(entries)
        terms = self._get_terms(entries)
        print(f"Exporting {len(terms)} Mdict keys...")
        self._write_mdx_file(terms)
        self._write_mdd_file()
        self._write_icon_file(icon_file)
        self._rm_build_dir()

    def _get_build_dir(self):
        """Return the build directory, creating it empty on first use."""
        if self._build_dir is None:
            fresh_dir = os.path.join(user_cache_dir("jitenbot"), "mdict_build")
            # Leftovers from a previous run are discarded.
            if Path(fresh_dir).is_dir():
                shutil.rmtree(fresh_dir)
            os.makedirs(fresh_dir)
            self._build_dir = fresh_dir
        return self._build_dir

    def _init_build_media_dir(self, media_dir):
        """Populate the per-target media directory inside the build directory.

        The directory receives a copy of *media_dir* (when given) plus the
        target's CSS file, and its path is handed to the terminator.
        """
        staged_dir = os.path.join(self._get_build_dir(), self._target.value)
        if media_dir is None:
            os.makedirs(staged_dir)
        else:
            print("Copying media files to build directory...")
            shutil.copytree(media_dir, staged_dir)
        shutil.copy(self._get_css_file(), staged_dir)
        self._terminator.set_media_dir(staged_dir)
        self._build_media_dir = staged_dir

    def _init_description_file(self, entries):
        """Fill in the description template and save it in the build directory."""
        name = f"{self._target.value}.mdx.description.html"
        template_path = os.path.join("data", "mdict", "description", name)
        with open(template_path, "r", encoding="utf8") as template:
            template_text = template.read()
        filled_text = (
            template_text
            .replace("{{revision}}", self._get_revision(entries))
            .replace("{{attribution}}", self._get_attribution(entries))
        )
        staged_path = os.path.join(self._get_build_dir(), name)
        with open(staged_path, "w", encoding="utf8") as staged:
            staged.write(filled_text)
        self._description_file = staged_path

    def _get_terms(self, entries):
        """Return the terms produced by the terminator for every entry, in order."""
        collected = []
        total = len(entries)
        for position, entry in enumerate(entries, start=1):
            progress = f"Creating Mdict terms for entry {position}/{total}"
            # '\r' keeps the progress report on a single console line.
            print(progress, end='\r', flush=True)
            collected.extend(self._terminator.make_terms(entry))
        print()
        return collected

    def _write_mdx_file(self, terms):
        """Pack *terms* into ``<target>.mdx`` with the ``mdict`` command.

        Raises subprocess.CalledProcessError if the command exits non-zero.
        """
        mdx_path = os.path.join(
            self._get_out_dir(), f"{self._target.value}.mdx")
        command = ["mdict", "-a", self._get_term_file(terms)]
        command += ["--title", self._get_title_file()]
        command += ["--description", self._description_file]
        command.append(mdx_path)
        subprocess.run(command, check=True)

    def _write_mdd_file(self):
        """Pack the staged media directory into ``<target>.mdd``.

        Raises subprocess.CalledProcessError if the command exits non-zero.
        """
        mdd_path = os.path.join(
            self._get_out_dir(), f"{self._target.value}.mdd")
        command = ["mdict", "-a", self._build_media_dir]
        command += ["--title", self._get_title_file()]
        command += ["--description", self._description_file]
        command.append(mdd_path)
        subprocess.run(command, check=True)

    def _write_icon_file(self, icon_file):
        """Copy an icon next to the output files, if any icon is available.

        *icon_file* wins when it names an existing file; otherwise the
        premade icon for the target is used; otherwise nothing is copied.
        """
        premade_icon = f"data/mdict/icon/{self._target.value}.png"
        icon_path = os.path.join(
            self._get_out_dir(), f"{self._target.value}.png")
        if icon_file is not None and Path(icon_file).is_file():
            chosen = icon_file
        elif Path(premade_icon).is_file():
            chosen = premade_icon
        else:
            return
        shutil.copy(chosen, icon_path)

    def _get_out_dir(self):
        """Return the output directory, recreating it empty on first use."""
        if self._out_dir is None:
            fresh_dir = os.path.join(
                user_documents_dir(), "jitenbot", "mdict", self._target.value)
            # An earlier export for the same target is deleted.
            if Path(fresh_dir).is_dir():
                shutil.rmtree(fresh_dir)
            os.makedirs(fresh_dir)
            self._out_dir = fresh_dir
        return self._out_dir

    def _get_term_file(self, terms):
        """Write *terms* to the term text file and return its path.

        Each term's strings are joined with newlines and followed by a
        ``</>`` separator line.
        """
        term_path = os.path.join(
            self._get_build_dir(), f"{self._target.value}.mdx.txt")
        with open(term_path, "w", encoding="utf8") as term_file:
            term_file.writelines(
                "\n".join(term) + "\n</>\n" for term in terms)
        return term_path

    def _get_title_file(self):
        """Return the relative path of the target's title file."""
        name = f"{self._target.value}.mdx.title.html"
        return os.path.join("data", "mdict", "title", name)

    def _get_css_file(self):
        """Return the relative path of the target's stylesheet."""
        name = f"{self._target.value}.css"
        return os.path.join("data", "mdict", "css", name)

    def _rm_build_dir(self):
        """Delete the build directory and everything inside it."""
        shutil.rmtree(self._get_build_dir())

    @abstractmethod
    def _get_revision(self, entries):
        """Return the text substituted for ``{{revision}}``."""

    @abstractmethod
    def _get_attribution(self, entries):
        """Return the text substituted for ``{{attribution}}``."""
class _JitenonExporter(Exporter):
    """Exporter whose revision and attribution come from the newest entry.

    "Newest" means the entry with the greatest ``modified_date``; when
    several entries share that date, the first one encountered is used.
    """

    def _get_revision(self, entries):
        """Return the newest ``modified_date`` formatted as the revision text.

        Raises:
            ValueError: if *entries* is empty.
        """
        newest = self._newest_entry(entries)
        return newest.modified_date.strftime("%Y年%m月%d日閲覧")

    def _get_attribution(self, entries):
        """Return the ``attribution`` of the most recently modified entry.

        Raises:
            ValueError: if *entries* is empty.
        """
        # The previous loop never advanced its running maximum, so every
        # entry overwrote the result and the last entry always won.
        return self._newest_entry(entries).attribution

    @staticmethod
    def _newest_entry(entries):
        """Return the entry with the greatest ``modified_date``."""
        newest = None
        for entry in entries:
            if newest is None or entry.modified_date > newest.modified_date:
                newest = entry
        if newest is None:
            raise ValueError("cannot derive metadata from an empty entry list")
        return newest
class JitenonKokugoExporter(_JitenonExporter):
    """Concrete exporter that inherits all behaviour from _JitenonExporter."""
class JitenonYojiExporter(_JitenonExporter):
    """Concrete exporter that inherits all behaviour from _JitenonExporter."""
class JitenonKotowazaExporter(_JitenonExporter):
    """Concrete exporter that inherits all behaviour from _JitenonExporter."""
class _MonokakidoExporter(Exporter):
    """Exporter whose revision text is the date on which the export runs."""

    def _get_revision(self, entries):
        """Return the current local date as the revision text.

        *entries* is accepted for interface compatibility and not consulted.
        """
        return datetime.now().strftime("%Y年%m月%d日作成")
class Smk8Exporter(_MonokakidoExporter):
    """Exporter that reports a fixed attribution line."""

    def _get_attribution(self, entries):
        """Return the fixed copyright line; *entries* is not consulted."""
        attribution = "© Sanseido Co., LTD. 2020"
        return attribution
class Daijirin2Exporter(_MonokakidoExporter):
    """Exporter that reports a fixed attribution line."""

    def _get_attribution(self, entries):
        """Return the fixed copyright line; *entries* is not consulted."""
        attribution = "© Sanseido Co., LTD. 2019"
        return attribution

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.mdict.exporters.export import JitenonKokugoExporter
from bot.mdict.exporters.export import JitenonYojiExporter
from bot.mdict.exporters.export import JitenonKotowazaExporter
from bot.mdict.exporters.export import Smk8Exporter
from bot.mdict.exporters.export import Daijirin2Exporter
def new_mdict_exporter(target):
    """Create the MDict exporter registered for *target*.

    Raises:
        KeyError: if no exporter class is registered for *target*.
    """
    registry = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
    }
    exporter_class = registry[target]
    return exporter_class(target)

View file

@ -0,0 +1,77 @@
import re
import os
from functools import cache
from pathlib import Path
from bot.soup import delete_soup_nodes
from bot.data import load_mdict_name_conversion
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Return the markup of *entry*'s page converted for MDict.

    The page soup is transformed in place (ruby markers, parent link,
    node pruning, link rewriting, tag-name conversion) and the resulting
    top-level ``span`` element is serialised.
    """
    page = entry.get_page_soup()
    __add_rubies(page)
    __hyperlink_parent_expression(page, entry)
    __delete_unused_nodes(page, media_dir)
    __convert_links(page, entry)
    convert_names(page, load_mdict_name_conversion(entry.target))
    return page.span.decode()
def __add_rubies(soup):
    """Turn 表外音訓 / 表外字 elements into ``ruby`` elements with an ``rt``.

    The marker child of each element is renamed to ``rt`` and moved to the
    end of the element.  The marker is looked up under the element's own
    name first (``<name>マーク``) and then under ``表外字マーク``, the only
    name the previous implementation tried.  Elements with no marker are
    renamed but otherwise left untouched; previously they raised
    AttributeError.
    """
    for name in ["表外音訓", "表外字"]:
        for ruby in soup.find_all(name):
            ruby.name = "ruby"
            rt = ruby.find(f"{name}マーク")
            if rt is None:
                rt = ruby.find("表外字マーク")
            if rt is None:
                continue
            rt.name = "rt"
            ruby.append(rt)  # the rt must be positioned after the text
def __hyperlink_parent_expression(soup, entry):
    """Turn every 親表記 element into a link to *entry*'s parent entry.

    The parent is only looked up when at least one 親表記 element exists.
    """
    if soup.find("親表記") is None:
        return
    parent_gid = entry.get_parent().get_global_identifier()
    for node in soup.find_all("親表記"):
        node.name = "a"
        node.attrs["href"] = f"entry://{parent_gid}"
def __delete_unused_nodes(soup, media_dir):
    """Prune nodes that should not appear in the exported glossary.

    * カットG nodes are deleted when *media_dir* has no graphics directory.
    * A ``logo`` element is removed when the node right after it is a
      漢字見出G or 漢字音G element.
    * Inside 漢字音G elements, matching text nodes are replaced.
    """
    if not __graphics_directory_exists(media_dir):
        delete_soup_nodes(soup, "カットG")
    for logo in soup.find_all("logo"):
        follower = logo.next_sibling
        if follower is not None and follower.name in ["漢字見出G", "漢字音G"]:
            logo.decompose()
    for group in soup.find_all("漢字音G"):
        # NOTE(review): both string literals below appear empty in this
        # view -- confirm them against the repository copy of the file.
        for text_node in group.find_all(string=""):
            text_node.replace_with("")
@cache
def __graphics_directory_exists(media_dir):
    """Return True if *media_dir* contains a ``graphics`` directory.

    The answer is cached per *media_dir* value, so the file system is
    only consulted on the first call for each value.
    """
    return Path(media_dir, "graphics").is_dir()
def __convert_links(soup, entry):
    """Rewrite entry-id hrefs of ``a`` elements into ``entry://`` links.

    Hrefs that already start with ``entry:`` or ``http(s):`` are kept.

    Raises:
        Exception: if an href matches none of the accepted formats.
    """
    for anchor in soup.find_all("a"):
        href = anchor.attrs["href"]
        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_id = entry.id_string_to_entry_id(href)
            ref_gid = entry.ID_TO_ENTRY[ref_id].get_global_identifier()
            anchor.attrs["href"] = f"entry://{ref_gid}"
            continue
        if re.match(r"^entry:", href):
            continue
        if re.match(r"^https?:[\w\W]*", href):
            continue
        raise Exception(f"Invalid href format: {href}")

View file

@ -0,0 +1,141 @@
# pylint: disable=too-few-public-methods
import re
class JitenonGlossary:
    """Shared page transformations for the jitenon glossary makers.

    Subclasses set ``_id_pattern`` (a regex whose first group captures an
    entry id inside an href) and ``_expression_header`` (the table header
    text of the row holding the expression).
    """

    def __init__(self):
        self._id_pattern = None
        self._expression_header = None

    def _replace_punctuation(self, soup):
        """Apply the punctuation substitutions to every text node of *soup*."""
        # NOTE(review): both replacement strings appear empty in this view,
        # i.e. the characters are deleted -- confirm against the repository.
        punctuation = {
            "/": "",
            ",": "",
        }
        for text_node in soup.find_all(string=True):
            replaced = text_node.text
            for old, new in punctuation.items():
                replaced = replaced.replace(old, new)
            text_node.replace_with(replaced)

    def _add_internal_links(self, soup, entry):
        """Point hrefs matching ``_id_pattern`` at ``entry://`` targets.

        Hrefs starting with ``http(s):`` or ``?`` are left unchanged.

        Raises:
            Exception: if an href matches neither form.
        """
        for anchor in soup.find_all("a"):
            href = anchor.attrs["href"]
            id_match = re.search(self._id_pattern, href)
            if id_match is None:
                if re.match(r"^(?:https?:|\?)[\w\W]*", href):
                    continue
                raise Exception(f"Invalid href format: {href}")
            ref_entry = entry.ID_TO_ENTRY[int(id_match.group(1))]
            ref_gid = ref_entry.get_global_identifier()
            anchor.attrs["href"] = f"entry://{ref_gid}"

    def _decompose_table_rows(self, soup, entry):
        """Remove table rows that are shown elsewhere in the glossary.

        The expression row is always removed, the 読み方 row only when the
        reading is displayed in the headword line, and the 意味 cell is
        moved to the start of the body as a ``div``.  The table itself is
        removed once no rows remain.
        """
        for row in soup.find_all("tr"):
            if row.find("th") is None:
                continue
            header = row.th.text
            if header == self._expression_header:
                row.decompose()
            elif header == "読み方":
                if self._do_display_yomikata_in_headword(entry):
                    row.decompose()
            elif header == "意味":
                meaning = row.td
                meaning.name = "div"
                meaning.attrs["class"] = "意味"
                soup.body.insert(0, meaning)
                row.decompose()
        if soup.find("tr") is None:
            soup.table.decompose()

    def _insert_headword_line(self, soup, entry):
        """Insert a 見出し ``div`` at the start of the body.

        It holds the reading (only when it qualifies for display) followed
        by the expression.
        """
        line = soup.new_tag("div")
        line.attrs["class"] = "見出し"
        if self._do_display_yomikata_in_headword(entry):
            reading_span = soup.new_tag("span")
            reading_span.attrs["class"] = "読み方"
            reading_span.string = entry.yomikata
            line.append(reading_span)
        expression_span = soup.new_tag("span")
        expression_span.attrs["class"] = self._expression_header
        expression_span.string = f"{entry.expression}"
        line.append(expression_span)
        soup.body.insert(0, line)

    def _do_display_yomikata_in_headword(self, entry):
        """Return True when the reading is short enough and matches the kana pattern.

        The reading must consist solely of characters in the pattern's
        class and be at most 10 characters long.
        """
        matches_pattern = re.match(r"^[ぁ-ヿ、]+$", entry.yomikata)
        return bool(matches_pattern) and len(entry.yomikata) <= 10
class JitenonKokugoGlossary(JitenonGlossary):
    """Glossary maker for pages whose ids look like ``/word/p<number>``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified body markup of *entry*'s transformed page."""
        page = entry.get_page_soup()
        self._remove_antonym_list_item(page)
        self._replace_number_icons(page, media_dir)
        self._replace_punctuation(page)
        self._add_internal_links(page, entry)
        self._decompose_table_rows(page, entry)
        self._insert_headword_line(page, entry)
        return page.body.prettify()

    def _remove_antonym_list_item(self, soup):
        """Remove every ``li`` whose text is exactly 対義語辞典."""
        for item in soup.find_all("li"):
            if item.text == "対義語辞典":
                item.decompose()

    def _replace_number_icons(self, soup, media_dir):
        """Replace each ``img`` with a ``span`` holding text taken from its alt.

        *media_dir* is not used by this implementation.
        """
        for image in soup.find_all("img"):
            # NOTE(review): the character class below looks truncated in
            # this view -- confirm the pattern against the repository.
            label = re.search(r"[-]+", image.attrs["alt"]).group(0)
            image.name = "span"
            image.string = label
            for attribute in ("src", "alt"):
                del image.attrs[attribute]

    def _do_display_yomikata_in_headword(self, entry):
        """Return True when the reading is at most 10 characters long."""
        return not len(entry.yomikata) > 10
class JitenonYojiGlossary(JitenonGlossary):
    """Glossary maker for pages whose ids look like ``/yoji?/<number>.html``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "四字熟語"
        self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified body markup of *entry*'s transformed page.

        *media_dir* is not used by this implementation.
        """
        page = entry.get_page_soup()
        self._replace_punctuation(page)
        self._add_internal_links(page, entry)
        self._decompose_table_rows(page, entry)
        self._insert_headword_line(page, entry)
        return page.body.prettify()
class JitenonKotowazaGlossary(JitenonGlossary):
    """Glossary maker for pages whose ids look like ``/kotowaza/<number>.php``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified body markup of *entry*'s transformed page.

        *media_dir* is not used by this implementation.
        """
        page = entry.get_page_soup()
        self._replace_punctuation(page)
        self._add_internal_links(page, entry)
        self._decompose_table_rows(page, entry)
        self._insert_headword_line(page, entry)
        return page.body.prettify()

View file

@ -0,0 +1,67 @@
import re
from bot.soup import delete_soup_nodes
from bot.data import load_mdict_name_conversion
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Return the markup of *entry*'s page converted for MDict.

    The page soup is transformed in place (parent links, node pruning,
    link rewriting, priority markers, tag-name conversion) and the
    resulting top-level ``span`` element is serialised.  *media_dir* is
    not used by this implementation.
    """
    page = entry.get_page_soup()
    __fill_alts(page, entry)
    __delete_unused_nodes(page)
    __convert_links(page, entry)
    __convert_priority_markers(page)
    convert_names(page, load_mdict_name_conversion(entry.target))
    return page.span.decode()
def __fill_alts(soup, entry):
    """Turn 親見出仮名 / 親見出表記 elements into links to the parent entry.

    Each element becomes an ``a`` whose text is its former ``alt`` value
    and whose href points at the parent of *entry*.  The parent is only
    looked up when at least one such element exists.
    """
    parent_tags = ["親見出仮名", "親見出表記"]
    if soup.find(parent_tags) is None:
        return
    parent_gid = entry.get_parent().get_global_identifier()
    for node in soup.find_all(parent_tags):
        node.name = "a"
        # The alt text becomes the visible link text.
        node.string = node.attrs.pop("alt")
        node.attrs["href"] = f"entry://{parent_gid}"
def __delete_unused_nodes(soup):
    """Delete the node types that are not exported (currently only 連濁)."""
    unused_names = ("連濁",)
    for unused_name in unused_names:
        delete_soup_nodes(soup, unused_name)
def __convert_links(soup, entry):
    """Rewrite the hrefs of ``a`` elements into MDict link targets.

    * hrefs starting with ``$``: the anchor is unwrapped
    * entry ids: rewritten to ``entry://<global id>``
    * ``.aac`` file names: rewritten to ``sound://audio/<name>``
    * hrefs starting with ``entry:`` or ``http(s):``: kept as they are

    Raises:
        Exception: if an href matches none of the accepted formats.
    """
    for anchor in soup.find_all("a"):
        href = anchor.attrs["href"]
        if href.startswith("$"):
            anchor.unwrap()
            continue
        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_id = entry.id_string_to_entry_id(href)
            ref_gid = entry.ID_TO_ENTRY[ref_id].get_global_identifier()
            anchor.attrs["href"] = f"entry://{ref_gid}"
            continue
        if re.match(r"^[0-9]+[ab]?\.aac$", href):
            anchor.attrs["href"] = f"sound://audio/{href}"
            continue
        if re.match(r"^entry:", href):
            continue
        if re.match(r"^https?:[\w\W]*", href):
            continue
        raise Exception(f"Invalid href format: {href}")
def __convert_priority_markers(soup):
    """Replace priority-marker images with ``span`` elements holding text.

    Images are selected by their ``alt`` attribute; each selected image is
    renamed to ``span`` and given the paired text.
    """
    # (alt value to select, text to insert), processed in this order.
    # NOTE(review): several literals appear empty in this view -- confirm
    # them against the repository copy of the file.
    marker_texts = (
        ("*", ""),
        ("", ""),
    )
    for alt_value, text in marker_texts:
        for image in soup.find_all("img", attrs={"alt": alt_value}):
            image.name = "span"
            image.string = text

View file

@ -0,0 +1,23 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.daijirin2 import make_glossary
class Daijirin2Terminator(Terminator):
    """Terminator whose entries have ``children`` and ``phrases`` subentries."""

    def _glossary(self, entry):
        """Return the glossary for *entry*, building it once per entry id."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return ``[subentries, list title]`` pairs for the link lists."""
        return [
            [entry.children, "子項目"],
            [entry.phrases, "句項目"],
        ]

    def _subentry_lists(self, entry):
        """Return the subentry lists for which terms are generated."""
        return [entry.children, entry.phrases]

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.mdict.terms.jitenon import JitenonKokugoTerminator
from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
def new_terminator(target):
    """Create the MDict terminator registered for *target*.

    Raises:
        KeyError: if no terminator class is registered for *target*.
    """
    registry = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
    }
    terminator_class = registry[target]
    return terminator_class(target)

View file

@ -0,0 +1,42 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
    """Terminator that delegates glossary creation to ``_glossary_maker``.

    Subclasses assign ``_glossary_maker``; entries have no subentries.
    """

    def __init__(self, target):
        super().__init__(target)
        # Set by subclasses to an object with a make_glossary method.
        self._glossary_maker = None

    def _glossary(self, entry):
        """Return the glossary for *entry*, building it once per entry id."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = self._glossary_maker.make_glossary(
                entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return an empty list: no link lists are generated."""
        return []

    def _subentry_lists(self, entry):
        """Return an empty list: no subentry terms are generated."""
        return []
class JitenonKokugoTerminator(JitenonTerminator):
    """JitenonTerminator that renders with JitenonKokugoGlossary."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonKokugoGlossary()
        self._glossary_maker = maker
class JitenonYojiTerminator(JitenonTerminator):
    """JitenonTerminator that renders with JitenonYojiGlossary."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonYojiGlossary()
        self._glossary_maker = maker
class JitenonKotowazaTerminator(JitenonTerminator):
    """JitenonTerminator that renders with JitenonKotowazaGlossary."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonKotowazaGlossary()
        self._glossary_maker = maker

24
bot/mdict/terms/smk8.py Normal file
View file

@ -0,0 +1,24 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.smk8 import make_glossary
class Smk8Terminator(Terminator):
    """Terminator whose entries have children, phrases and kanjis subentries."""

    def _glossary(self, entry):
        """Return the glossary for *entry*, building it once per entry id."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return ``[subentries, list title]`` pairs for the link lists.

        ``kanjis`` subentries get terms but no link list.
        """
        return [
            [entry.children, "子項目"],
            [entry.phrases, "句項目"],
        ]

    def _subentry_lists(self, entry):
        """Return the subentry lists for which terms are generated."""
        return [entry.children, entry.phrases, entry.kanjis]

View file

@ -0,0 +1,73 @@
from abc import abstractmethod, ABC
class Terminator(ABC):
    """Abstract base that turns dictionary entries into MDict terms.

    A term is a two-item list ``[key, content]``.  Every entry yields one
    term holding its full glossary under its global identifier, plus one
    ``@@@LINK=`` term per search key that points at that identifier.
    """

    def __init__(self, target):
        self._target = target
        self._glossary_cache = {}
        self._media_dir = None

    def set_media_dir(self, media_dir):
        """Record the media directory made available to glossary makers."""
        self._media_dir = media_dir

    def make_terms(self, entry):
        """Return the terms for *entry* followed by those of its subentries.

        The link terms are emitted in set-iteration order.
        """
        gid = entry.get_global_identifier()
        terms = [[gid, self.__full_glossary(entry)]]
        keys = set()
        for reading, expressions in entry.get_headwords().items():
            if not expressions:
                keys.add(reading)
            for expression in expressions:
                if expression.strip() == "":
                    # Blank expression: only the reading is searchable.
                    keys.add(reading)
                    continue
                keys.add(expression)
                if reading.strip() == "":
                    continue
                if reading == expression:
                    keys.add(reading)
                else:
                    keys.add(f"{reading}{expression}")
        link = f"@@@LINK={gid}"
        terms.extend([key, link] for key in keys if key.strip() != "")
        for subentries in self._subentry_lists(entry):
            for subentry in subentries:
                terms.extend(self.make_terms(subentry))
        return terms

    def __full_glossary(self, entry):
        """Return the stylesheet link, glossary and subentry link lists.

        The parts are joined with newlines; empty subentry lists produce
        no link list.
        """
        stylesheet = f"<link rel='stylesheet' href='{self._target.value}.css' type='text/css'>"
        parts = [stylesheet, self._glossary(entry)]
        for subentries, list_title in self._link_glossary_parameters(entry):
            if len(subentries) == 0:
                continue
            items = []
            for subentry in subentries:
                exp = subentry.get_first_expression()
                gid = subentry.get_global_identifier()
                items.append(f"<li><a href='entry://{gid}'>{exp}</a></li>")
            parts.append(
                f"<div data-child-links='{list_title}'><span>{list_title}</span><ul>{''.join(items)}</ul></div>")
        return "\n".join(parts)

    @abstractmethod
    def _glossary(self, entry):
        """Return the glossary markup for *entry*."""

    @abstractmethod
    def _link_glossary_parameters(self, entry):
        """Return ``(subentries, list title)`` pairs for the link lists."""

    @abstractmethod
    def _subentry_lists(self, entry):
        """Return the subentry lists for which terms are generated."""

View file

@ -30,7 +30,7 @@ def __apply_name_conversion_procedures(soup, procedures):
"has_previous_sibling": __has_previous_sibling, "has_previous_sibling": __has_previous_sibling,
"replace": __replace, "replace": __replace,
"wrap": __wrap, "wrap": __wrap,
"add_ruby_text": __add_ruby_text, "insert_span": __insert_span,
} }
for procedure in procedures: for procedure in procedures:
function = functions[procedure["procedure_name"]] function = functions[procedure["procedure_name"]]
@ -92,10 +92,9 @@ def __wrap(soup, l_wrap, r_wrap):
soup.string = f"{l_wrap}{soup.text}{r_wrap}" soup.string = f"{l_wrap}{soup.text}{r_wrap}"
def __add_ruby_text(soup, mark, style): def __insert_span(soup, attr_name, attr_val):
if style.strip() != "": span_markup = f"<span {attr_name}='{attr_val}'></span>"
markup = f"<rt><span style='{style}'>{mark}</span></rt>" span_soup = BeautifulSoup(span_markup, "xml")
else: for content in reversed(soup.contents):
markup = f"<rt>{mark}</rt>" span_soup.span.insert(0, content.extract())
rt_soup = BeautifulSoup(markup, "xml") soup.append(span_soup.span)
soup.append(rt_soup.rt)

View file

@ -1,15 +1,18 @@
# pylint: disable=too-few-public-methods
import json import json
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from abc import ABC, abstractmethod
from platformdirs import user_documents_dir, user_cache_dir from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator from bot.yomichan.terms.factory import new_terminator
class Exporter: class Exporter(ABC):
def __init__(self, target): def __init__(self, target):
self._target = target self._target = target
self._terminator = new_terminator(target) self._terminator = new_terminator(target)
@ -26,6 +29,14 @@ class Exporter:
terms = self.__get_terms(entries) terms = self.__get_terms(entries)
self.__make_dictionary(terms, index, tags) self.__make_dictionary(terms, index, tags)
@abstractmethod
def _get_revision(self, entries):
pass
@abstractmethod
def _get_attribution(self, entries):
pass
def _get_build_dir(self): def _get_build_dir(self):
if self._build_dir is not None: if self._build_dir is not None:
return self._build_dir return self._build_dir
@ -41,7 +52,7 @@ class Exporter:
build_dir = self._get_build_dir() build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value) build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None: if image_dir is not None:
print("Copying image files to build directory...") print("Copying media files to build directory...")
shutil.copytree(image_dir, build_img_dir) shutil.copytree(image_dir, build_img_dir)
else: else:
os.makedirs(build_img_dir) os.makedirs(build_img_dir)
@ -93,7 +104,7 @@ class Exporter:
def __write_archive(self, filename): def __write_archive(self, filename):
archive_format = "zip" archive_format = "zip"
out_dir = os.path.join(user_documents_dir(), "jitenbot") out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir(): if not Path(out_dir).is_dir():
os.makedirs(out_dir) os.makedirs(out_dir)
out_file = f"{filename}.{archive_format}" out_file = f"{filename}.{archive_format}"
@ -110,10 +121,7 @@ class Exporter:
shutil.rmtree(build_dir) shutil.rmtree(build_dir)
class JitenonExporter(Exporter): class _JitenonExporter(Exporter):
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries): def _get_revision(self, entries):
modified_date = None modified_date = None
for entry in entries: for entry in entries:
@ -130,25 +138,19 @@ class JitenonExporter(Exporter):
return attribution return attribution
class JitenonKokugoExporter(JitenonExporter): class JitenonKokugoExporter(_JitenonExporter):
def __init__(self, target): pass
super().__init__(target)
class JitenonYojiExporter(JitenonExporter): class JitenonYojiExporter(_JitenonExporter):
def __init__(self, target): pass
super().__init__(target)
class JitenonKotowazaExporter(JitenonExporter): class JitenonKotowazaExporter(_JitenonExporter):
def __init__(self, target): pass
super().__init__(target)
class Smk8Exporter(Exporter): class Smk8Exporter(Exporter):
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries): def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d") timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}" return f"{self._target.value};{timestamp}"
@ -158,9 +160,6 @@ class Smk8Exporter(Exporter):
class Daijirin2Exporter(Exporter): class Daijirin2Exporter(Exporter):
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries): def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d") timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}" return f"{self._target.value};{timestamp}"

View file

@ -7,7 +7,7 @@ from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter from bot.yomichan.exporters.export import Daijirin2Exporter
def new_exporter(target): def new_yomi_exporter(target):
exporter_map = { exporter_map = {
Targets.JITENON_KOKUGO: JitenonKokugoExporter, Targets.JITENON_KOKUGO: JitenonKokugoExporter,
Targets.JITENON_YOJI: JitenonYojiExporter, Targets.JITENON_YOJI: JitenonYojiExporter,

View file

@ -6,9 +6,9 @@ from pathlib import Path
import bot.icons as Icons import bot.icons as Icons
from bot.soup import delete_soup_nodes from bot.soup import delete_soup_nodes
from bot.data import load_daijirin2_yomichan_name_conversion from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names from bot.name_conversion import convert_names
def make_glossary(entry, image_dir): def make_glossary(entry, image_dir):
@ -26,7 +26,7 @@ def make_glossary(entry, image_dir):
__convert_daigoginum(soup, image_dir) __convert_daigoginum(soup, image_dir)
__convert_jundaigoginum(soup, image_dir) __convert_jundaigoginum(soup, image_dir)
name_conversion = load_daijirin2_yomichan_name_conversion() name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion) convert_names(soup, name_conversion)
gloss = make_gloss(soup.span) gloss = make_gloss(soup.span)

View file

@ -58,9 +58,9 @@ class JitenonGlossary():
if self._do_display_yomikata_in_headword(entry): if self._do_display_yomikata_in_headword(entry):
tr.decompose() tr.decompose()
elif tr.th.text == "意味": elif tr.th.text == "意味":
imi = tr.td definition = tr.td
imi.name = "div" definition.name = "div"
soup.body.insert(0, imi) soup.body.insert(0, definition)
tr.decompose() tr.decompose()
if soup.find("tr") is None: if soup.find("tr") is None:
soup.table.decompose() soup.table.decompose()

View file

@ -4,9 +4,9 @@ from bs4 import BeautifulSoup
import bot.icons as Icons import bot.icons as Icons
from bot.soup import delete_soup_nodes from bot.soup import delete_soup_nodes
from bot.data import load_smk8_yomichan_name_conversion from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names from bot.name_conversion import convert_names
def make_glossary(entry, image_dir): def make_glossary(entry, image_dir):
@ -20,7 +20,7 @@ def make_glossary(entry, image_dir):
__convert_gaiji(soup, image_dir) __convert_gaiji(soup, image_dir)
__convert_rectangles(soup, image_dir) __convert_rectangles(soup, image_dir)
name_conversion = load_smk8_yomichan_name_conversion() name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion) convert_names(soup, name_conversion)
gloss = make_gloss(soup.span) gloss = make_gloss(soup.span)

View file

@ -9,6 +9,7 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator): class JitenonTerminator(Terminator):
def __init__(self, target): def __init__(self, target):
super().__init__(target) super().__init__(target)
self._glossary_maker = None
def _definition_tags(self, entry): def _definition_tags(self, entry):
return None return None
@ -51,7 +52,7 @@ class JitenonYojiTerminator(JitenonTerminator):
return "" return ""
def _term_tags(self, entry): def _term_tags(self, entry):
tags = entry.kankenkyuu.split("/") tags = entry.kanken_level.split("/")
return " ".join(tags) return " ".join(tags)

View file

@ -1,7 +1,8 @@
from abc import abstractmethod, ABC
from bot.data import load_yomichan_inflection_categories from bot.data import load_yomichan_inflection_categories
class Terminator: class Terminator(ABC):
def __init__(self, target): def __init__(self, target):
self._target = target self._target = target
self._glossary_cache = {} self._glossary_cache = {}
@ -62,3 +63,31 @@ class Terminator:
} }
glossary.append(gloss) glossary.append(gloss)
return glossary return glossary
@abstractmethod
def _definition_tags(self, entry):
pass
@abstractmethod
def _inflection_rules(self, entry, expression):
pass
@abstractmethod
def _glossary(self, entry):
pass
@abstractmethod
def _sequence(self, entry):
pass
@abstractmethod
def _term_tags(self, entry):
pass
@abstractmethod
def _link_glossary_parameters(self, entry):
pass
@abstractmethod
def _subentry_lists(self, entry):
pass

View file

@ -0,0 +1,12 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"ruby": {},
"rt": {},
"p": {},
"漢字音G": {"name": "ul"},
"漢字音": {"name": "li"}
}

View file

@ -0,0 +1,414 @@
body {
margin: 1em 44px 1em 1em;
line-height: 1.5em;
font-family: serif;
font-size: 1.2em;
color: black;
}
body.ABC {
margin: 0.5em 0.5em 2em 0.5em;
}
a {
text-decoration: none;
}
img.gaiji {
height: 1em;
}
img.cut {
max-height: 100px;
max-width: 600px;
}
p {
margin: 0.5em 0
}
span[data-name="i"] {
font-style: italic;
}
span[data-name="h1"] {
font-family: sans-serif;
font-size: 1em;
font-weight: bold;
}
span[data-name="image"] {
display: block;
}
span[data-name="ref"] a {
text-decoration: none;
}
/* NOTE(review): "accent" is not a value defined for text-decoration, so
   a conforming CSS parser drops this declaration and "sl" spans render
   with no decoration. The intended effect is not evident from this file
   (possibly an underline or an emphasis mark) - confirm and replace. */
span[data-name="sl"] {
text-decoration: accent;
}
span[data-name="sm"] {
font-size: 0.7em;
}
span[data-name="small"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="sub"] {
font-size: 0.7em;
vertical-align: -0.35em;
}
span[data-name="ty2"] span[data-name="sub"] {
vertical-align: 0em;
}
span[data-name="ty2"] span[data-name="sup"] {
vertical-align: 0.5em;
}
span[data-name="文語形"] {
display: block;
}
span[data-name="用例"] {
display: block;
}
span[data-name="補説G"] {
display: block;
}
span[data-name="語義Gnum"] + span[data-name="補説G"] {
display: inline;
}
span[data-name="アクセントG"] + span[data-name="補説G"] {
display: inline;
}
span[data-name="補説G"] + span[data-name="語釈"] {
display: block;
}
span[data-name="アクセントG"] {
font-size: 0.7em;
vertical-align: super;
margin-left: 0.25em;
margin-right: 0.25em;
}
span[data-name="カット"] {
display: block;
}
span[data-name="カットG"] {
display: block;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 1em;
}
span[data-name="キャプション"] {
display: block;
}
span[data-name="ルビG"] {
font-family: sans-serif;
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
}
.warichu span[data-name="ルビG"] {
font-family: serif;
font-size: 0.5em;
font-weight: normal;
vertical-align: 0em;
}
span[data-name="中語義"] {
display: block;
}
span[data-name="付記"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="副義"] {
display: block;
margin-left: 1em;
}
span[data-name="単位名"] {
font-size: 0.5em;
}
span[data-name="原籍"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="句仮名"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="句項目"] {
margin-top: 0.5em;
margin-left: 1em;
display: block;
}
span[data-name="和字"] {
font-family: sans-serif;
}
span[data-name="品詞行"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="品詞用法"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="大語義"] {
display: block;
}
span[data-name="大語義num"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
color: white;
background-color: black;
}
span[data-name="子項目"] {
display: block;
margin-top: 0.5em;
margin-left: 1em;
}
span[data-name="慣用G"] {
display: block;
margin-top: 0.5em;
}
span[data-name="欧字"] {
font-family: sans-serif;
}
span[data-name="歴史仮名"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="派生G"] {
display: block;
margin-top: 0.5em;
}
span[data-name="準大語義"] {
display: block;
}
span[data-name="準大語義num"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
border: solid 1px black;
}
span[data-name="漢字音logo"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
border: solid 0.5px black;
border-radius: 1em;
}
span[data-name="漢字音G"] {
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
}
span[data-name="生没年"] {
margin-left: 0.25em;
margin-right: 0.25em;
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="生没年"]:first-child {
margin-left: 0;
}
span[data-name="用法"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="異字同訓"] {
display: block;
margin-top: 0.5em;
}
span[data-name="異字同訓仮名"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="異字同訓漢字"] {
font-family: serif;
font-weight: normal;
}
span[data-name="異字同訓表記"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="異字同訓解説"] {
display: block;
}
span[data-name="異字同訓語義G"] {
display: block;
}
span[data-name="細義"] {
display: block;
}
span[data-name="表外字マーク"] {
font-size: 0.5em;
vertical-align: 0.5em;
}
span[data-name="見出仮名"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="見出相当部"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="見出部"] {
display: block;
}
span[data-name="解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="語義G"] {
display: block;
}
span[data-name="語義区切"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="返り点"] {
font-size: 0.5em;
font-weight: normal;
vertical-align: 1em;
}
span[data-name="返り点"].熟語記号 {
vertical-align: 0em;
}
span[data-name="項目"] {
display: block;
}
span[data-name="logo"] {
margin: 0.025em 0.25em;
padding: 0.1em;
font-size: 0.8em;
border: solid 1px black;
border-radius: 0.2em;
}
.gothic {
font-family: sans-serif;
font-weight: bold;
}
.warichu {
font-size: 1em;
}
.refnum {
font-size: 0.7em;
vertical-align: 0.35em;
}
#index {
display: none;
}
span[data-name="歴史仮名"]:before,
span[data-name="ルビG"]:before,
span[data-name="品詞行"]:before,
span[data-name="原籍"]:before,
span[data-name="品詞用法"]:before,
span[data-name="付記"]:before {
content: "(";
}
span[data-name="歴史仮名"]:after,
span[data-name="ルビG"]:after,
span[data-name="品詞行"]:after,
span[data-name="原籍"]:after,
span[data-name="品詞用法"]:after,
span[data-name="付記"]:after {
content: ")";
}
div[data-child-links] {
padding-top: 1em;
}
div[data-child-links] ul {
margin: 0;
padding-left: 2em;
}
/* White-on-colour labels inside the child-link list. The background colour
   is supplied per link type by the data-child-links="..." rules. */
div[data-child-links] span {
    padding: 0.1em;
    font-family: sans-serif;
    font-size: 0.8em;
    color: white;
    border-width: 0.05em;
    border-style: none;
    border-color: black;
    word-break: keep-all;
    /* Prefixed form kept for old WebKit builds; the standard property is
       added so renderers that ignore the prefix still round the corners. */
    -webkit-border-radius: 0.2em;
    border-radius: 0.2em;
}
div[data-child-links="子項目"] span {
background-color: rgb(153, 42, 103);
}
div[data-child-links="句項目"] span {
background-color: rgb(176, 127, 57);
}

View file

@ -0,0 +1,56 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
td ul {
margin: -0.1em 0em -0.1em -1em;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}
/* Small white-on-gray inline label. */
.num_icon {
    font-family: sans-serif;
    padding-left: 0.25em;
    margin-right: 0.5em;
    font-size: 0.8em;
    word-break: keep-all;
    color: white;
    background-color: gray;
    border-style: none;
    /* Prefixed form kept for old WebKit builds; the standard property is
       added so renderers that ignore the prefix still round the corners. */
    -webkit-border-radius: 0.1em;
    border-radius: 0.1em;
}

View file

@ -0,0 +1,40 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}

View file

@ -0,0 +1,40 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}

449
data/mdict/css/smk8.css Normal file
View file

@ -0,0 +1,449 @@
body {
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-family: serif;
font-size: 1.2em;
color: black;
}
span[data-name="項目"] {
display: block;
}
span[data-name="見出部"] {
display: block;
}
span[data-name="見出部"].pri {
margin-left: -0.4em;
}
span[data-name="見出仮名"] {
font-family: sans-serif;
font-weight: bold;
}
rt[data-name="表音表記"] {
font-size: 0.65em;
}
rt[data-name="表外音訓マーク"] {
font-size: 0.65em;
}
rt[data-name="表外字マーク"] {
font-size: 0.65em;
}
span[data-name="解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="大語義"] {
display: block;
}
span[data-name="語義"] {
display: block;
}
span[data-name="副義"] {
display: block;
}
span[data-name="用例G"] {
display: block;
}
span[data-name="注記"] span[data-name="用例G"] {
display: inline;
}
span[data-name="用例"] {
display: block;
}
span[data-name="注記"] span[data-name="用例"] {
display: inline;
}
span[data-name="見出語省略"] {
margin-left: 0.125em;
margin-right: 0.125em;
}
span[data-name="教育漢字"] {
color: green;
}
span[data-name="ルビ"] {
font-size: 0.7em;
vertical-align: 0.5em;
}
span[data-name="ルビ区切"] {
font-size: 0.7em;
vertical-align: 0.65em;
}
span[data-name="名詞形G"] {
display: block;
}
span[data-name="可能形G"] {
display: block;
}
span[data-name="参照G"] {
display: block;
}
span[data-name="参照"] {
color: blue;
}
span[data-name="子項目"],
span[data-name="句項目"] {
display: block;
margin-bottom: 0.5em;
}
span[data-name="子項目F"],
span[data-name="句項目F"] {
display: block;
margin-bottom: 0.5em;
margin-top: 0.5em;
}
span[data-name="子見出部"] {
display: block;
}
span[data-name="子解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="句見出部"] {
display: block;
}
span[data-name="句解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="運用解説"] {
display: block;
}
span[data-name="表記解説"] {
display: block;
}
span[data-name="文法解説"] {
display: block;
}
span[data-name="かぞえ方解説"] {
display: block;
}
span[data-name="派生"] {
display: block;
margin-left: 1.25em;
}
span[data-name="派生SubGF"] {
display: block;
text-indent: -1.25em;
}
span[data-name="派生SubG"] {
display: block;
}
span[data-name="派生SubGF"] span[data-name="用例G"] {
text-indent: 0;
}
span[data-name="派生見出"] {
font-weight: bold;
}
span[data-name="派生見出"].normal {
font-weight: normal
}
span[data-name="造語成分項目"] {
display: block;
margin-top: 1em;
}
span[data-name="造語成分見出"] {
font-size:1.4em;
}
span[data-name="EM"] {
font-weight: bold;
}
span[data-name="アクセント"] {
font-size: 0.7em;
vertical-align: super;
}
span[data-name="アクセント組M"] {
vertical-align: 0.1em;
}
span[data-name="反意語M"],
span[data-name="同意語M"] {
vertical-align: 0.15em;
}
span[data-name="B"] {
font-weight: bold;
}
span[data-name="IT"] {
font-family: "Times New Roman";
font-style: italic;
}
span[data-name="EXCLAMATION"] {
font-family: "Times New Roman";
font-style: italic;
font-size: 1.2em;
}
/* Small, raised, normal-weight serif text for the 歴史仮名 span. */
span[data-name="歴史仮名"] {
font-family: serif;
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
/* NOTE(review): "nocopy" is not a standard user-select keyword (the
   defined ones are auto, text, none, contain and all), so renderers that
   do not recognise it ignore this line. If the goal is to keep this text
   out of selections, "none" is presumably what is wanted - confirm. */
-webkit-user-select: nocopy;
}
span[data-name="出現形"] {
font-weight: bold;
}
span[data-name="品詞用法"] {
font-size: 0.7em;
}
span[data-name="品詞用法"] span[data-name="品詞G"] {
font-size: 1.2em;
}
span[data-name="基本構文型"] {
font-size: 0.8em;
}
span[data-name="基本構文em"] {
font-weight: bold;
}
span[data-name="ウ濁音参照"] {
font-family: sans-serif;
font-weight: bold;
}
/* Bordered inline label; the .fill, .red and .redfill variants override
   its colours and border. */
span[data-name="rect"] {
    padding: 0.1em;
    font-family: sans-serif;
    font-size: 0.8em;
    border-width: 0.05em;
    border-style: solid;
    border-color: black;
    word-break: keep-all;
    /* Prefixed form kept for old WebKit builds; the standard property is
       added so renderers that ignore the prefix still round the corners. */
    -webkit-border-radius: 0.1em;
    border-radius: 0.1em;
}
span[data-name="rect"].fill {
color: white;
border-style: none;
background-color: gray;
}
span[data-name="rect"].red {
color: red;
border-color: red;
}
span[data-name="rect"].redfill {
color: white;
border-style: none;
background-color: red;
}
span[data-name="red"] {
color: red;
}
span[data-name="大語義番号"],
span[data-name="語義番号"],
span[data-name="副義番号"] {
margin-right: 0.25em;
font-family: sans-serif;
}
span[data-name="ref"] span[data-name="大語義番号"],
span[data-name="ref"] span[data-name="語義番号"],
span[data-name="ref"] span[data-name="副義番号"] {
font-size: 0.8em;
margin-right: 0;
}
span[data-name="表外字マーク"] {
vertical-align: 0.5em;
}
span[data-name="表外音訓マーク"] {
font-size: 0.5em;
vertical-align: 0.5em;
}
span[data-name="言換M"] {
font-size: 0.5em;
}
span[data-name="字音語参照項目"] {
display: block;
}
span[data-name="本文項目M"] {
font-size: 0.7em;
}
span[data-name="運用解説M"],
span[data-name="表記解説M"],
span[data-name="文法解説M"],
span[data-name="かぞえ方解説M"],
span[data-name="派生M"] {
margin-right: 0.25em;
font-family: sans-serif;
}
span[data-name="派生ロゴ"] {
margin-left: 0.1em;
margin-right: 0.1em;
}
span[data-name="文字"] {
margin: 0 0.2em;
}
span[data-name="二分"] {
font-size: 0.5em;
}
span[data-name="四分"] {
font-size: 0.25em;
}
span[data-name="ref"] {
margin-left: 0.1em;
margin-right: 0.1em;
}
span[data-name="ref-small"] {
font-size: 0.7em;
}
span[data-name="sup"] {
font-size: 0.6em;
}
span[data-name="外字"] img {
height: 1em;
}
img.audio {
height: 1em;
margin: 0 0.25em;
}
img.外字 {
height: 1em;
}
img.外字欧 {
height: 1em;
}
span[data-name="レ点M"] {
font-size: 0.6em;
vertical-align: -0.7em;
}
a {
text-decoration: none;
}
span[data-name="audio"] a {
padding-bottom: 0;
border-bottom: none;
}
span[data-name="アクセント"] a,
span[data-name="古語M"] a,
span[data-name="雅語M"] a,
span[data-name="派生M"] a,
span[data-name="原籍M"] a,
span[data-name="品詞M"] a {
color: black;
border-bottom-style: none;
}
span[data-name="歴史仮名"]:before,
span[data-name="ルビ"]:before {
content: "(";
}
span[data-name="歴史仮名"]:after,
span[data-name="ルビ"]:after {
content: ")";
}
div[data-child-links] {
padding-top: 1em;
}
div[data-child-links] ul {
margin: 0;
padding-left: 2em;
}
/* White-on-colour labels inside the child-link list. The background colour
   is supplied per link type by the data-child-links="..." rules. */
div[data-child-links] span {
    padding: 0.1em;
    font-family: sans-serif;
    font-size: 0.8em;
    color: white;
    border-width: 0.05em;
    border-style: none;
    border-color: black;
    word-break: keep-all;
    /* Prefixed form kept for old WebKit builds; the standard property is
       added so renderers that ignore the prefix still round the corners. */
    -webkit-border-radius: 0.2em;
    border-radius: 0.2em;
}
div[data-child-links="子項目"] span {
background-color: rgb(153, 42, 103);
}
div[data-child-links="句項目"] span {
background-color: rgb(176, 127, 57);
}
span.pri > span.外字 {
font-size: 0.65em;
vertical-align: super;
}

View file

@ -0,0 +1,7 @@
大辞林 第四版
<br><br>
https://www.monokakido.jp/ja/dictionaries/daijirin2/index.html
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
国語辞典オンライン
<br><br>
https://kokugo.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
故事・ことわざ・慣用句オンライン
<br><br>
https://kotowaza.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
四字熟語辞典オンライン
<br><br>
https://yoji.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
新明解国語辞典 第八版
<br><br>
https://www.monokakido.jp/ja/dictionaries/smk8/index.html
<br><br>
{{revision}}
<br><br>
{{attribution}}

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

View file

@ -0,0 +1 @@
大辞林 第四版

View file

@ -0,0 +1 @@
国語辞典オンライン

View file

@ -0,0 +1 @@
故事・ことわざ・慣用句オンライン

View file

@ -0,0 +1 @@
四字熟語辞典オンライン

View file

@ -0,0 +1 @@
新明解国語辞典 第八版

View file

@ -0,0 +1,25 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"表外字": {
"name": "ruby"
},
"表外字マーク": {
"name": "rt"
},
"表外音訓": {
"name": "ruby"
},
"表外音訓マーク": {
"name": "rt"
},
"表音式": {
"name": "ruby"
},
"表音表記": {
"name": "rt"
}
}

View file

@ -121,25 +121,31 @@
"style": "font-weight: bold;" "style": "font-weight: bold;"
}, },
"表外字": { "表外字": {
"name": "ruby", "name": "ruby"
},
"表外字マーク": {
"name": "rt",
"procedures": [ "procedures": [
{ {
"procedure_name": "add_ruby_text", "procedure_name": "insert_span",
"parameters": { "parameters": {
"mark": "︿", "attr_name": "style",
"style": "font-size: 2em;" "attr_val": "font-size: 2em;"
} }
} }
] ]
}, },
"表外音訓": { "表外音訓": {
"name": "ruby", "name": "ruby"
},
"表外音訓マーク": {
"name": "rt",
"procedures": [ "procedures": [
{ {
"procedure_name": "add_ruby_text", "procedure_name": "insert_span",
"parameters": { "parameters": {
"mark": "︽", "attr_name": "style",
"style": "font-size: 2em;" "attr_val": "font-size: 2em;"
} }
} }
] ]
@ -148,23 +154,7 @@
"name": "ruby" "name": "ruby"
}, },
"表音表記": { "表音表記": {
"name": "rt", "name": "rt"
"procedures": [
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
},
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
}
]
}, },
"派生見出": { "派生見出": {
"name": "span", "name": "span",

View file

@ -17,11 +17,22 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
""" """
import os import os
import sys
import argparse import argparse
import subprocess
from bot.targets import Targets from bot.targets import Targets
from bot.crawlers.factory import new_crawler from bot.crawlers.factory import new_crawler
def filename(f):
    """Argparse ``type`` callback accepting only existing, readable files.

    Returns ``f`` unchanged when it names a regular file that the current
    process may read. Raises ``argparse.ArgumentTypeError`` when the path
    is not a file, or when the file exists but cannot be read.
    """
    if os.path.isfile(f):
        if os.access(f, os.R_OK):
            return f
        raise argparse.ArgumentTypeError(f"Cannot access file `{f}`")
    raise argparse.ArgumentTypeError(f"`{f}` is not a valid filename")
def directory(d): def directory(d):
if not os.path.isdir(d): if not os.path.isdir(d):
raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory") raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
@ -35,34 +46,71 @@ def parse_args(target_names):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="jitenbot", prog="jitenbot",
description="Convert Japanese dictionary files to new formats.", description="Convert Japanese dictionary files to new formats.",
epilog="See README.md for details regarding media directory structures",
) )
parser.add_argument( parser.add_argument(
"target", "target",
choices=target_names, choices=target_names,
help="name of dictionary to convert" help="name of dictionary to convert",
) )
parser.add_argument( parser.add_argument(
"-p", "--page-dir", "-p", "--page-dir",
help="path to directory containing XML page files", help="path to directory containing XML page files",
type=directory type=directory,
) )
parser.add_argument( parser.add_argument(
"-i", "--image-dir", "-m", "--media-dir",
help="path to directory containing image folders (gaiji, graphics, etc.)", help="path to directory containing media folders (gaiji, graphics, audio, etc.)",
type=directory type=directory,
)
parser.add_argument(
"-i", "--mdict-icon",
help="path to icon file to be used with MDict",
type=filename,
)
parser.add_argument(
"--no-yomichan-export",
help="skip export of dictionary data to Yomichan format",
action='store_true',
)
parser.add_argument(
"--no-mdict-export",
help="skip export of dictionary data to MDict format",
action='store_true',
) )
args = parser.parse_args() args = parser.parse_args()
return args return args
def test_mdict():
    """Check that the external `mdict` pack tool can be executed.

    Runs ``mdict --version`` with its standard output discarded. Returns
    normally when the command succeeds. If the executable is missing, or
    it runs but reports a non-zero exit status, a help message is printed
    and the process exits with status 1 so that calling shells and
    scripts can detect the failure.
    """
    try:
        subprocess.run(
            ["mdict", "--version"],
            check=True,
            stdout=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        print("Could not find `mdict` pack tool.")
    except subprocess.CalledProcessError as err:
        # check=True turns a non-zero exit status into this exception;
        # report it instead of letting a traceback escape.
        print(f"The `mdict` pack tool failed (exit status {err.returncode}).")
    else:
        return
    print("Ensure that mdict-utils is installed and")
    print("included in the environment PATH.\n")
    print("Mdict export functionality may also be")
    print("disabled with the --no-mdict-export flag.")
    # A bare sys.exit() would report success (status 0) to the caller.
    sys.exit(1)
def main(): def main():
target_names = [x.value for x in Targets] target_names = [x.value for x in Targets]
args = parse_args(target_names) args = parse_args(target_names)
if not args.no_mdict_export:
test_mdict()
selected_target = Targets(args.target) selected_target = Targets(args.target)
crawler = new_crawler(selected_target) crawler = new_crawler(selected_target)
crawler.collect_pages(args.page_dir) crawler.collect_pages(args.page_dir)
crawler.read_pages() crawler.read_pages()
crawler.make_yomichan_dictionary(args.image_dir) if not args.no_yomichan_export:
crawler.make_yomichan_dictionary(args.media_dir)
if not args.no_mdict_export:
crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -6,6 +6,7 @@ css-parser==1.0.8
html5lib==1.1 html5lib==1.1
idna==3.4 idna==3.4
lxml==4.9.2 lxml==4.9.2
mdict-utils==1.3.12
Pillow==9.5.0 Pillow==9.5.0
platformdirs==3.5.0 platformdirs==3.5.0
requests==2.29.0 requests==2.29.0
@ -13,5 +14,7 @@ six==1.16.0
soupsieve==2.4.1 soupsieve==2.4.1
SudachiDict-full==20230110 SudachiDict-full==20230110
SudachiPy==0.6.7 SudachiPy==0.6.7
tqdm==4.65.0
urllib3==1.26.15 urllib3==1.26.15
webencodings==0.5.1 webencodings==0.5.1
xxhash==3.2.0

13
run_all.sh Normal file
View file

@ -0,0 +1,13 @@
# Run jitenbot once for every supported dictionary target.
# No --no-*-export flag is passed, so no export format is skipped.
python jitenbot.py jitenon-kokugo
python jitenbot.py jitenon-yoji
python jitenbot.py jitenon-kotowaza
# The smk8 and daijirin2 runs are pointed at local page and media
# directories and an MDict icon, all under the monokakido/ folder.
python jitenbot.py smk8 \
--media-dir monokakido/SMK8/media \
--page-dir monokakido/SMK8/pages \
--mdict-icon monokakido/SMK8/SMK8-76@3x.png
python jitenbot.py daijirin2 \
--media-dir monokakido/DAIJIRIN2/media \
--page-dir monokakido/DAIJIRIN2/pages \
--mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png