diff --git a/.gitignore b/.gitignore
index b009cb5..4c7985d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
webcache/
output/
notes/
+monokakido/
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/README.md b/README.md
index 88d0f2b..5a872ea 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,8 @@ compiling the scraped data into compact dictionary file formats.
# Usage
```
-usage: jitenbot [-h] [-p PAGE_DIR] [-i IMAGE_DIR]
+usage: jitenbot [-h] [-p PAGE_DIR] [-m MEDIA_DIR] [-i MDICT_ICON]
+ [--no-yomichan-export] [--no-mdict-export]
{jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2}
Convert Japanese dictionary files to new formats.
@@ -62,9 +63,15 @@ options:
-h, --help show this help message and exit
-p PAGE_DIR, --page-dir PAGE_DIR
path to directory containing XML page files
- -i IMAGE_DIR, --image-dir IMAGE_DIR
- path to directory containing image folders (gaiji,
- graphics, etc.)
+ -m MEDIA_DIR, --media-dir MEDIA_DIR
+ path to directory containing media folders (gaiji,
+ graphics, audio, etc.)
+ -i MDICT_ICON, --mdict-icon MDICT_ICON
+ path to icon file to be used with MDict
+ --no-yomichan-export skip export of dictionary data to Yomichan format
+ --no-mdict-export skip export of dictionary data to MDict format
+
+See README.md for details regarding media directory structures
```
### Online Targets
Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/).
@@ -75,8 +82,55 @@ HTTP request headers (user agent string, etc.) may be customized by editing the
[user config directory](https://pypi.org/project/platformdirs/).
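+
+For example, an online target such as `jitenon-yoji` can typically be converted
+with no additional flags:
+```
+jitenbot jitenon-yoji
+```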
### Offline Targets
-Page data and image data must be procured by the user
+Page data and media data must be [procured by the user](https://github.com/golddranks/monokakido/)
and passed to jitenbot via the appropriate command line flags.
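+
+For example, an offline target might be converted with a command like the
+following (the directory paths shown here are placeholders):
+```
+jitenbot smk8 --page-dir monokakido/smk8/pages --media-dir monokakido/smk8/media
+```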
+
+<details>
+<summary>smk8 media directory</summary>
+
+Since Yomichan does not support audio files from imported
+dictionaries, the `audio/` directory may be omitted to reduce the
+size of the output ZIP file if desired.
+
+```
+media
+├── Audio.png
+├── audio
+│ ├── 00001.aac
+│ ├── 00002.aac
+│ ├── 00003.aac
+│ │ ...
+│ └── 82682.aac
+└── gaiji
+ ├── 1d110.svg
+ ├── 1d15d.svg
+ ├── 1d15e.svg
+ │ ...
+ └── xbunnoa.svg
+```
+
+</details>
+
+<details>
+<summary>daijirin2 media directory</summary>
+
+The `graphics/` directory may be omitted to save space if desired.
+
+```
+media
+├── gaiji
+│ ├── 1D10B.svg
+│ ├── 1D110.svg
+│ ├── 1D12A.svg
+│ │ ...
+│ └── vectorOB.svg
+└── graphics
+ ├── 3djr_0002.png
+ ├── 3djr_0004.png
+ ├── 3djr_0005.png
+ │ ...
+ └── 4djr_yahazu.png
+```
+</details>
+
# Attribution
`Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1).
diff --git a/TODO.md b/TODO.md
index 30c860d..2f2a5d5 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,11 +1,11 @@
### Todo
+- [x] Add factory classes to reduce the amount of class import statements
+- [x] Support exporting to MDict (.MDX) dictionary format
- [ ] Add test suite
- [ ] Add documentation (docstrings, etc.)
- [ ] Validate JSON schema of Yomichan terms during export
-- [ ] Add factory classes to reduce the amount of class import statements
- [ ] Add build scripts for producing program binaries
-- [ ] Support exporting to MDict (.MDX) dictionary format
- [ ] Validate scraped webpages after downloading
- [ ] Log non-fatal failures to a log file instead of raising exceptions
- [ ] Support more dictionary websites
diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py
index c7bf8ea..97b3794 100644
--- a/bot/crawlers/crawlers.py
+++ b/bot/crawlers/crawlers.py
@@ -5,7 +5,8 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.factory import new_entry
-from bot.yomichan.exporters.factory import new_exporter
+from bot.yomichan.exporters.factory import new_yomi_exporter
+from bot.mdict.exporters.factory import new_mdict_exporter
class Crawler(ABC):
@@ -38,9 +39,13 @@ class Crawler(ABC):
self._entries.append(entry)
print()
- def make_yomichan_dictionary(self, image_dir):
- exporter = new_exporter(self._target)
- exporter.export(self._entries, image_dir)
+ def make_yomichan_dictionary(self, media_dir):
+ exporter = new_yomi_exporter(self._target)
+ exporter.export(self._entries, media_dir)
+
+ def make_mdict_dictionary(self, media_dir, icon_file):
+ exporter = new_mdict_exporter(self._target)
+ exporter.export(self._entries, media_dir, icon_file)
def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link)
@@ -142,10 +147,8 @@ class _MonokakidoCrawler(Crawler):
class Smk8Crawler(_MonokakidoCrawler):
- def __init__(self, target):
- super().__init__(target)
+ pass
class Daijirin2Crawler(_MonokakidoCrawler):
- def __init__(self, target):
- super().__init__(target)
+ pass
diff --git a/bot/data.py b/bot/data.py
index 5d68769..3b1effd 100644
--- a/bot/data.py
+++ b/bot/data.py
@@ -99,15 +99,15 @@ def load_daijirin2_kana_abbreviations():
@cache
-def load_smk8_yomichan_name_conversion():
- file_name = os.path.join("smk8", "yomichan_name_conversion.json")
+def load_yomichan_name_conversion(target):
+ file_name = os.path.join(target.value, "yomichan_name_conversion.json")
data = __load_json(file_name)
return data
@cache
-def load_daijirin2_yomichan_name_conversion():
- file_name = os.path.join("daijirin2", "yomichan_name_conversion.json")
+def load_mdict_name_conversion(target):
+ file_name = os.path.join(target.value, "mdict_name_conversion.json")
data = __load_json(file_name)
return data
diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py
index 1463442..196bd0c 100644
--- a/bot/entries/daijirin2.py
+++ b/bot/entries/daijirin2.py
@@ -1,4 +1,3 @@
-import re
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
@@ -10,19 +9,17 @@ from bot.entries.daijirin2_preprocess import preprocess_page
class _BaseDaijirin2Entry(Entry):
- ID_TO_ENTRY = {}
- SUBENTRY_ID_TO_ENTRY_ID = {}
-
- def __init__(self, entry_id):
- super().__init__(entry_id)
- if entry_id not in self.ID_TO_ENTRY:
- self.ID_TO_ENTRY[entry_id] = self
- else:
- raise Exception(f"Duplicate entry ID: {entry_id}")
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations()
+ def get_global_identifier(self):
+ parent_part = format(self.entry_id[0], '06')
+ child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
+ return f"@{self.target.value}-{parent_part}-{child_part}"
+
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
@@ -57,14 +54,7 @@ class _BaseDaijirin2Entry(Entry):
else:
self._part_of_speech_tags.append(pos)
- def get_headwords(self):
- if self._headwords is not None:
- return self._headwords
- self._set_headwords()
- self._set_variant_headwords()
- return self._headwords
-
- def _set_regular_headwords(self, soup):
+ def _get_regular_headwords(self, soup):
self._fill_alts(soup)
reading = soup.find("見出仮名").text
expressions = []
@@ -78,10 +68,11 @@ class _BaseDaijirin2Entry(Entry):
expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0:
expressions.append(reading)
- self._headwords = {reading: expressions}
+ headwords = {reading: expressions}
+ return headwords
- def _set_variant_headwords(self):
- for expressions in self._headwords.values():
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
@@ -101,7 +92,7 @@ class _BaseDaijirin2Entry(Entry):
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(subentry_id)
+ subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
@@ -122,6 +113,8 @@ class _BaseDaijirin2Entry(Entry):
@staticmethod
def _delete_unused_nodes(soup):
+        """Remove extra markup elements that appear in the entry
+        headword line but are not part of the headword itself"""
unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG"
@@ -144,25 +137,26 @@ class _BaseDaijirin2Entry(Entry):
class Daijirin2Entry(_BaseDaijirin2Entry):
- def __init__(self, page_id):
+ def __init__(self, target, page_id):
entry_id = (page_id, 0)
- super().__init__(entry_id)
+ super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None:
- self._set_kanji_headwords(soup)
+ headwords = self._get_kanji_headwords(soup)
elif soup.find("略語G") is not None:
- self._set_acronym_headwords(soup)
+ headwords = self._get_acronym_headwords(soup)
else:
- self._set_regular_headwords(soup)
+ headwords = self._get_regular_headwords(soup)
+ return headwords
- def _set_kanji_headwords(self, soup):
+ def _get_kanji_headwords(self, soup):
readings = []
for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text)
@@ -172,11 +166,12 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expressions = []
for el in soup.find_all("漢字見出"):
expressions.append(el.text)
- self._headwords = {}
+ headwords = {}
for reading in readings:
- self._headwords[reading] = expressions
+ headwords[reading] = expressions
+ return headwords
- def _set_acronym_headwords(self, soup):
+ def _get_acronym_headwords(self, soup):
expressions = []
for el in soup.find_all("略語"):
expression_parts = []
@@ -184,29 +179,24 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expression_parts.append(part.text)
expression = "".join(expression_parts)
expressions.append(expression)
- self._headwords = {"": expressions}
+ headwords = {"": expressions}
+ return headwords
class Daijirin2ChildEntry(_BaseDaijirin2Entry):
- def __init__(self, entry_id):
- super().__init__(entry_id)
-
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
- self._set_regular_headwords(soup)
+ headwords = self._get_regular_headwords(soup)
+ return headwords
class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
- def __init__(self, entry_id):
- super().__init__(entry_id)
- self.__phrase_readings = load_daijirin2_phrase_readings()
-
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
@@ -217,7 +207,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
- self._headwords = headwords
+ return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
@@ -231,7 +221,8 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
return expressions
def _find_readings(self):
- text = self.__phrase_readings[self.entry_id]
+ phrase_readings = load_daijirin2_phrase_readings()
+ text = phrase_readings[self.entry_id]
alternatives = Expressions.expand_daijirin_alternatives(text)
readings = []
for alt in alternatives:
diff --git a/bot/entries/entry.py b/bot/entries/entry.py
index 57316f6..3811a77 100644
--- a/bot/entries/entry.py
+++ b/bot/entries/entry.py
@@ -2,12 +2,24 @@ from abc import ABC, abstractmethod
class Entry(ABC):
- def __init__(self, entry_id):
+ ID_TO_ENTRY = {}
+ SUBENTRY_ID_TO_ENTRY_ID = {}
+
+ def __init__(self, target, entry_id):
+ if entry_id not in self.ID_TO_ENTRY:
+ self.ID_TO_ENTRY[entry_id] = self
+ else:
+ raise Exception(f"Duplicate entry ID: {entry_id}")
+ self.target = target
self.entry_id = entry_id
self._page = None
self._headwords = None
self._part_of_speech_tags = None
+ @abstractmethod
+ def get_global_identifier(self):
+ pass
+
@abstractmethod
def set_page(self, page):
pass
@@ -16,14 +28,34 @@ class Entry(ABC):
def get_page_soup(self):
pass
- @abstractmethod
def get_headwords(self):
+ if self._headwords is not None:
+ return self._headwords
+ headwords = self._get_headwords()
+ self._add_variant_expressions(headwords)
+ self._headwords = headwords
+ return headwords
+
+ @abstractmethod
+ def _get_headwords(self):
+ pass
+
+ @abstractmethod
+ def _add_variant_expressions(self, headwords):
pass
@abstractmethod
def get_part_of_speech_tags(self):
pass
+ def get_parent(self):
+ if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:
+ parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
+ parent = self.ID_TO_ENTRY[parent_id]
+ else:
+ parent = None
+ return parent
+
def get_first_expression(self):
headwords = self.get_headwords()
expressions = next(iter(headwords.values()))
diff --git a/bot/entries/factory.py b/bot/entries/factory.py
index 23ca066..a3dec69 100644
--- a/bot/entries/factory.py
+++ b/bot/entries/factory.py
@@ -15,4 +15,4 @@ def new_entry(target, page_id):
Targets.SMK8: Smk8Entry,
Targets.DAIJIRIN2: Daijirin2Entry,
}
- return entry_map[target](page_id)
+ return entry_map[target](target, page_id)
diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py
index fd9fcd2..65c4d2e 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@@ -1,4 +1,5 @@
import re
+from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
@@ -7,18 +8,17 @@ import bot.entries.expressions as Expressions
class _JitenonEntry(Entry):
- ID_TO_ENTRY = {}
-
- def __init__(self, entry_id):
- super().__init__(entry_id)
- if entry_id not in self.ID_TO_ENTRY:
- self.ID_TO_ENTRY[entry_id] = self
- else:
- raise Exception(f"Duplicate entry ID: {entry_id}")
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.expression = ""
+ self.yomikata = ""
+ self.definition = ""
+ self.other_forms = []
self.modified_date = date(1970, 1, 1)
self.attribution = ""
- for column in self._COLUMNS.values():
- setattr(self, column[0], column[1])
+
+ def get_global_identifier(self):
+ return f"@{self.target.value}-{format(self.entry_id, '06')}"
def set_page(self, page):
soup = BeautifulSoup(page, features="html5lib")
@@ -39,36 +39,33 @@ class _JitenonEntry(Entry):
soup = BeautifulSoup(self._page, "html5lib")
return soup
- def get_headwords(self):
- if self._headwords is not None:
- return self._headwords
- self._set_headwords()
- self._set_variant_headwords()
- return self._headwords
-
def get_part_of_speech_tags(self):
# Jitenon doesn't have any
return []
- def _set_headwords(self):
+ def _get_headwords(self):
headwords = {}
- for yomikata in self._yomikatas():
- headwords[yomikata] = [self.expression]
- ikei_headwords = self._ikei_headwords()
- for reading, expressions in ikei_headwords.items():
+ for reading in self._get_readings():
+ headwords[reading] = [self.expression]
+ other_form_headwords = self._other_form_headwords()
+ for reading, expressions in other_form_headwords.items():
if reading not in headwords:
headwords[reading] = []
for expression in expressions:
if expression not in headwords[reading]:
headwords[reading].append(expression)
- self._headwords = headwords
+ return headwords
+
+ @abstractmethod
+ def _get_column_map(self):
+ pass
def __set_modified_date(self, page):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
if m is None:
return
- date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
- self.modified_date = date
+ modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
+ self.modified_date = modified_date
def __set_attribution(self, soup):
attribution = soup.find(class_="copyright")
@@ -78,7 +75,8 @@ class _JitenonEntry(Entry):
self.attribution = ""
def __set_column(self, colname, colval):
- attr_name = self._COLUMNS[colname][0]
+ column_map = self._get_column_map()
+ attr_name = column_map[colname]
attr_value = getattr(self, attr_name)
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
@@ -88,7 +86,7 @@ class _JitenonEntry(Entry):
else:
attr_value.append(colval)
- def _yomikatas(self):
+ def _get_readings(self):
yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m:
@@ -109,20 +107,20 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""]
- def _ikei_headwords(self):
- ikei_headwords = {}
- for val in self.ikei:
+ def _other_form_headwords(self):
+ other_form_headwords = {}
+ for val in self.other_forms:
            m = re.search(r"^([^（]+)（([ぁ-ヿ、]+)）$", val)
if not m:
print(f"Invalid 異形 format: {val}\n{self}\n")
continue
expression = m.group(1)
reading = m.group(2)
- if reading not in ikei_headwords:
- ikei_headwords[reading] = []
- if expression not in ikei_headwords[reading]:
- ikei_headwords[reading].append(expression)
- return ikei_headwords
+ if reading not in other_form_headwords:
+ other_form_headwords[reading] = []
+ if expression not in other_form_headwords[reading]:
+ other_form_headwords[reading].append(expression)
+ return other_form_headwords
@staticmethod
def __clean_text(text):
@@ -133,9 +131,10 @@ class _JitenonEntry(Entry):
return text
def __str__(self):
+ column_map = self._get_column_map()
colvals = [str(self.entry_id)]
- for attr in self._COLUMNS.values():
- attr_val = getattr(self, attr[0])
+ for attr_name in column_map.values():
+ attr_val = getattr(self, attr_name)
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
@@ -144,83 +143,100 @@ class _JitenonEntry(Entry):
class JitenonYojiEntry(_JitenonEntry):
- _COLUMNS = {
- "四字熟語": ["expression", ""],
- "読み方": ["yomikata", ""],
- "意味": ["imi", ""],
- "出典": ["shutten", ""],
- "漢検級": ["kankenkyuu", ""],
- "場面用途": ["bamenyouto", ""],
- "異形": ["ikei", []],
- "類義語": ["ruigigo", []],
- }
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.kanken_level = ""
+ self.category = ""
+ self.related_expressions = []
- def __init__(self, entry_id):
- super().__init__(entry_id)
+ def _get_column_map(self):
+ return {
+ "四字熟語": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "漢検級": "kanken_level",
+ "場面用途": "category",
+ "類義語": "related_expressions",
+ }
- def _set_variant_headwords(self):
- for expressions in self._headwords.values():
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry):
- _COLUMNS = {
- "言葉": ["expression", ""],
- "読み方": ["yomikata", ""],
- "意味": ["imi", ""],
- "出典": ["shutten", ""],
- "例文": ["reibun", ""],
- "異形": ["ikei", []],
- "類句": ["ruiku", []],
- }
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.example = ""
+ self.related_expressions = []
- def __init__(self, entry_id):
- super().__init__(entry_id)
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "例文": "example",
+ "類句": "related_expressions",
+ }
- def _set_headwords(self):
+ def _get_headwords(self):
if self.expression == "金棒引き・鉄棒引き":
- self._headwords = {
+ headwords = {
"かなぼうひき": ["金棒引き", "鉄棒引き"]
}
else:
- super()._set_headwords()
+ headwords = super()._get_headwords()
+ return headwords
- def _set_variant_headwords(self):
- for expressions in self._headwords.values():
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry):
- _COLUMNS = {
- "言葉": ["expression", ""],
- "読み方": ["yomikata", ""],
- "意味": ["imi", ""],
- "例文": ["reibun", ""],
- "別表記": ["betsuhyouki", ""],
- "対義語": ["taigigo", ""],
- "活用": ["katsuyou", ""],
- "用例": ["yourei", ""],
- "類語": ["ruigo", ""],
- }
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.example = ""
+ self.alt_expression = ""
+ self.antonym = ""
+ self.attachments = ""
+ self.compounds = ""
+ self.related_words = ""
- def __init__(self, entry_id):
- super().__init__(entry_id)
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "例文": "example",
+ "別表記": "alt_expression",
+ "対義語": "antonym",
+ "活用": "attachments",
+ "用例": "compounds",
+ "類語": "related_words",
+ }
- def _set_headwords(self):
+ def _get_headwords(self):
headwords = {}
for reading in self.yomikata.split("・"):
if reading not in headwords:
headwords[reading] = []
for expression in self.expression.split("・"):
headwords[reading].append(expression)
- if self.betsuhyouki.strip() != "":
- for expression in self.betsuhyouki.split("・"):
+ if self.alt_expression.strip() != "":
+ for expression in self.alt_expression.split("・"):
headwords[reading].append(expression)
- self._headwords = headwords
+ return headwords
- def _set_variant_headwords(self):
- for expressions in self._headwords.values():
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py
index 11ef7e6..2308893 100644
--- a/bot/entries/smk8.py
+++ b/bot/entries/smk8.py
@@ -1,4 +1,3 @@
-import re
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
@@ -9,19 +8,17 @@ from bot.entries.smk8_preprocess import preprocess_page
class _BaseSmk8Entry(Entry):
- ID_TO_ENTRY = {}
- SUBENTRY_ID_TO_ENTRY_ID = {}
-
- def __init__(self, entry_id):
- super().__init__(entry_id)
- if entry_id not in self.ID_TO_ENTRY:
- self.ID_TO_ENTRY[entry_id] = self
- else:
- raise Exception(f"Duplicate entry ID: {entry_id}")
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
self.children = []
self.phrases = []
self.kanjis = []
+ def get_global_identifier(self):
+ parent_part = format(self.entry_id[0], '06')
+ child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
+ return f"@{self.target.value}-{parent_part}-{child_part}"
+
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
@@ -30,13 +27,6 @@ class _BaseSmk8Entry(Entry):
soup = BeautifulSoup(self._page, "xml")
return soup
- def get_headwords(self):
- if self._headwords is not None:
- return self._headwords
- self._set_headwords()
- self._set_variant_headwords()
- return self._headwords
-
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
@@ -50,8 +40,8 @@ class _BaseSmk8Entry(Entry):
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
- def _set_variant_headwords(self):
- for expressions in self._headwords.values():
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
@@ -87,7 +77,7 @@ class _BaseSmk8Entry(Entry):
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(subentry_id)
+ subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
@@ -106,6 +96,16 @@ class _BaseSmk8Entry(Entry):
else:
raise Exception(f"Invalid entry ID: {id_string}")
+ @staticmethod
+ def _delete_unused_nodes(soup):
+        """Remove extra markup elements that appear in the entry
+        headword line but are not part of the headword itself"""
+ unused_nodes = [
+ "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
@staticmethod
def _clean_expression(expression):
for x in ["〈", "〉", "{", "}", "…", " "]:
@@ -114,24 +114,24 @@ class _BaseSmk8Entry(Entry):
@staticmethod
def _fill_alts(soup):
- for e in soup.find_all(["親見出仮名", "親見出表記"]):
- e.string = e.attrs["alt"]
+ for el in soup.find_all(["親見出仮名", "親見出表記"]):
+ el.string = el.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]
class Smk8Entry(_BaseSmk8Entry):
- def __init__(self, page_id):
+ def __init__(self, target, page_id):
entry_id = (page_id, 0)
- super().__init__(entry_id)
+ super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
- Soup.delete_soup_nodes(soup, "表音表記")
+ self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
@@ -140,16 +140,14 @@ class Smk8Entry(_BaseSmk8Entry):
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
- self._headwords = {reading: expressions}
+ headwords = {reading: expressions}
+ return headwords
class Smk8ChildEntry(_BaseSmk8Entry):
- def __init__(self, entry_id):
- super().__init__(entry_id)
-
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
- Soup.delete_soup_nodes(soup, "表音表記")
+ self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
@@ -158,19 +156,20 @@ class Smk8ChildEntry(_BaseSmk8Entry):
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
- self._headwords = {reading: expressions}
+ headwords = {reading: expressions}
+ return headwords
class Smk8PhraseEntry(_BaseSmk8Entry):
- def __init__(self, entry_id):
- super().__init__(entry_id)
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
self.__phrase_readings = load_smk8_phrase_readings()
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
@@ -181,10 +180,10 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
- self._headwords = headwords
+ return headwords
def _find_expressions(self, soup):
- Soup.delete_soup_nodes(soup, "ルビG")
+ self._delete_unused_nodes(soup)
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
@@ -206,15 +205,14 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
class Smk8KanjiEntry(_BaseSmk8Entry):
- def __init__(self, entry_id):
- super().__init__(entry_id)
-
- def _set_headwords(self):
+ def _get_headwords(self):
soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
- self._headwords = {reading: expressions}
+ headwords = {reading: expressions}
+ return headwords
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8_preprocess.py
index 2e480a8..5c9b924 100644
--- a/bot/entries/smk8_preprocess.py
+++ b/bot/entries/smk8_preprocess.py
@@ -15,6 +15,7 @@ def preprocess_page(page):
page = __strip_page(page)
page = __replace_glyph_codes(page)
page = __format_hyougai_marks(page)
+ page = __remove_pronunciation_parentheses(page)
return page
@@ -64,6 +65,7 @@ def __format_hyougai_marks(page):
for x in ["\n", "\t", " "]:
text = text.replace(x, "")
text = re.sub(r"〈([^〈]+)〉", r"\1", text)
+
page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page)
for mark in re.findall(r"《.", text):
if mark[1] == "〓":
@@ -79,13 +81,29 @@ def __format_hyougai_marks(page):
page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外字>\2</表外字>",
page, count=1)
+
page = page.replace("␂", "〈")
page = page.replace("␃", "〉")
soup = BeautifulSoup(page, features="xml")
+
for el in soup.find_all("表外音訓"):
if el.text == "":
el.append(el.next_sibling)
+            mark_xml = "<表外音訓マーク>︽</表外音訓マーク>"
+ mark_soup = BeautifulSoup(mark_xml, "xml")
+ el.append(mark_soup.表外音訓マーク)
+
for el in soup.find_all("表外字"):
if el.text == "":
el.append(el.next_sibling)
+            mark_xml = "<表外字マーク>︿</表外字マーク>"
+ mark_soup = BeautifulSoup(mark_xml, "xml")
+ el.append(mark_soup.表外字マーク)
+
return soup.decode()
+
+
+def __remove_pronunciation_parentheses(page):
+    page = page.replace("<表音表記>（", "<表音表記>")
+    page = page.replace("）</表音表記>", "</表音表記>")
+ return page
diff --git a/bot/mdict/exporters/export.py b/bot/mdict/exporters/export.py
new file mode 100644
index 0000000..2d76f1d
--- /dev/null
+++ b/bot/mdict/exporters/export.py
@@ -0,0 +1,204 @@
+# pylint: disable=too-few-public-methods
+
+import subprocess
+import os
+import shutil
+from abc import ABC, abstractmethod
+from pathlib import Path
+from datetime import datetime
+from platformdirs import user_documents_dir, user_cache_dir
+
+from bot.targets import Targets
+from bot.mdict.terms.factory import new_terminator
+
+
+class Exporter(ABC):
+ def __init__(self, target):
+ self._target = target
+ self._terminator = new_terminator(target)
+ self._build_dir = None
+ self._build_media_dir = None
+ self._description_file = None
+ self._out_dir = None
+
+ def export(self, entries, media_dir, icon_file):
+ self._init_build_media_dir(media_dir)
+ self._init_description_file(entries)
+ terms = self._get_terms(entries)
+ print(f"Exporting {len(terms)} Mdict keys...")
+ self._write_mdx_file(terms)
+ self._write_mdd_file()
+ self._write_icon_file(icon_file)
+ self._rm_build_dir()
+
+ def _get_build_dir(self):
+ if self._build_dir is not None:
+ return self._build_dir
+ cache_dir = user_cache_dir("jitenbot")
+ build_directory = os.path.join(cache_dir, "mdict_build")
+ if Path(build_directory).is_dir():
+ shutil.rmtree(build_directory)
+ os.makedirs(build_directory)
+ self._build_dir = build_directory
+ return self._build_dir
+
+ def _init_build_media_dir(self, media_dir):
+ build_dir = self._get_build_dir()
+ build_media_dir = os.path.join(build_dir, self._target.value)
+ if media_dir is not None:
+ print("Copying media files to build directory...")
+ shutil.copytree(media_dir, build_media_dir)
+ else:
+ os.makedirs(build_media_dir)
+ css_file = self._get_css_file()
+ shutil.copy(css_file, build_media_dir)
+ self._terminator.set_media_dir(build_media_dir)
+ self._build_media_dir = build_media_dir
+
+ def _init_description_file(self, entries):
+ filename = f"{self._target.value}.mdx.description.html"
+ original_file = os.path.join(
+ "data", "mdict", "description", filename)
+ with open(original_file, "r", encoding="utf8") as f:
+ description = f.read()
+ description = description.replace(
+ "{{revision}}", self._get_revision(entries))
+ description = description.replace(
+ "{{attribution}}", self._get_attribution(entries))
+ build_dir = self._get_build_dir()
+ description_file = os.path.join(build_dir, filename)
+ with open(description_file, "w", encoding="utf8") as f:
+ f.write(description)
+ self._description_file = description_file
+
+ def _get_terms(self, entries):
+ terms = []
+ entries_len = len(entries)
+ for idx, entry in enumerate(entries):
+ update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+ print(update, end='\r', flush=True)
+ new_terms = self._terminator.make_terms(entry)
+ for term in new_terms:
+ terms.append(term)
+ print()
+ return terms
+
+ def _write_mdx_file(self, terms):
+ out_dir = self._get_out_dir()
+ out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
+ params = [
+ "mdict",
+ "-a", self._get_term_file(terms),
+ "--title", self._get_title_file(),
+ "--description", self._description_file,
+ out_file
+ ]
+ subprocess.run(params, check=True)
+
+ def _write_mdd_file(self):
+ out_dir = self._get_out_dir()
+ out_file = os.path.join(out_dir, f"{self._target.value}.mdd")
+ params = [
+ "mdict",
+ "-a", self._build_media_dir,
+ "--title", self._get_title_file(),
+ "--description", self._description_file,
+ out_file
+ ]
+ subprocess.run(params, check=True)
+
+ def _write_icon_file(self, icon_file):
+ premade_icon_file = f"data/mdict/icon/{self._target.value}.png"
+ out_dir = self._get_out_dir()
+ out_file = os.path.join(out_dir, f"{self._target.value}.png")
+ if icon_file is not None and Path(icon_file).is_file():
+ shutil.copy(icon_file, out_file)
+ elif Path(premade_icon_file).is_file():
+ shutil.copy(premade_icon_file, out_file)
+
+ def _get_out_dir(self):
+ if self._out_dir is not None:
+ return self._out_dir
+ out_dir = os.path.join(
+ user_documents_dir(), "jitenbot", "mdict", self._target.value)
+ if Path(out_dir).is_dir():
+ shutil.rmtree(out_dir)
+ os.makedirs(out_dir)
+ self._out_dir = out_dir
+ return out_dir
+
+ def _get_term_file(self, terms):
+ build_dir = self._get_build_dir()
+ term_file = os.path.join(build_dir, f"{self._target.value}.mdx.txt")
+ with open(term_file, "w", encoding="utf8") as f:
+ for term in terms:
+ f.write("\n".join(term))
+                f.write("\n</>\n")
+ return term_file
+
+ def _get_title_file(self):
+ return os.path.join(
+ "data", "mdict", "title",
+ f"{self._target.value}.mdx.title.html")
+
+ def _get_css_file(self):
+ return os.path.join(
+ "data", "mdict", "css",
+ f"{self._target.value}.css")
+
+ def _rm_build_dir(self):
+ build_dir = self._get_build_dir()
+ shutil.rmtree(build_dir)
+
+ @abstractmethod
+ def _get_revision(self, entries):
+ pass
+
+ @abstractmethod
+ def _get_attribution(self, entries):
+ pass
+
+
+class _JitenonExporter(Exporter):
+ def _get_revision(self, entries):
+ modified_date = None
+ for entry in entries:
+ if modified_date is None or entry.modified_date > modified_date:
+ modified_date = entry.modified_date
+ revision = modified_date.strftime("%Y年%m月%d日閲覧")
+ return revision
+
+    def _get_attribution(self, entries):
+        modified_date = None
+        attribution = ""
+        for entry in entries:
+            if modified_date is None or entry.modified_date > modified_date:
+                modified_date = entry.modified_date
+                attribution = entry.attribution
+        return attribution
+
+
+class JitenonKokugoExporter(_JitenonExporter):
+ pass
+
+
+class JitenonYojiExporter(_JitenonExporter):
+ pass
+
+
+class JitenonKotowazaExporter(_JitenonExporter):
+ pass
+
+
+class _MonokakidoExporter(Exporter):
+ def _get_revision(self, entries):
+ timestamp = datetime.now().strftime("%Y年%m月%d日作成")
+ return timestamp
+
+
+class Smk8Exporter(_MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2020"
+
+
+class Daijirin2Exporter(_MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2019"
diff --git a/bot/mdict/exporters/factory.py b/bot/mdict/exporters/factory.py
new file mode 100644
index 0000000..2c2015c
--- /dev/null
+++ b/bot/mdict/exporters/factory.py
@@ -0,0 +1,18 @@
+from bot.targets import Targets
+
+from bot.mdict.exporters.export import JitenonKokugoExporter
+from bot.mdict.exporters.export import JitenonYojiExporter
+from bot.mdict.exporters.export import JitenonKotowazaExporter
+from bot.mdict.exporters.export import Smk8Exporter
+from bot.mdict.exporters.export import Daijirin2Exporter
+
+
+def new_mdict_exporter(target):
+ exporter_map = {
+ Targets.JITENON_KOKUGO: JitenonKokugoExporter,
+ Targets.JITENON_YOJI: JitenonYojiExporter,
+ Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
+ Targets.SMK8: Smk8Exporter,
+ Targets.DAIJIRIN2: Daijirin2Exporter,
+ }
+ return exporter_map[target](target)
diff --git a/bot/mdict/glossary/daijirin2.py b/bot/mdict/glossary/daijirin2.py
new file mode 100644
index 0000000..1a8b0d5
--- /dev/null
+++ b/bot/mdict/glossary/daijirin2.py
@@ -0,0 +1,77 @@
+import re
+import os
+from functools import cache
+from pathlib import Path
+
+from bot.soup import delete_soup_nodes
+from bot.data import load_mdict_name_conversion
+from bot.name_conversion import convert_names
+
+
+def make_glossary(entry, media_dir):
+ soup = entry.get_page_soup()
+ __add_rubies(soup)
+ __hyperlink_parent_expression(soup, entry)
+ __delete_unused_nodes(soup, media_dir)
+ __convert_links(soup, entry)
+
+ name_conversion = load_mdict_name_conversion(entry.target)
+ convert_names(soup, name_conversion)
+
+ glossary = soup.span.decode()
+ return glossary
+
+
+def __add_rubies(soup):
+ for name in ["表外音訓", "表外字"]:
+ for ruby in soup.find_all(name):
+ ruby.name = "ruby"
+ rt = ruby.find("表外字マーク")
+ rt.name = "rt"
+            ruby.append(rt)  # needs to be positioned after the text
+
+
+def __hyperlink_parent_expression(soup, entry):
+ if soup.find("親表記") is None:
+ return
+ parent_entry = entry.get_parent()
+ gid = parent_entry.get_global_identifier()
+ for el in soup.find_all("親表記"):
+ el.name = "a"
+ el.attrs["href"] = f"entry://{gid}"
+
+
+def __delete_unused_nodes(soup, media_dir):
+ if not __graphics_directory_exists(media_dir):
+ delete_soup_nodes(soup, "カットG")
+ for el in soup.find_all("logo"):
+ next_sibling = el.next_sibling
+ if next_sibling is None:
+ continue
+ elif next_sibling.name in ["漢字見出G", "漢字音G"]:
+ el.decompose()
+ for el in soup.find_all("漢字音G"):
+ for child in el.find_all(string="・"):
+ child.replace_with("")
+
+
+@cache
+def __graphics_directory_exists(media_dir):
+ path = os.path.join(media_dir, "graphics")
+ return Path(path).is_dir()
+
+
+def __convert_links(soup, entry):
+ for el in soup.find_all("a"):
+ href = el.attrs["href"]
+ if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
+ ref_entry_id = entry.id_string_to_entry_id(href)
+ ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
+ gid = ref_entry.get_global_identifier()
+ el.attrs["href"] = f"entry://{gid}"
+ elif re.match(r"^entry:", href):
+ pass
+ elif re.match(r"^https?:[\w\W]*", href):
+ pass
+ else:
+ raise Exception(f"Invalid href format: {href}")
diff --git a/bot/mdict/glossary/jitenon.py b/bot/mdict/glossary/jitenon.py
new file mode 100644
index 0000000..737ea59
--- /dev/null
+++ b/bot/mdict/glossary/jitenon.py
@@ -0,0 +1,141 @@
+# pylint: disable=too-few-public-methods
+
+import re
+
+
+class JitenonGlossary():
+ def __init__(self):
+ self._id_pattern = None
+ self._expression_header = None
+
+ def _replace_punctuation(self, soup):
+ punctuation = {
+ "/": "/",
+            "/": "／",
+ }
+ for el in soup.find_all(string=True):
+ text = el.text
+ for old, new in punctuation.items():
+ text = text.replace(old, new)
+ el.replace_with(text)
+
+ def _add_internal_links(self, soup, entry):
+ for el in soup.find_all("a"):
+ href = el.attrs["href"]
+ m = re.search(self._id_pattern, href)
+ if m is not None:
+ ref_entry_id = int(m.group(1))
+ ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
+ gid = ref_entry.get_global_identifier()
+ el.attrs["href"] = f"entry://{gid}"
+ elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
+ pass
+ else:
+ raise Exception(f"Invalid href format: {href}")
+
+ def _decompose_table_rows(self, soup, entry):
+ for tr in soup.find_all("tr"):
+ if tr.find("th") is None:
+ continue
+ elif tr.th.text == self._expression_header:
+ tr.decompose()
+ elif tr.th.text == "読み方":
+ if self._do_display_yomikata_in_headword(entry):
+ tr.decompose()
+ elif tr.th.text == "意味":
+ definition = tr.td
+ definition.name = "div"
+ definition.attrs["class"] = "意味"
+ soup.body.insert(0, definition)
+ tr.decompose()
+ if soup.find("tr") is None:
+ soup.table.decompose()
+
+ def _insert_headword_line(self, soup, entry):
+ headword_line = soup.new_tag("div")
+ headword_line.attrs["class"] = "見出し"
+ if self._do_display_yomikata_in_headword(entry):
+ reading = soup.new_tag("span")
+ reading.attrs["class"] = "読み方"
+ reading.string = entry.yomikata
+ headword_line.append(reading)
+ expression = soup.new_tag("span")
+ expression.attrs["class"] = self._expression_header
+ expression.string = f"【{entry.expression}】"
+ headword_line.append(expression)
+ soup.body.insert(0, headword_line)
+
+ def _do_display_yomikata_in_headword(self, entry):
+ if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
+ return False
+ elif len(entry.yomikata) > 10:
+ return False
+ else:
+ return True
+
+
+class JitenonKokugoGlossary(JitenonGlossary):
+ def __init__(self):
+ super().__init__()
+ self._expression_header = "言葉"
+ self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"
+
+ def make_glossary(self, entry, media_dir):
+ soup = entry.get_page_soup()
+ self._remove_antonym_list_item(soup)
+ self._replace_number_icons(soup, media_dir)
+ self._replace_punctuation(soup)
+ self._add_internal_links(soup, entry)
+ self._decompose_table_rows(soup, entry)
+ self._insert_headword_line(soup, entry)
+ glossary = soup.body.prettify()
+ return glossary
+
+ def _remove_antonym_list_item(self, soup):
+ for el in soup.find_all("li"):
+ if el.text == "対義語辞典":
+ el.decompose()
+
+ def _replace_number_icons(self, soup, media_dir):
+ for el in soup.find_all("img"):
+ alt = el.attrs["alt"]
+ text = re.search(r"[0-9]+", alt).group(0)
+ el.name = "span"
+ el.string = text
+ del el.attrs["src"]
+ del el.attrs["alt"]
+
+ def _do_display_yomikata_in_headword(self, entry):
+ return len(entry.yomikata) <= 10
+
+
+class JitenonYojiGlossary(JitenonGlossary):
+ def __init__(self):
+ super().__init__()
+ self._expression_header = "四字熟語"
+ self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"
+
+ def make_glossary(self, entry, media_dir):
+ soup = entry.get_page_soup()
+ self._replace_punctuation(soup)
+ self._add_internal_links(soup, entry)
+ self._decompose_table_rows(soup, entry)
+ self._insert_headword_line(soup, entry)
+ glossary = soup.body.prettify()
+ return glossary
+
+
+class JitenonKotowazaGlossary(JitenonGlossary):
+ def __init__(self):
+ super().__init__()
+ self._expression_header = "言葉"
+ self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"
+
+ def make_glossary(self, entry, media_dir):
+ soup = entry.get_page_soup()
+ self._replace_punctuation(soup)
+ self._add_internal_links(soup, entry)
+ self._decompose_table_rows(soup, entry)
+ self._insert_headword_line(soup, entry)
+ glossary = soup.body.prettify()
+ return glossary
diff --git a/bot/mdict/glossary/smk8.py b/bot/mdict/glossary/smk8.py
new file mode 100644
index 0000000..613fc1b
--- /dev/null
+++ b/bot/mdict/glossary/smk8.py
@@ -0,0 +1,67 @@
+import re
+
+from bot.soup import delete_soup_nodes
+from bot.data import load_mdict_name_conversion
+from bot.name_conversion import convert_names
+
+
+def make_glossary(entry, media_dir):
+ soup = entry.get_page_soup()
+ __fill_alts(soup, entry)
+ __delete_unused_nodes(soup)
+ __convert_links(soup, entry)
+ __convert_priority_markers(soup)
+
+ name_conversion = load_mdict_name_conversion(entry.target)
+ convert_names(soup, name_conversion)
+
+ glossary = soup.span.decode()
+ return glossary
+
+
+def __fill_alts(soup, entry):
+ names = ["親見出仮名", "親見出表記"]
+ if soup.find(names) is None:
+ return
+ parent_entry = entry.get_parent()
+ gid = parent_entry.get_global_identifier()
+ for el in soup.find_all(names):
+ el.name = "a"
+ alt = el.attrs["alt"]
+ el.string = alt
+ el.attrs["href"] = f"entry://{gid}"
+ del el.attrs["alt"]
+
+
+def __delete_unused_nodes(soup):
+ for name in ["連濁"]:
+ delete_soup_nodes(soup, name)
+
+
+def __convert_links(soup, entry):
+ for el in soup.find_all("a"):
+ href = el.attrs["href"]
+ if href.startswith("$"):
+ el.unwrap()
+ elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
+ ref_entry_id = entry.id_string_to_entry_id(href)
+ ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
+ gid = ref_entry.get_global_identifier()
+ el.attrs["href"] = f"entry://{gid}"
+ elif re.match(r"^[0-9]+[ab]?\.aac$", href):
+ el.attrs["href"] = f"sound://audio/{href}"
+ elif re.match(r"^entry:", href):
+ pass
+ elif re.match(r"^https?:[\w\W]*", href):
+ pass
+ else:
+ raise Exception(f"Invalid href format: {href}")
+
+
+def __convert_priority_markers(soup):
+ for el in soup.find_all("img", attrs={"alt": "*"}):
+ el.name = "span"
+ el.string = "*"
+ for el in soup.find_all("img", attrs={"alt": "⁑"}):
+ el.name = "span"
+ el.string = "**"
diff --git a/bot/mdict/terms/daijirin2.py b/bot/mdict/terms/daijirin2.py
new file mode 100644
index 0000000..3b5ce68
--- /dev/null
+++ b/bot/mdict/terms/daijirin2.py
@@ -0,0 +1,23 @@
+from bot.mdict.terms.terminator import Terminator
+from bot.mdict.glossary.daijirin2 import make_glossary
+
+
+class Daijirin2Terminator(Terminator):
+ def _glossary(self, entry):
+ if entry.entry_id in self._glossary_cache:
+ return self._glossary_cache[entry.entry_id]
+ glossary = make_glossary(entry, self._media_dir)
+ self._glossary_cache[entry.entry_id] = glossary
+ return glossary
+
+ def _link_glossary_parameters(self, entry):
+ return [
+ [entry.children, "子項目"],
+ [entry.phrases, "句項目"],
+ ]
+
+ def _subentry_lists(self, entry):
+ return [
+ entry.children,
+ entry.phrases,
+ ]
diff --git a/bot/mdict/terms/factory.py b/bot/mdict/terms/factory.py
new file mode 100644
index 0000000..78a05cd
--- /dev/null
+++ b/bot/mdict/terms/factory.py
@@ -0,0 +1,18 @@
+from bot.targets import Targets
+
+from bot.mdict.terms.jitenon import JitenonKokugoTerminator
+from bot.mdict.terms.jitenon import JitenonYojiTerminator
+from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
+from bot.mdict.terms.smk8 import Smk8Terminator
+from bot.mdict.terms.daijirin2 import Daijirin2Terminator
+
+
+def new_terminator(target):
+ terminator_map = {
+ Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
+ Targets.JITENON_YOJI: JitenonYojiTerminator,
+ Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
+ Targets.SMK8: Smk8Terminator,
+ Targets.DAIJIRIN2: Daijirin2Terminator,
+ }
+ return terminator_map[target](target)
diff --git a/bot/mdict/terms/jitenon.py b/bot/mdict/terms/jitenon.py
new file mode 100644
index 0000000..3f9cfc1
--- /dev/null
+++ b/bot/mdict/terms/jitenon.py
@@ -0,0 +1,42 @@
+from bot.mdict.terms.terminator import Terminator
+
+from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
+from bot.mdict.glossary.jitenon import JitenonYojiGlossary
+from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
+
+
+class JitenonTerminator(Terminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = None
+
+ def _glossary(self, entry):
+ if entry.entry_id in self._glossary_cache:
+ return self._glossary_cache[entry.entry_id]
+ glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
+ self._glossary_cache[entry.entry_id] = glossary
+ return glossary
+
+ def _link_glossary_parameters(self, entry):
+ return []
+
+ def _subentry_lists(self, entry):
+ return []
+
+
+class JitenonKokugoTerminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKokugoGlossary()
+
+
+class JitenonYojiTerminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonYojiGlossary()
+
+
+class JitenonKotowazaTerminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKotowazaGlossary()
diff --git a/bot/mdict/terms/smk8.py b/bot/mdict/terms/smk8.py
new file mode 100644
index 0000000..22275d5
--- /dev/null
+++ b/bot/mdict/terms/smk8.py
@@ -0,0 +1,24 @@
+from bot.mdict.terms.terminator import Terminator
+from bot.mdict.glossary.smk8 import make_glossary
+
+
+class Smk8Terminator(Terminator):
+ def _glossary(self, entry):
+ if entry.entry_id in self._glossary_cache:
+ return self._glossary_cache[entry.entry_id]
+ glossary = make_glossary(entry, self._media_dir)
+ self._glossary_cache[entry.entry_id] = glossary
+ return glossary
+
+ def _link_glossary_parameters(self, entry):
+ return [
+ [entry.children, "子項目"],
+ [entry.phrases, "句項目"],
+ ]
+
+ def _subentry_lists(self, entry):
+ return [
+ entry.children,
+ entry.phrases,
+ entry.kanjis,
+ ]
diff --git a/bot/mdict/terms/terminator.py b/bot/mdict/terms/terminator.py
new file mode 100644
index 0000000..e69d9fb
--- /dev/null
+++ b/bot/mdict/terms/terminator.py
@@ -0,0 +1,73 @@
+from abc import abstractmethod, ABC
+
+
+class Terminator(ABC):
+ def __init__(self, target):
+ self._target = target
+ self._glossary_cache = {}
+ self._media_dir = None
+
+ def set_media_dir(self, media_dir):
+ self._media_dir = media_dir
+
+ def make_terms(self, entry):
+ gid = entry.get_global_identifier()
+ glossary = self.__full_glossary(entry)
+ terms = [[gid, glossary]]
+ keys = set()
+ headwords = entry.get_headwords()
+ for reading, expressions in headwords.items():
+ if len(expressions) == 0:
+ keys.add(reading)
+ for expression in expressions:
+ if expression.strip() == "":
+ keys.add(reading)
+ continue
+ keys.add(expression)
+ if reading.strip() == "":
+ continue
+ if reading != expression:
+ keys.add(f"{reading}【{expression}】")
+ else:
+ keys.add(reading)
+ link = f"@@@LINK={gid}"
+ for key in keys:
+ if key.strip() != "":
+ terms.append([key, link])
+ for subentries in self._subentry_lists(entry):
+ for subentry in subentries:
+ for term in self.make_terms(subentry):
+ terms.append(term)
+ return terms
+
+ def __full_glossary(self, entry):
+ glossary = []
+        style_link = f"<link rel='stylesheet' href='{self._target.value}.css' type='text/css'>"
+ glossary.append(style_link)
+ glossary.append(self._glossary(entry))
+
+ for x in self._link_glossary_parameters(entry):
+ (subentries, list_title) = x
+ if len(subentries) == 0:
+ continue
+ items = []
+ for subentry in subentries:
+ exp = subentry.get_first_expression()
+ gid = subentry.get_global_identifier()
+                item = f"<li><a href='entry://{gid}'>{exp}</a></li>"
+                items.append(item)
+            link_glossary = f"<div data-child-links='{list_title}'><ul>{''.join(items)}</ul></div>"
+ glossary.append(link_glossary)
+ return "\n".join(glossary)
+
+ @abstractmethod
+ def _glossary(self, entry):
+ pass
+
+ @abstractmethod
+ def _link_glossary_parameters(self, entry):
+ pass
+
+ @abstractmethod
+ def _subentry_lists(self, entry):
+ pass
diff --git a/bot/yomichan/glossary/name_conversion.py b/bot/name_conversion.py
similarity index 88%
rename from bot/yomichan/glossary/name_conversion.py
rename to bot/name_conversion.py
index 776d65e..2c9b808 100644
--- a/bot/yomichan/glossary/name_conversion.py
+++ b/bot/name_conversion.py
@@ -30,7 +30,7 @@ def __apply_name_conversion_procedures(soup, procedures):
"has_previous_sibling": __has_previous_sibling,
"replace": __replace,
"wrap": __wrap,
- "add_ruby_text": __add_ruby_text,
+ "insert_span": __insert_span,
}
for procedure in procedures:
function = functions[procedure["procedure_name"]]
@@ -92,10 +92,9 @@ def __wrap(soup, l_wrap, r_wrap):
soup.string = f"{l_wrap}{soup.text}{r_wrap}"
-def __add_ruby_text(soup, mark, style):
- if style.strip() != "":
-        markup = f"<rt style='{style}'>{mark}</rt>"
- else:
-        markup = f"<rt>{mark}</rt>"
- rt_soup = BeautifulSoup(markup, "xml")
- soup.append(rt_soup.rt)
+def __insert_span(soup, attr_name, attr_val):
+    span_markup = f"<span {attr_name}='{attr_val}'></span>"
+ span_soup = BeautifulSoup(span_markup, "xml")
+ for content in reversed(soup.contents):
+ span_soup.span.insert(0, content.extract())
+ soup.append(span_soup.span)
diff --git a/bot/yomichan/exporters/export.py b/bot/yomichan/exporters/export.py
index 4658030..03e1b95 100644
--- a/bot/yomichan/exporters/export.py
+++ b/bot/yomichan/exporters/export.py
@@ -1,15 +1,18 @@
+# pylint: disable=too-few-public-methods
+
import json
import os
import shutil
from pathlib import Path
from datetime import datetime
+from abc import ABC, abstractmethod
from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator
-class Exporter:
+class Exporter(ABC):
def __init__(self, target):
self._target = target
self._terminator = new_terminator(target)
@@ -26,6 +29,14 @@ class Exporter:
terms = self.__get_terms(entries)
self.__make_dictionary(terms, index, tags)
+ @abstractmethod
+ def _get_revision(self, entries):
+ pass
+
+ @abstractmethod
+ def _get_attribution(self, entries):
+ pass
+
def _get_build_dir(self):
if self._build_dir is not None:
return self._build_dir
@@ -41,7 +52,7 @@ class Exporter:
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None:
- print("Copying image files to build directory...")
+ print("Copying media files to build directory...")
shutil.copytree(image_dir, build_img_dir)
else:
os.makedirs(build_img_dir)
@@ -93,7 +104,7 @@ class Exporter:
def __write_archive(self, filename):
archive_format = "zip"
- out_dir = os.path.join(user_documents_dir(), "jitenbot")
+ out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir():
os.makedirs(out_dir)
out_file = f"{filename}.{archive_format}"
@@ -110,10 +121,7 @@ class Exporter:
shutil.rmtree(build_dir)
-class JitenonExporter(Exporter):
- def __init__(self, target):
- super().__init__(target)
-
+class _JitenonExporter(Exporter):
def _get_revision(self, entries):
modified_date = None
for entry in entries:
@@ -130,25 +138,19 @@ class JitenonExporter(Exporter):
return attribution
-class JitenonKokugoExporter(JitenonExporter):
- def __init__(self, target):
- super().__init__(target)
+class JitenonKokugoExporter(_JitenonExporter):
+ pass
-class JitenonYojiExporter(JitenonExporter):
- def __init__(self, target):
- super().__init__(target)
+class JitenonYojiExporter(_JitenonExporter):
+ pass
-class JitenonKotowazaExporter(JitenonExporter):
- def __init__(self, target):
- super().__init__(target)
+class JitenonKotowazaExporter(_JitenonExporter):
+ pass
class Smk8Exporter(Exporter):
- def __init__(self, target):
- super().__init__(target)
-
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"
@@ -158,9 +160,6 @@ class Smk8Exporter(Exporter):
class Daijirin2Exporter(Exporter):
- def __init__(self, target):
- super().__init__(target)
-
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"
diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py
index 5ab9a6a..06568e3 100644
--- a/bot/yomichan/exporters/factory.py
+++ b/bot/yomichan/exporters/factory.py
@@ -7,7 +7,7 @@ from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
-def new_exporter(target):
+def new_yomi_exporter(target):
exporter_map = {
Targets.JITENON_KOKUGO: JitenonKokugoExporter,
Targets.JITENON_YOJI: JitenonYojiExporter,
diff --git a/bot/yomichan/glossary/daijirin2.py b/bot/yomichan/glossary/daijirin2.py
index f2b6f2c..c42841c 100644
--- a/bot/yomichan/glossary/daijirin2.py
+++ b/bot/yomichan/glossary/daijirin2.py
@@ -6,9 +6,9 @@ from pathlib import Path
import bot.icons as Icons
from bot.soup import delete_soup_nodes
-from bot.data import load_daijirin2_yomichan_name_conversion
+from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
-from bot.yomichan.glossary.name_conversion import convert_names
+from bot.name_conversion import convert_names
def make_glossary(entry, image_dir):
@@ -26,7 +26,7 @@ def make_glossary(entry, image_dir):
__convert_daigoginum(soup, image_dir)
__convert_jundaigoginum(soup, image_dir)
- name_conversion = load_daijirin2_yomichan_name_conversion()
+ name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)
diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py
index 6e3a192..ca76f19 100644
--- a/bot/yomichan/glossary/jitenon.py
+++ b/bot/yomichan/glossary/jitenon.py
@@ -58,9 +58,9 @@ class JitenonGlossary():
if self._do_display_yomikata_in_headword(entry):
tr.decompose()
elif tr.th.text == "意味":
- imi = tr.td
- imi.name = "div"
- soup.body.insert(0, imi)
+ definition = tr.td
+ definition.name = "div"
+ soup.body.insert(0, definition)
tr.decompose()
if soup.find("tr") is None:
soup.table.decompose()
diff --git a/bot/yomichan/glossary/smk8.py b/bot/yomichan/glossary/smk8.py
index 870c3fc..8754a02 100644
--- a/bot/yomichan/glossary/smk8.py
+++ b/bot/yomichan/glossary/smk8.py
@@ -4,9 +4,9 @@ from bs4 import BeautifulSoup
import bot.icons as Icons
from bot.soup import delete_soup_nodes
-from bot.data import load_smk8_yomichan_name_conversion
+from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
-from bot.yomichan.glossary.name_conversion import convert_names
+from bot.name_conversion import convert_names
def make_glossary(entry, image_dir):
@@ -20,7 +20,7 @@ def make_glossary(entry, image_dir):
__convert_gaiji(soup, image_dir)
__convert_rectangles(soup, image_dir)
- name_conversion = load_smk8_yomichan_name_conversion()
+ name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)
diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py
index f74abaa..66bbed7 100644
--- a/bot/yomichan/terms/jitenon.py
+++ b/bot/yomichan/terms/jitenon.py
@@ -9,6 +9,7 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
def __init__(self, target):
super().__init__(target)
+ self._glossary_maker = None
def _definition_tags(self, entry):
return None
@@ -51,7 +52,7 @@ class JitenonYojiTerminator(JitenonTerminator):
return ""
def _term_tags(self, entry):
- tags = entry.kankenkyuu.split("/")
+ tags = entry.kanken_level.split("/")
return " ".join(tags)
diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py
index d41a50a..dd0c02d 100644
--- a/bot/yomichan/terms/terminator.py
+++ b/bot/yomichan/terms/terminator.py
@@ -1,7 +1,8 @@
+from abc import abstractmethod, ABC
from bot.data import load_yomichan_inflection_categories
-class Terminator:
+class Terminator(ABC):
def __init__(self, target):
self._target = target
self._glossary_cache = {}
@@ -62,3 +63,31 @@ class Terminator:
}
glossary.append(gloss)
return glossary
+
+ @abstractmethod
+ def _definition_tags(self, entry):
+ pass
+
+ @abstractmethod
+ def _inflection_rules(self, entry, expression):
+ pass
+
+ @abstractmethod
+ def _glossary(self, entry):
+ pass
+
+ @abstractmethod
+ def _sequence(self, entry):
+ pass
+
+ @abstractmethod
+ def _term_tags(self, entry):
+ pass
+
+ @abstractmethod
+ def _link_glossary_parameters(self, entry):
+ pass
+
+ @abstractmethod
+ def _subentry_lists(self, entry):
+ pass
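
With `Terminator` now an abstract base class, a dictionary-specific subclass must implement every hook before it can be instantiated. A minimal do-nothing subclass, shown only to illustrate the required interface; the return values are placeholders, not what the real subclasses produce.

```
class NullTerminator(Terminator):
    # Illustrative stub that satisfies the abstract interface above.

    def _definition_tags(self, entry):
        return None

    def _inflection_rules(self, entry, expression):
        return ""

    def _glossary(self, entry):
        return []

    def _sequence(self, entry):
        return 0

    def _term_tags(self, entry):
        return ""

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []
```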
diff --git a/data/daijirin2/mdict_name_conversion.json b/data/daijirin2/mdict_name_conversion.json
new file mode 100644
index 0000000..d783d28
--- /dev/null
+++ b/data/daijirin2/mdict_name_conversion.json
@@ -0,0 +1,12 @@
+{
+ "a": {},
+ "br": {},
+ "img": {},
+ "div": {},
+ "span": {},
+ "ruby": {},
+ "rt": {},
+ "p": {},
+ "漢字音G": {"name": "ul"},
+ "漢字音": {"name": "li"}
+}
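
Keys in this table are element names from the dictionary's source markup: entries with an empty object pass through unchanged, while entries with a `"name"` value are renamed to a standard HTML element (here `漢字音G` becomes a `<ul>` and `漢字音` an `<li>`). A hedged sketch of how such a table could be applied with BeautifulSoup; the project's `bot.name_conversion.convert_names` may handle this differently, and the Yomichan tables additionally carry `style` and `procedures` keys.

```
def apply_name_conversion(soup, name_conversion):
    # soup is a BeautifulSoup tree for one entry; name_conversion is the
    # parsed JSON table above.
    for tag in soup.find_all(True):
        rules = name_conversion.get(tag.name)
        if rules is None:
            continue
        new_name = rules.get("name")
        if new_name is not None:
            tag["data-name"] = tag.name  # keep the original name for CSS hooks
            tag.name = new_name
```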
diff --git a/data/mdict/css/daijirin2.css b/data/mdict/css/daijirin2.css
new file mode 100644
index 0000000..703cb35
--- /dev/null
+++ b/data/mdict/css/daijirin2.css
@@ -0,0 +1,414 @@
+
+body {
+ margin: 1em 44px 1em 1em;
+ line-height: 1.5em;
+ font-family: serif;
+ font-size: 1.2em;
+ color: black;
+}
+
+body.ABC {
+ margin: 0.5em 0.5em 2em 0.5em;
+}
+
+a {
+ text-decoration: none;
+}
+
+img.gaiji {
+ height: 1em;
+}
+
+img.cut {
+ max-height: 100px;
+ max-width: 600px;
+}
+
+p {
+ margin: 0.5em 0
+}
+
+span[data-name="i"] {
+ font-style: italic;
+}
+
+span[data-name="h1"] {
+ font-family: sans-serif;
+ font-size: 1em;
+ font-weight: bold;
+}
+
+span[data-name="image"] {
+ display: block;
+}
+
+span[data-name="ref"] a {
+ text-decoration: none;
+}
+
+span[data-name="sl"] {
+ text-decoration: accent;
+}
+
+span[data-name="sm"] {
+ font-size: 0.7em;
+}
+
+span[data-name="small"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="sub"] {
+ font-size: 0.7em;
+ vertical-align: -0.35em;
+}
+
+span[data-name="ty2"] span[data-name="sub"] {
+ vertical-align: 0em;
+}
+
+span[data-name="ty2"] span[data-name="sup"] {
+ vertical-align: 0.5em;
+}
+
+span[data-name="文語形"] {
+ display: block;
+}
+
+span[data-name="用例"] {
+ display: block;
+}
+
+span[data-name="補説G"] {
+ display: block;
+}
+
+span[data-name="語義Gnum"] + span[data-name="補説G"] {
+ display: inline;
+}
+
+span[data-name="アクセントG"] + span[data-name="補説G"] {
+ display: inline;
+}
+
+span[data-name="補説G"] + span[data-name="語釈"] {
+ display: block;
+}
+
+span[data-name="アクセントG"] {
+ font-size: 0.7em;
+ vertical-align: super;
+ margin-left: 0.25em;
+ margin-right: 0.25em;
+}
+
+span[data-name="カット"] {
+ display: block;
+}
+
+span[data-name="カットG"] {
+ display: block;
+ margin-top: 0.5em;
+ margin-bottom: 0.5em;
+ margin-left: 1em;
+}
+
+span[data-name="キャプション"] {
+ display: block;
+}
+
+span[data-name="ルビG"] {
+ font-family: sans-serif;
+ font-size: 0.7em;
+ font-weight: normal;
+ vertical-align: 0.35em;
+}
+
+.warichu span[data-name="ルビG"] {
+ font-family: serif;
+ font-size: 0.5em;
+ font-weight: normal;
+ vertical-align: 0em;
+}
+
+span[data-name="中語義"] {
+ display: block;
+}
+
+span[data-name="付記"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="副義"] {
+ display: block;
+ margin-left: 1em;
+}
+
+span[data-name="単位名"] {
+ font-size: 0.5em;
+}
+
+span[data-name="原籍"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="句仮名"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="句項目"] {
+ margin-top: 0.5em;
+ margin-left: 1em;
+ display: block;
+}
+
+span[data-name="和字"] {
+ font-family: sans-serif;
+}
+
+span[data-name="品詞行"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="品詞用法"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="大語義"] {
+ display: block;
+}
+
+span[data-name="大語義num"] {
+ margin: 0.025em;
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ color: white;
+ background-color: black;
+}
+
+span[data-name="子項目"] {
+ display: block;
+ margin-top: 0.5em;
+ margin-left: 1em;
+}
+
+span[data-name="慣用G"] {
+ display: block;
+ margin-top: 0.5em;
+}
+
+span[data-name="欧字"] {
+ font-family: sans-serif;
+}
+
+span[data-name="歴史仮名"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="派生G"] {
+ display: block;
+ margin-top: 0.5em;
+}
+
+span[data-name="準大語義"] {
+ display: block;
+}
+
+span[data-name="準大語義num"] {
+ margin: 0.025em;
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ border: solid 1px black;
+}
+
+span[data-name="漢字音logo"] {
+ margin: 0.025em;
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ border: solid 0.5px black;
+ border-radius: 1em;
+}
+
+span[data-name="漢字音G"] {
+ font-size: 0.7em;
+ font-weight: normal;
+ vertical-align: 0.35em;
+}
+
+span[data-name="生没年"] {
+ margin-left: 0.25em;
+ margin-right: 0.25em;
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="生没年"]:first-child {
+ margin-left: 0;
+}
+
+span[data-name="用法"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="異字同訓"] {
+ display: block;
+ margin-top: 0.5em;
+}
+
+span[data-name="異字同訓仮名"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+span[data-name="異字同訓漢字"] {
+ font-family: serif;
+ font-weight: normal;
+}
+
+span[data-name="異字同訓表記"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+span[data-name="異字同訓解説"] {
+ display: block;
+}
+
+span[data-name="異字同訓語義G"] {
+ display: block;
+}
+
+span[data-name="細義"] {
+ display: block;
+}
+
+span[data-name="表外字マーク"] {
+ font-size: 0.5em;
+ vertical-align: 0.5em;
+}
+
+span[data-name="見出仮名"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+span[data-name="見出相当部"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+span[data-name="見出部"] {
+ display: block;
+}
+
+span[data-name="解説部"] {
+ display: block;
+ margin-left: 1em;
+}
+
+span[data-name="語義G"] {
+ display: block;
+}
+
+span[data-name="語義区切"] {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+span[data-name="返り点"] {
+ font-size: 0.5em;
+ font-weight: normal;
+ vertical-align: 1em;
+}
+
+span[data-name="返り点"].熟語記号 {
+ vertical-align: 0em;
+}
+
+span[data-name="項目"] {
+ display: block;
+}
+
+span[data-name="logo"] {
+ margin: 0.025em 0.25em;
+ padding: 0.1em;
+ font-size: 0.8em;
+ border: solid 1px black;
+ border-radius: 0.2em;
+}
+
+.gothic {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+.warichu {
+ font-size: 1em;
+}
+
+.refnum {
+ font-size: 0.7em;
+ vertical-align: 0.35em;
+}
+
+#index {
+ display: none;
+}
+
+span[data-name="歴史仮名"]:before,
+span[data-name="ルビG"]:before,
+span[data-name="品詞行"]:before,
+span[data-name="原籍"]:before,
+span[data-name="品詞用法"]:before,
+span[data-name="付記"]:before {
+ content: "(";
+}
+
+span[data-name="歴史仮名"]:after,
+span[data-name="ルビG"]:after,
+span[data-name="品詞行"]:after,
+span[data-name="原籍"]:after,
+span[data-name="品詞用法"]:after,
+span[data-name="付記"]:after {
+ content: ")";
+}
+
+div[data-child-links] {
+ padding-top: 1em;
+}
+
+div[data-child-links] ul {
+ margin: 0;
+ padding-left: 2em;
+}
+
+div[data-child-links] span {
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ color: white;
+ border-width: 0.05em;
+ border-style: none;
+ border-color: black;
+ word-break: keep-all;
+ -webkit-border-radius: 0.2em;
+}
+
+div[data-child-links="子項目"] span {
+ background-color: rgb(153, 42, 103);
+}
+
+div[data-child-links="句項目"] span {
+ background-color: rgb(176, 127, 57);
+}
diff --git a/data/mdict/css/jitenon-kokugo.css b/data/mdict/css/jitenon-kokugo.css
new file mode 100644
index 0000000..687ae14
--- /dev/null
+++ b/data/mdict/css/jitenon-kokugo.css
@@ -0,0 +1,56 @@
+
+body {
+ font-family: serif;
+ margin: 1em 44px 1em 1.5em;
+ line-height: 1.5em;
+ font-size: 1.2em;
+ color: black;
+}
+
+table, th, td {
+ border: 1px solid;
+ border-collapse: collapse;
+ padding: 0.5em;
+}
+
+th {
+ font-family: sans-serif;
+ color: black;
+ background-color: lightgray;
+ font-weight: normal;
+ white-space: nowrap;
+}
+
+a {
+ text-decoration: none;
+}
+
+td ul {
+ margin: -0.1em 0em -0.1em -1em;
+}
+
+.見出し {
+}
+
+.読み方 {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+.意味 {
+ margin-left: 1.0em;
+ margin-bottom: 0.5em;
+}
+
+.num_icon {
+ font-family: sans-serif;
+ padding-left: 0.25em;
+ margin-right: 0.5em;
+ font-size: 0.8em;
+ word-break: keep-all;
+ color: white;
+ background-color: gray;
+ border-style: none;
+ -webkit-border-radius: 0.1em;
+}
+
diff --git a/data/mdict/css/jitenon-kotowaza.css b/data/mdict/css/jitenon-kotowaza.css
new file mode 100644
index 0000000..2dfb1be
--- /dev/null
+++ b/data/mdict/css/jitenon-kotowaza.css
@@ -0,0 +1,40 @@
+
+body {
+ font-family: serif;
+ margin: 1em 44px 1em 1.5em;
+ line-height: 1.5em;
+ font-size: 1.2em;
+ color: black;
+}
+
+table, th, td {
+ border: 1px solid;
+ border-collapse: collapse;
+ padding: 0.5em;
+}
+
+th {
+ font-family: sans-serif;
+ color: black;
+ background-color: lightgray;
+ font-weight: normal;
+ white-space: nowrap;
+}
+
+a {
+ text-decoration: none;
+}
+
+.見出し {
+}
+
+.読み方 {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+.意味 {
+ margin-left: 1.0em;
+ margin-bottom: 0.5em;
+}
+
diff --git a/data/mdict/css/jitenon-yoji.css b/data/mdict/css/jitenon-yoji.css
new file mode 100644
index 0000000..2dfb1be
--- /dev/null
+++ b/data/mdict/css/jitenon-yoji.css
@@ -0,0 +1,40 @@
+
+body {
+ font-family: serif;
+ margin: 1em 44px 1em 1.5em;
+ line-height: 1.5em;
+ font-size: 1.2em;
+ color: black;
+}
+
+table, th, td {
+ border: 1px solid;
+ border-collapse: collapse;
+ padding: 0.5em;
+}
+
+th {
+ font-family: sans-serif;
+ color: black;
+ background-color: lightgray;
+ font-weight: normal;
+ white-space: nowrap;
+}
+
+a {
+ text-decoration: none;
+}
+
+.見出し {
+}
+
+.読み方 {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+.意味 {
+ margin-left: 1.0em;
+ margin-bottom: 0.5em;
+}
+
diff --git a/data/mdict/css/smk8.css b/data/mdict/css/smk8.css
new file mode 100644
index 0000000..e88da1c
--- /dev/null
+++ b/data/mdict/css/smk8.css
@@ -0,0 +1,449 @@
+
+body {
+ margin: 1em 44px 1em 1.5em;
+ line-height: 1.5em;
+ font-family: serif;
+ font-size: 1.2em;
+ color: black;
+}
+
+span[data-name="項目"] {
+ display: block;
+}
+
+span[data-name="見出部"] {
+ display: block;
+}
+
+span[data-name="見出部"].pri {
+ margin-left: -0.4em;
+}
+
+span[data-name="見出仮名"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+rt[data-name="表音表記"] {
+ font-size: 0.65em;
+}
+
+rt[data-name="表外音訓マーク"] {
+ font-size: 0.65em;
+}
+
+rt[data-name="表外字マーク"] {
+ font-size: 0.65em;
+}
+
+span[data-name="解説部"] {
+ display: block;
+ margin-left: 1em;
+}
+
+span[data-name="大語義"] {
+ display: block;
+}
+
+span[data-name="語義"] {
+ display: block;
+}
+
+span[data-name="副義"] {
+ display: block;
+}
+
+span[data-name="用例G"] {
+ display: block;
+}
+
+span[data-name="注記"] span[data-name="用例G"] {
+ display: inline;
+}
+
+span[data-name="用例"] {
+ display: block;
+}
+
+span[data-name="注記"] span[data-name="用例"] {
+ display: inline;
+}
+
+span[data-name="見出語省略"] {
+ margin-left: 0.125em;
+ margin-right: 0.125em;
+}
+
+span[data-name="教育漢字"] {
+ color: green;
+}
+
+span[data-name="ルビ"] {
+ font-size: 0.7em;
+ vertical-align: 0.5em;
+}
+
+span[data-name="ルビ区切"] {
+ font-size: 0.7em;
+ vertical-align: 0.65em;
+}
+
+span[data-name="名詞形G"] {
+ display: block;
+}
+
+span[data-name="可能形G"] {
+ display: block;
+}
+
+span[data-name="参照G"] {
+ display: block;
+}
+
+span[data-name="参照"] {
+ color: blue;
+}
+
+span[data-name="子項目"],
+span[data-name="句項目"] {
+ display: block;
+ margin-bottom: 0.5em;
+}
+
+span[data-name="子項目F"],
+span[data-name="句項目F"] {
+ display: block;
+ margin-bottom: 0.5em;
+ margin-top: 0.5em;
+}
+
+span[data-name="子見出部"] {
+ display: block;
+}
+
+span[data-name="子解説部"] {
+ display: block;
+ margin-left: 1em;
+}
+
+span[data-name="句見出部"] {
+ display: block;
+}
+
+span[data-name="句解説部"] {
+ display: block;
+ margin-left: 1em;
+}
+
+span[data-name="運用解説"] {
+ display: block;
+}
+
+span[data-name="表記解説"] {
+ display: block;
+}
+
+span[data-name="文法解説"] {
+ display: block;
+}
+
+span[data-name="かぞえ方解説"] {
+ display: block;
+}
+
+span[data-name="派生"] {
+ display: block;
+ margin-left: 1.25em;
+}
+
+span[data-name="派生SubGF"] {
+ display: block;
+ text-indent: -1.25em;
+}
+
+span[data-name="派生SubG"] {
+ display: block;
+}
+
+span[data-name="派生SubGF"] span[data-name="用例G"] {
+ text-indent: 0;
+}
+
+span[data-name="派生見出"] {
+ font-weight: bold;
+}
+
+span[data-name="派生見出"].normal {
+ font-weight: normal
+}
+
+span[data-name="造語成分項目"] {
+ display: block;
+ margin-top: 1em;
+}
+
+span[data-name="造語成分見出"] {
+ font-size:1.4em;
+}
+
+span[data-name="EM"] {
+ font-weight: bold;
+}
+
+span[data-name="アクセント"] {
+ font-size: 0.7em;
+ vertical-align: super;
+}
+
+span[data-name="アクセント組M"] {
+ vertical-align: 0.1em;
+}
+
+
+span[data-name="反意語M"],
+span[data-name="同意語M"] {
+ vertical-align: 0.15em;
+}
+
+span[data-name="B"] {
+ font-weight: bold;
+}
+
+span[data-name="IT"] {
+ font-family: "Times New Roman";
+ font-style: italic;
+}
+
+span[data-name="EXCLAMATION"] {
+ font-family: "Times New Roman";
+ font-style: italic;
+ font-size: 1.2em;
+}
+
+span[data-name="歴史仮名"] {
+ font-family: serif;
+ font-size: 0.7em;
+ font-weight: normal;
+ vertical-align: 0.35em;
+ -webkit-user-select: nocopy;
+}
+
+span[data-name="出現形"] {
+ font-weight: bold;
+}
+
+span[data-name="品詞用法"] {
+ font-size: 0.7em;
+}
+
+span[data-name="品詞用法"] span[data-name="品詞G"] {
+ font-size: 1.2em;
+}
+
+span[data-name="基本構文型"] {
+ font-size: 0.8em;
+}
+
+span[data-name="基本構文em"] {
+ font-weight: bold;
+}
+
+span[data-name="ウ濁音参照"] {
+ font-family: sans-serif;
+ font-weight: bold;
+}
+
+span[data-name="rect"] {
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ border-width: 0.05em;
+ border-style: solid;
+ border-color: black;
+ word-break: keep-all;
+ -webkit-border-radius: 0.1em;
+}
+
+span[data-name="rect"].fill {
+ color: white;
+ border-style: none;
+ background-color: gray;
+}
+
+span[data-name="rect"].red {
+ color: red;
+ border-color: red;
+}
+
+span[data-name="rect"].redfill {
+ color: white;
+ border-style: none;
+ background-color: red;
+}
+
+span[data-name="red"] {
+ color: red;
+}
+
+span[data-name="大語義番号"],
+span[data-name="語義番号"],
+span[data-name="副義番号"] {
+ margin-right: 0.25em;
+ font-family: sans-serif;
+}
+
+span[data-name="ref"] span[data-name="大語義番号"],
+span[data-name="ref"] span[data-name="語義番号"],
+span[data-name="ref"] span[data-name="副義番号"] {
+ font-size: 0.8em;
+ margin-right: 0;
+}
+
+span[data-name="表外字マーク"] {
+ vertical-align: 0.5em;
+}
+
+span[data-name="表外音訓マーク"] {
+ font-size: 0.5em;
+ vertical-align: 0.5em;
+}
+
+span[data-name="言換M"] {
+ font-size: 0.5em;
+}
+
+span[data-name="字音語参照項目"] {
+ display: block;
+}
+
+span[data-name="本文項目M"] {
+ font-size: 0.7em;
+}
+
+span[data-name="運用解説M"],
+span[data-name="表記解説M"],
+span[data-name="文法解説M"],
+span[data-name="かぞえ方解説M"],
+span[data-name="派生M"] {
+ margin-right: 0.25em;
+ font-family: sans-serif;
+}
+
+span[data-name="派生ロゴ"] {
+ margin-left: 0.1em;
+ margin-right: 0.1em;
+}
+
+span[data-name="文字"] {
+ margin: 0 0.2em;
+}
+
+span[data-name="二分"] {
+ font-size: 0.5em;
+}
+
+span[data-name="四分"] {
+ font-size: 0.25em;
+}
+
+span[data-name="ref"] {
+ margin-left: 0.1em;
+ margin-right: 0.1em;
+}
+
+span[data-name="ref-small"] {
+ font-size: 0.7em;
+}
+
+span[data-name="sup"] {
+ font-size: 0.6em;
+}
+
+span[data-name="外字"] img {
+ height: 1em;
+}
+
+img.audio {
+ height: 1em;
+ margin: 0 0.25em;
+}
+
+img.外字 {
+ height: 1em;
+}
+
+img.外字欧 {
+ height: 1em;
+}
+
+span[data-name="レ点M"] {
+ font-size: 0.6em;
+ vertical-align: -0.7em;
+}
+
+a {
+ text-decoration: none;
+}
+
+span[data-name="audio"] a {
+ padding-bottom: 0;
+ border-bottom: none;
+}
+
+span[data-name="アクセント"] a,
+span[data-name="古語M"] a,
+span[data-name="雅語M"] a,
+span[data-name="派生M"] a,
+span[data-name="原籍M"] a,
+span[data-name="品詞M"] a {
+ color: black;
+ border-bottom-style: none;
+}
+
+
+span[data-name="歴史仮名"]:before,
+span[data-name="ルビ"]:before {
+ content: "(";
+}
+
+span[data-name="歴史仮名"]:after,
+span[data-name="ルビ"]:after {
+ content: ")";
+}
+
+div[data-child-links] {
+ padding-top: 1em;
+}
+
+div[data-child-links] ul {
+ margin: 0;
+ padding-left: 2em;
+}
+
+div[data-child-links] span {
+ padding: 0.1em;
+ font-family: sans-serif;
+ font-size: 0.8em;
+ color: white;
+ border-width: 0.05em;
+ border-style: none;
+ border-color: black;
+ word-break: keep-all;
+ -webkit-border-radius: 0.2em;
+}
+
+div[data-child-links="子項目"] span {
+ background-color: rgb(153, 42, 103);
+}
+
+div[data-child-links="句項目"] span {
+ background-color: rgb(176, 127, 57);
+}
+
+span.pri > span.外字 {
+ font-size: 0.65em;
+ vertical-align: super;
+}
+
+
+
diff --git a/data/mdict/description/daijirin2.mdx.description.html b/data/mdict/description/daijirin2.mdx.description.html
new file mode 100644
index 0000000..c1eb401
--- /dev/null
+++ b/data/mdict/description/daijirin2.mdx.description.html
@@ -0,0 +1,7 @@
+大辞林 第四版
+
+https://www.monokakido.jp/ja/dictionaries/daijirin2/index.html
+
+{{revision}}
+
+{{attribution}}
diff --git a/data/mdict/description/jitenon-kokugo.mdx.description.html b/data/mdict/description/jitenon-kokugo.mdx.description.html
new file mode 100644
index 0000000..a1c7489
--- /dev/null
+++ b/data/mdict/description/jitenon-kokugo.mdx.description.html
@@ -0,0 +1,7 @@
+国語辞典オンライン
+
+https://kokugo.jitenon.jp/
+
+{{revision}}
+
+{{attribution}}
diff --git a/data/mdict/description/jitenon-kotowaza.mdx.description.html b/data/mdict/description/jitenon-kotowaza.mdx.description.html
new file mode 100644
index 0000000..b6d3c99
--- /dev/null
+++ b/data/mdict/description/jitenon-kotowaza.mdx.description.html
@@ -0,0 +1,7 @@
+故事・ことわざ・慣用句オンライン
+
+https://kotowaza.jitenon.jp/
+
+{{revision}}
+
+{{attribution}}
diff --git a/data/mdict/description/jitenon-yoji.mdx.description.html b/data/mdict/description/jitenon-yoji.mdx.description.html
new file mode 100644
index 0000000..d7e3729
--- /dev/null
+++ b/data/mdict/description/jitenon-yoji.mdx.description.html
@@ -0,0 +1,7 @@
+四字熟語辞典オンライン
+
+https://yoji.jitenon.jp/
+
+{{revision}}
+
+{{attribution}}
diff --git a/data/mdict/description/smk8.mdx.description.html b/data/mdict/description/smk8.mdx.description.html
new file mode 100644
index 0000000..7486250
--- /dev/null
+++ b/data/mdict/description/smk8.mdx.description.html
@@ -0,0 +1,7 @@
+新明解国語辞典 第八版
+
+https://www.monokakido.jp/ja/dictionaries/smk8/index.html
+
+{{revision}}
+
+{{attribution}}
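
The `{{revision}}` and `{{attribution}}` tokens in these description files are presumably substituted at export time with the values computed by the exporter (cf. `_get_revision` above). A minimal sketch of that substitution; the exporter's actual template handling is not shown in this patch.

```
def fill_description(template_path, revision, attribution):
    with open(template_path, encoding="utf-8") as f:
        text = f.read()
    text = text.replace("{{revision}}", revision)
    return text.replace("{{attribution}}", attribution)
```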
diff --git a/data/mdict/icon/jitenon-kokugo.png b/data/mdict/icon/jitenon-kokugo.png
new file mode 100644
index 0000000..1ef1eb3
Binary files /dev/null and b/data/mdict/icon/jitenon-kokugo.png differ
diff --git a/data/mdict/icon/jitenon-kotowaza.png b/data/mdict/icon/jitenon-kotowaza.png
new file mode 100644
index 0000000..15ccb92
Binary files /dev/null and b/data/mdict/icon/jitenon-kotowaza.png differ
diff --git a/data/mdict/icon/jitenon-yoji.png b/data/mdict/icon/jitenon-yoji.png
new file mode 100644
index 0000000..0603db0
Binary files /dev/null and b/data/mdict/icon/jitenon-yoji.png differ
diff --git a/data/mdict/title/daijirin2.mdx.title.html b/data/mdict/title/daijirin2.mdx.title.html
new file mode 100644
index 0000000..43fdfd7
--- /dev/null
+++ b/data/mdict/title/daijirin2.mdx.title.html
@@ -0,0 +1 @@
+大辞林 第四版
diff --git a/data/mdict/title/jitenon-kokugo.mdx.title.html b/data/mdict/title/jitenon-kokugo.mdx.title.html
new file mode 100644
index 0000000..3fee892
--- /dev/null
+++ b/data/mdict/title/jitenon-kokugo.mdx.title.html
@@ -0,0 +1 @@
+国語辞典オンライン
diff --git a/data/mdict/title/jitenon-kotowaza.mdx.title.html b/data/mdict/title/jitenon-kotowaza.mdx.title.html
new file mode 100644
index 0000000..438ffaf
--- /dev/null
+++ b/data/mdict/title/jitenon-kotowaza.mdx.title.html
@@ -0,0 +1 @@
+故事・ことわざ・慣用句オンライン
diff --git a/data/mdict/title/jitenon-yoji.mdx.title.html b/data/mdict/title/jitenon-yoji.mdx.title.html
new file mode 100644
index 0000000..456dc99
--- /dev/null
+++ b/data/mdict/title/jitenon-yoji.mdx.title.html
@@ -0,0 +1 @@
+四字熟語辞典オンライン
diff --git a/data/mdict/title/smk8.mdx.title.html b/data/mdict/title/smk8.mdx.title.html
new file mode 100644
index 0000000..9f41cd1
--- /dev/null
+++ b/data/mdict/title/smk8.mdx.title.html
@@ -0,0 +1 @@
+新明解国語辞典 第八版
diff --git a/data/smk8/mdict_name_conversion.json b/data/smk8/mdict_name_conversion.json
new file mode 100644
index 0000000..b9a4387
--- /dev/null
+++ b/data/smk8/mdict_name_conversion.json
@@ -0,0 +1,25 @@
+{
+ "a": {},
+ "br": {},
+ "img": {},
+ "div": {},
+ "span": {},
+ "表外字": {
+ "name": "ruby"
+ },
+ "表外字マーク": {
+ "name": "rt"
+ },
+ "表外音訓": {
+ "name": "ruby"
+ },
+ "表外音訓マーク": {
+ "name": "rt"
+ },
+ "表音式": {
+ "name": "ruby"
+ },
+ "表音表記": {
+ "name": "rt"
+ }
+}
diff --git a/data/smk8/yomichan_name_conversion.json b/data/smk8/yomichan_name_conversion.json
index 82c491f..91a6593 100644
--- a/data/smk8/yomichan_name_conversion.json
+++ b/data/smk8/yomichan_name_conversion.json
@@ -121,25 +121,31 @@
"style": "font-weight: bold;"
},
"表外字": {
- "name": "ruby",
+ "name": "ruby"
+ },
+ "表外字マーク": {
+ "name": "rt",
"procedures": [
{
- "procedure_name": "add_ruby_text",
+ "procedure_name": "insert_span",
"parameters": {
- "mark": "︿",
- "style": "font-size: 2em;"
+ "attr_name": "style",
+ "attr_val": "font-size: 2em;"
}
}
]
},
"表外音訓": {
- "name": "ruby",
+ "name": "ruby"
+ },
+ "表外音訓マーク": {
+ "name": "rt",
"procedures": [
{
- "procedure_name": "add_ruby_text",
+ "procedure_name": "insert_span",
"parameters": {
- "mark": "︽",
- "style": "font-size: 2em;"
+ "attr_name": "style",
+ "attr_val": "font-size: 2em;"
}
}
]
@@ -148,23 +154,7 @@
"name": "ruby"
},
"表音表記": {
- "name": "rt",
- "procedures": [
- {
- "procedure_name": "replace",
- "parameters": {
- "old": "(",
- "new": ""
- }
- },
- {
- "procedure_name": "replace",
- "parameters": {
- "old": ")",
- "new": ""
- }
- }
- ]
+ "name": "rt"
},
"派生見出": {
"name": "span",
diff --git a/jitenbot.py b/jitenbot.py
index 0a25b96..e988df9 100644
--- a/jitenbot.py
+++ b/jitenbot.py
@@ -17,11 +17,22 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import os
+import sys
import argparse
+import subprocess
from bot.targets import Targets
from bot.crawlers.factory import new_crawler
+def filename(f):
+ if not os.path.isfile(f):
+ raise argparse.ArgumentTypeError(f"`{f}` is not a valid filename")
+ elif not os.access(f, os.R_OK):
+ raise argparse.ArgumentTypeError(f"Cannot access file `{f}`")
+ else:
+ return f
+
+
def directory(d):
if not os.path.isdir(d):
raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
@@ -35,34 +46,71 @@ def parse_args(target_names):
parser = argparse.ArgumentParser(
prog="jitenbot",
description="Convert Japanese dictionary files to new formats.",
+ epilog="See README.md for details regarding media directory structures",
)
parser.add_argument(
"target",
choices=target_names,
- help="name of dictionary to convert"
+ help="name of dictionary to convert",
)
parser.add_argument(
"-p", "--page-dir",
help="path to directory containing XML page files",
- type=directory
+ type=directory,
)
parser.add_argument(
- "-i", "--image-dir",
- help="path to directory containing image folders (gaiji, graphics, etc.)",
- type=directory
+ "-m", "--media-dir",
+ help="path to directory containing media folders (gaiji, graphics, audio, etc.)",
+ type=directory,
+ )
+ parser.add_argument(
+ "-i", "--mdict-icon",
+ help="path to icon file to be used with MDict",
+ type=filename,
+ )
+ parser.add_argument(
+ "--no-yomichan-export",
+ help="skip export of dictionary data to Yomichan format",
+ action='store_true',
+ )
+ parser.add_argument(
+ "--no-mdict-export",
+ help="skip export of dictionary data to MDict format",
+ action='store_true',
)
args = parser.parse_args()
return args
+def test_mdict():
+ try:
+ subprocess.run(
+ ["mdict", "--version"],
+ check=True,
+ stdout=subprocess.DEVNULL,
+ )
+ except FileNotFoundError:
+ print("Could not find `mdict` pack tool.")
+ print("Ensure that mdict-utils is installed and")
+ print("included in the environment PATH.\n")
+ print("Mdict export functionality may also be")
+ print("disabled with the --no-mdict-export flag.")
+ sys.exit(1)
+
+
def main():
target_names = [x.value for x in Targets]
args = parse_args(target_names)
+ if not args.no_mdict_export:
+ test_mdict()
selected_target = Targets(args.target)
crawler = new_crawler(selected_target)
crawler.collect_pages(args.page_dir)
crawler.read_pages()
- crawler.make_yomichan_dictionary(args.image_dir)
+ if not args.no_yomichan_export:
+ crawler.make_yomichan_dictionary(args.media_dir)
+ if not args.no_mdict_export:
+ crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon)
if __name__ == "__main__":
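
`test_mdict` shells out to `mdict --version` so a missing `mdict-utils` install fails fast instead of partway through an export. An equivalent, slightly lighter probe (an alternative, not what this patch does) would be to look the executable up on PATH directly:

```
import shutil
import sys


def test_mdict():
    if shutil.which("mdict") is None:
        print("Could not find the `mdict` pack tool on PATH.")
        print("Install mdict-utils or pass --no-mdict-export.")
        sys.exit(1)
```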
diff --git a/requirements.txt b/requirements.txt
index 1c111af..8802356 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ css-parser==1.0.8
html5lib==1.1
idna==3.4
lxml==4.9.2
+mdict-utils==1.3.12
Pillow==9.5.0
platformdirs==3.5.0
requests==2.29.0
@@ -13,5 +14,7 @@ six==1.16.0
soupsieve==2.4.1
SudachiDict-full==20230110
SudachiPy==0.6.7
+tqdm==4.65.0
urllib3==1.26.15
webencodings==0.5.1
+xxhash==3.2.0
diff --git a/run_all.sh b/run_all.sh
new file mode 100644
index 0000000..2bdd31e
--- /dev/null
+++ b/run_all.sh
@@ -0,0 +1,13 @@
+python jitenbot.py jitenon-kokugo
+python jitenbot.py jitenon-yoji
+python jitenbot.py jitenon-kotowaza
+
+python jitenbot.py smk8 \
+ --media-dir monokakido/SMK8/media \
+ --page-dir monokakido/SMK8/pages \
+ --mdict-icon monokakido/SMK8/SMK8-76@3x.png
+
+python jitenbot.py daijirin2 \
+ --media-dir monokakido/DAIJIRIN2/media \
+ --page-dir monokakido/DAIJIRIN2/pages \
+ --mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png