From 4c837cd72d6fcca6140431fbfb058b3431438af2 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 8 Jul 2023 16:49:03 -0500 Subject: [PATCH] Add export support for the MDict dictionary format --- .gitignore | 1 + README.md | 64 ++- TODO.md | 4 +- bot/crawlers/crawlers.py | 19 +- bot/data.py | 8 +- bot/entries/daijirin2.py | 81 ++-- bot/entries/entry.py | 36 +- bot/entries/factory.py | 2 +- bot/entries/jitenon.py | 188 ++++---- bot/entries/smk8.py | 84 ++-- bot/entries/smk8_preprocess.py | 18 + bot/mdict/exporters/export.py | 204 ++++++++ bot/mdict/exporters/factory.py | 18 + bot/mdict/glossary/daijirin2.py | 77 +++ bot/mdict/glossary/jitenon.py | 141 ++++++ bot/mdict/glossary/smk8.py | 67 +++ bot/mdict/terms/daijirin2.py | 23 + bot/mdict/terms/factory.py | 18 + bot/mdict/terms/jitenon.py | 42 ++ bot/mdict/terms/smk8.py | 24 + bot/mdict/terms/terminator.py | 73 +++ .../glossary => }/name_conversion.py | 15 +- bot/yomichan/exporters/export.py | 43 +- bot/yomichan/exporters/factory.py | 2 +- bot/yomichan/glossary/daijirin2.py | 6 +- bot/yomichan/glossary/jitenon.py | 6 +- bot/yomichan/glossary/smk8.py | 6 +- bot/yomichan/terms/jitenon.py | 3 +- bot/yomichan/terms/terminator.py | 31 +- data/daijirin2/mdict_name_conversion.json | 12 + data/mdict/css/daijirin2.css | 414 ++++++++++++++++ data/mdict/css/jitenon-kokugo.css | 56 +++ data/mdict/css/jitenon-kotowaza.css | 40 ++ data/mdict/css/jitenon-yoji.css | 40 ++ data/mdict/css/smk8.css | 449 ++++++++++++++++++ .../daijirin2.mdx.description.html | 7 + .../jitenon-kokugo.mdx.description.html | 7 + .../jitenon-kotowaza.mdx.description.html | 7 + .../jitenon-yoji.mdx.description.html | 7 + .../description/smk8.mdx.description.html | 7 + data/mdict/icon/jitenon-kokugo.png | Bin 0 -> 2374 bytes data/mdict/icon/jitenon-kotowaza.png | Bin 0 -> 5473 bytes data/mdict/icon/jitenon-yoji.png | Bin 0 -> 2628 bytes data/mdict/title/daijirin2.mdx.title.html | 1 + .../mdict/title/jitenon-kokugo.mdx.title.html | 1 + .../title/jitenon-kotowaza.mdx.title.html | 1 + data/mdict/title/jitenon-yoji.mdx.title.html | 1 + data/mdict/title/smk8.mdx.title.html | 1 + data/smk8/mdict_name_conversion.json | 25 + data/smk8/yomichan_name_conversion.json | 40 +- jitenbot.py | 60 ++- requirements.txt | 3 + run_all.sh | 13 + 53 files changed, 2227 insertions(+), 269 deletions(-) create mode 100644 bot/mdict/exporters/export.py create mode 100644 bot/mdict/exporters/factory.py create mode 100644 bot/mdict/glossary/daijirin2.py create mode 100644 bot/mdict/glossary/jitenon.py create mode 100644 bot/mdict/glossary/smk8.py create mode 100644 bot/mdict/terms/daijirin2.py create mode 100644 bot/mdict/terms/factory.py create mode 100644 bot/mdict/terms/jitenon.py create mode 100644 bot/mdict/terms/smk8.py create mode 100644 bot/mdict/terms/terminator.py rename bot/{yomichan/glossary => }/name_conversion.py (88%) create mode 100644 data/daijirin2/mdict_name_conversion.json create mode 100644 data/mdict/css/daijirin2.css create mode 100644 data/mdict/css/jitenon-kokugo.css create mode 100644 data/mdict/css/jitenon-kotowaza.css create mode 100644 data/mdict/css/jitenon-yoji.css create mode 100644 data/mdict/css/smk8.css create mode 100644 data/mdict/description/daijirin2.mdx.description.html create mode 100644 data/mdict/description/jitenon-kokugo.mdx.description.html create mode 100644 data/mdict/description/jitenon-kotowaza.mdx.description.html create mode 100644 data/mdict/description/jitenon-yoji.mdx.description.html create mode 100644 data/mdict/description/smk8.mdx.description.html create mode 100644 data/mdict/icon/jitenon-kokugo.png create mode 100644 data/mdict/icon/jitenon-kotowaza.png create mode 100644 data/mdict/icon/jitenon-yoji.png create mode 100644 data/mdict/title/daijirin2.mdx.title.html create mode 100644 data/mdict/title/jitenon-kokugo.mdx.title.html create mode 100644 data/mdict/title/jitenon-kotowaza.mdx.title.html create mode 100644 data/mdict/title/jitenon-yoji.mdx.title.html create mode 100644 data/mdict/title/smk8.mdx.title.html create mode 100644 data/smk8/mdict_name_conversion.json create mode 100644 run_all.sh diff --git a/.gitignore b/.gitignore index b009cb5..4c7985d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ webcache/ output/ notes/ +monokakido/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 88d0f2b..5a872ea 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ compiling the scraped data into compact dictionary file formats. # Usage ``` -usage: jitenbot [-h] [-p PAGE_DIR] [-i IMAGE_DIR] +usage: jitenbot [-h] [-p PAGE_DIR] [-m MEDIA_DIR] [-i MDICT_ICON] + [--no-yomichan-export] [--no-mdict-export] {jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2} Convert Japanese dictionary files to new formats. @@ -62,9 +63,15 @@ options: -h, --help show this help message and exit -p PAGE_DIR, --page-dir PAGE_DIR path to directory containing XML page files - -i IMAGE_DIR, --image-dir IMAGE_DIR - path to directory containing image folders (gaiji, - graphics, etc.) + -m MEDIA_DIR, --media-dir MEDIA_DIR + path to directory containing media folders (gaiji, + graphics, audio, etc.) + -i MDICT_ICON, --mdict-icon MDICT_ICON + path to icon file to be used with MDict + --no-yomichan-export skip export of dictionary data to Yomichan format + --no-mdict-export skip export of dictionary data to MDict format + +See README.md for details regarding media directory structures ``` ### Online Targets Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/). @@ -75,8 +82,55 @@ HTTP request headers (user agent string, etc.) may be customized by editing the [user config directory](https://pypi.org/project/platformdirs/). ### Offline Targets -Page data and image data must be procured by the user +Page data and media data must be [procured by the user](https://github.com/golddranks/monokakido/) and passed to jitenbot via the appropriate command line flags. +
+ smk8 media directory + +Since Yomichan does not support audio files from imported +dictionaries, the `audio/` directory may be omitted to save filesize +space in the output ZIP file if desired. + +``` +media +├── Audio.png +├── audio +│   ├── 00001.aac +│   ├── 00002.aac +│   ├── 00003.aac +│   │  ... +│   └── 82682.aac +└── gaiji + ├── 1d110.svg + ├── 1d15d.svg + ├── 1d15e.svg +    │  ... + └── xbunnoa.svg +``` +
+ +
+ daijirin2 media directory + +The `graphics/` directory may be omitted to save space if desired. + +``` +media +├── gaiji +│   ├── 1D10B.svg +│   ├── 1D110.svg +│   ├── 1D12A.svg +│   │  ... +│   └── vectorOB.svg +└── graphics + ├── 3djr_0002.png + ├── 3djr_0004.png + ├── 3djr_0005.png +    │  ... + └── 4djr_yahazu.png +``` +
+ # Attribution `Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1). diff --git a/TODO.md b/TODO.md index 30c860d..2f2a5d5 100644 --- a/TODO.md +++ b/TODO.md @@ -1,11 +1,11 @@ ### Todo +- [x] Add factory classes to reduce the amount of class import statements +- [x] Support exporting to MDict (.MDX) dictionary format - [ ] Add test suite - [ ] Add documentation (docstrings, etc.) - [ ] Validate JSON schema of Yomichan terms during export -- [ ] Add factory classes to reduce the amount of class import statements - [ ] Add build scripts for producing program binaries -- [ ] Support exporting to MDict (.MDX) dictionary format - [ ] Validate scraped webpages after downloading - [ ] Log non-fatal failures to a log file instead of raising exceptions - [ ] Support more dictionary websites diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py index c7bf8ea..97b3794 100644 --- a/bot/crawlers/crawlers.py +++ b/bot/crawlers/crawlers.py @@ -5,7 +5,8 @@ from bs4 import BeautifulSoup import bot.scraper as Scraper from bot.entries.factory import new_entry -from bot.yomichan.exporters.factory import new_exporter +from bot.yomichan.exporters.factory import new_yomi_exporter +from bot.mdict.exporters.factory import new_mdict_exporter class Crawler(ABC): @@ -38,9 +39,13 @@ class Crawler(ABC): self._entries.append(entry) print() - def make_yomichan_dictionary(self, image_dir): - exporter = new_exporter(self._target) - exporter.export(self._entries, image_dir) + def make_yomichan_dictionary(self, media_dir): + exporter = new_yomi_exporter(self._target) + exporter.export(self._entries, media_dir) + + def make_mdict_dictionary(self, media_dir, icon_file): + exporter = new_mdict_exporter(self._target) + exporter.export(self._entries, media_dir, icon_file) def _parse_page_id(self, page_link): m = re.search(self._page_id_pattern, page_link) @@ -142,10 +147,8 @@ class _MonokakidoCrawler(Crawler): class Smk8Crawler(_MonokakidoCrawler): - def __init__(self, target): - super().__init__(target) + pass class Daijirin2Crawler(_MonokakidoCrawler): - def __init__(self, target): - super().__init__(target) + pass diff --git a/bot/data.py b/bot/data.py index 5d68769..3b1effd 100644 --- a/bot/data.py +++ b/bot/data.py @@ -99,15 +99,15 @@ def load_daijirin2_kana_abbreviations(): @cache -def load_smk8_yomichan_name_conversion(): - file_name = os.path.join("smk8", "yomichan_name_conversion.json") +def load_yomichan_name_conversion(target): + file_name = os.path.join(target.value, "yomichan_name_conversion.json") data = __load_json(file_name) return data @cache -def load_daijirin2_yomichan_name_conversion(): - file_name = os.path.join("daijirin2", "yomichan_name_conversion.json") +def load_mdict_name_conversion(target): + file_name = os.path.join(target.value, "mdict_name_conversion.json") data = __load_json(file_name) return data diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py index 1463442..196bd0c 100644 --- a/bot/entries/daijirin2.py +++ b/bot/entries/daijirin2.py @@ -1,4 +1,3 @@ -import re from bs4 import BeautifulSoup import bot.entries.expressions as Expressions @@ -10,19 +9,17 @@ from bot.entries.daijirin2_preprocess import preprocess_page class _BaseDaijirin2Entry(Entry): - ID_TO_ENTRY = {} - SUBENTRY_ID_TO_ENTRY_ID = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: {entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.children = [] self.phrases = [] self._kana_abbreviations = load_daijirin2_kana_abbreviations() + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + def set_page(self, page): page = self.__decompose_subentries(page) self._page = page @@ -57,14 +54,7 @@ class _BaseDaijirin2Entry(Entry): else: self._part_of_speech_tags.append(pos) - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - - def _set_regular_headwords(self, soup): + def _get_regular_headwords(self, soup): self._fill_alts(soup) reading = soup.find("見出仮名").text expressions = [] @@ -78,10 +68,11 @@ class _BaseDaijirin2Entry(Entry): expressions = Expressions.expand_abbreviation_list(expressions) if len(expressions) == 0: expressions.append(reading) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) @@ -101,7 +92,7 @@ class _BaseDaijirin2Entry(Entry): tag_soup.name = "項目" subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(subentry_id) + subentry = subentry_class(self.target, subentry_id) page = tag_soup.decode() subentry.set_page(page) subentry_list.append(subentry) @@ -122,6 +113,8 @@ class _BaseDaijirin2Entry(Entry): @staticmethod def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" unused_nodes = [ "漢字音logo", "活用分節", "連語句活用分節", "語構成", "表外字マーク", "表外字マーク", "ルビG" @@ -144,25 +137,26 @@ class _BaseDaijirin2Entry(Entry): class Daijirin2Entry(_BaseDaijirin2Entry): - def __init__(self, page_id): + def __init__(self, target, page_id): entry_id = (page_id, 0) - super().__init__(entry_id) + super().__init__(target, entry_id) def set_page(self, page): page = preprocess_page(page) super().set_page(page) - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() self._delete_unused_nodes(soup) if soup.find("漢字見出") is not None: - self._set_kanji_headwords(soup) + headwords = self._get_kanji_headwords(soup) elif soup.find("略語G") is not None: - self._set_acronym_headwords(soup) + headwords = self._get_acronym_headwords(soup) else: - self._set_regular_headwords(soup) + headwords = self._get_regular_headwords(soup) + return headwords - def _set_kanji_headwords(self, soup): + def _get_kanji_headwords(self, soup): readings = [] for el in soup.find_all("漢字音"): hira = Expressions.kata_to_hira(el.text) @@ -172,11 +166,12 @@ class Daijirin2Entry(_BaseDaijirin2Entry): expressions = [] for el in soup.find_all("漢字見出"): expressions.append(el.text) - self._headwords = {} + headwords = {} for reading in readings: - self._headwords[reading] = expressions + headwords[reading] = expressions + return headwords - def _set_acronym_headwords(self, soup): + def _get_acronym_headwords(self, soup): expressions = [] for el in soup.find_all("略語"): expression_parts = [] @@ -184,29 +179,24 @@ class Daijirin2Entry(_BaseDaijirin2Entry): expression_parts.append(part.text) expression = "".join(expression_parts) expressions.append(expression) - self._headwords = {"": expressions} + headwords = {"": expressions} + return headwords class Daijirin2ChildEntry(_BaseDaijirin2Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() self._delete_unused_nodes(soup) - self._set_regular_headwords(soup) + headwords = self._get_regular_headwords(soup) + return headwords class Daijirin2PhraseEntry(_BaseDaijirin2Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - self.__phrase_readings = load_daijirin2_phrase_readings() - def get_part_of_speech_tags(self): # phrases do not contain these tags return [] - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() headwords = {} expressions = self._find_expressions(soup) @@ -217,7 +207,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): headwords[reading].append(expression) else: headwords[reading] = [expression] - self._headwords = headwords + return headwords def _find_expressions(self, soup): self._delete_unused_nodes(soup) @@ -231,7 +221,8 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): return expressions def _find_readings(self): - text = self.__phrase_readings[self.entry_id] + phrase_readings = load_daijirin2_phrase_readings() + text = phrase_readings[self.entry_id] alternatives = Expressions.expand_daijirin_alternatives(text) readings = [] for alt in alternatives: diff --git a/bot/entries/entry.py b/bot/entries/entry.py index 57316f6..3811a77 100644 --- a/bot/entries/entry.py +++ b/bot/entries/entry.py @@ -2,12 +2,24 @@ from abc import ABC, abstractmethod class Entry(ABC): - def __init__(self, entry_id): + ID_TO_ENTRY = {} + SUBENTRY_ID_TO_ENTRY_ID = {} + + def __init__(self, target, entry_id): + if entry_id not in self.ID_TO_ENTRY: + self.ID_TO_ENTRY[entry_id] = self + else: + raise Exception(f"Duplicate entry ID: {entry_id}") + self.target = target self.entry_id = entry_id self._page = None self._headwords = None self._part_of_speech_tags = None + @abstractmethod + def get_global_identifier(self): + pass + @abstractmethod def set_page(self, page): pass @@ -16,14 +28,34 @@ class Entry(ABC): def get_page_soup(self): pass - @abstractmethod def get_headwords(self): + if self._headwords is not None: + return self._headwords + headwords = self._get_headwords() + self._add_variant_expressions(headwords) + self._headwords = headwords + return headwords + + @abstractmethod + def _get_headwords(self): + pass + + @abstractmethod + def _add_variant_expressions(self, headwords): pass @abstractmethod def get_part_of_speech_tags(self): pass + def get_parent(self): + if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID: + parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] + parent = self.ID_TO_ENTRY[parent_id] + else: + parent = None + return parent + def get_first_expression(self): headwords = self.get_headwords() expressions = next(iter(headwords.values())) diff --git a/bot/entries/factory.py b/bot/entries/factory.py index 23ca066..a3dec69 100644 --- a/bot/entries/factory.py +++ b/bot/entries/factory.py @@ -15,4 +15,4 @@ def new_entry(target, page_id): Targets.SMK8: Smk8Entry, Targets.DAIJIRIN2: Daijirin2Entry, } - return entry_map[target](page_id) + return entry_map[target](target, page_id) diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index fd9fcd2..65c4d2e 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -1,4 +1,5 @@ import re +from abc import abstractmethod from datetime import datetime, date from bs4 import BeautifulSoup @@ -7,18 +8,17 @@ import bot.entries.expressions as Expressions class _JitenonEntry(Entry): - ID_TO_ENTRY = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: {entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.expression = "" + self.yomikata = "" + self.definition = "" + self.other_forms = [] self.modified_date = date(1970, 1, 1) self.attribution = "" - for column in self._COLUMNS.values(): - setattr(self, column[0], column[1]) + + def get_global_identifier(self): + return f"@{self.target.value}-{format(self.entry_id, '06')}" def set_page(self, page): soup = BeautifulSoup(page, features="html5lib") @@ -39,36 +39,33 @@ class _JitenonEntry(Entry): soup = BeautifulSoup(self._page, "html5lib") return soup - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - def get_part_of_speech_tags(self): # Jitenon doesn't have any return [] - def _set_headwords(self): + def _get_headwords(self): headwords = {} - for yomikata in self._yomikatas(): - headwords[yomikata] = [self.expression] - ikei_headwords = self._ikei_headwords() - for reading, expressions in ikei_headwords.items(): + for reading in self._get_readings(): + headwords[reading] = [self.expression] + other_form_headwords = self._other_form_headwords() + for reading, expressions in other_form_headwords.items(): if reading not in headwords: headwords[reading] = [] for expression in expressions: if expression not in headwords[reading]: headwords[reading].append(expression) - self._headwords = headwords + return headwords + + @abstractmethod + def _get_column_map(self): + pass def __set_modified_date(self, page): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page) if m is None: return - date = datetime.strptime(m.group(1), '%Y-%m-%d').date() - self.modified_date = date + modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date() + self.modified_date = modified_date def __set_attribution(self, soup): attribution = soup.find(class_="copyright") @@ -78,7 +75,8 @@ class _JitenonEntry(Entry): self.attribution = "" def __set_column(self, colname, colval): - attr_name = self._COLUMNS[colname][0] + column_map = self._get_column_map() + attr_name = column_map[colname] attr_value = getattr(self, attr_name) if isinstance(attr_value, str): setattr(self, attr_name, colval) @@ -88,7 +86,7 @@ class _JitenonEntry(Entry): else: attr_value.append(colval) - def _yomikatas(self): + def _get_readings(self): yomikata = self.yomikata m = re.search(r"^[ぁ-ヿ、]+$", yomikata) if m: @@ -109,20 +107,20 @@ class _JitenonEntry(Entry): print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") return [""] - def _ikei_headwords(self): - ikei_headwords = {} - for val in self.ikei: + def _other_form_headwords(self): + other_form_headwords = {} + for val in self.other_forms: m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val) if not m: print(f"Invalid 異形 format: {val}\n{self}\n") continue expression = m.group(1) reading = m.group(2) - if reading not in ikei_headwords: - ikei_headwords[reading] = [] - if expression not in ikei_headwords[reading]: - ikei_headwords[reading].append(expression) - return ikei_headwords + if reading not in other_form_headwords: + other_form_headwords[reading] = [] + if expression not in other_form_headwords[reading]: + other_form_headwords[reading].append(expression) + return other_form_headwords @staticmethod def __clean_text(text): @@ -133,9 +131,10 @@ class _JitenonEntry(Entry): return text def __str__(self): + column_map = self._get_column_map() colvals = [str(self.entry_id)] - for attr in self._COLUMNS.values(): - attr_val = getattr(self, attr[0]) + for attr_name in column_map.values(): + attr_val = getattr(self, attr_name) if isinstance(attr_val, str): colvals.append(attr_val) elif isinstance(attr_val, list): @@ -144,83 +143,100 @@ class _JitenonEntry(Entry): class JitenonYojiEntry(_JitenonEntry): - _COLUMNS = { - "四字熟語": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "漢検級": ["kankenkyuu", ""], - "場面用途": ["bamenyouto", ""], - "異形": ["ikei", []], - "類義語": ["ruigigo", []], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.kanken_level = "" + self.category = "" + self.related_expressions = [] - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "四字熟語": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "漢検級": "kanken_level", + "場面用途": "category", + "類義語": "related_expressions", + } - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) class JitenonKotowazaEntry(_JitenonEntry): - _COLUMNS = { - "言葉": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "例文": ["reibun", ""], - "異形": ["ikei", []], - "類句": ["ruiku", []], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.example = "" + self.related_expressions = [] - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "例文": "example", + "類句": "related_expressions", + } - def _set_headwords(self): + def _get_headwords(self): if self.expression == "金棒引き・鉄棒引き": - self._headwords = { + headwords = { "かなぼうひき": ["金棒引き", "鉄棒引き"] } else: - super()._set_headwords() + headwords = super()._get_headwords() + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) class JitenonKokugoEntry(_JitenonEntry): - _COLUMNS = { - "言葉": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "例文": ["reibun", ""], - "別表記": ["betsuhyouki", ""], - "対義語": ["taigigo", ""], - "活用": ["katsuyou", ""], - "用例": ["yourei", ""], - "類語": ["ruigo", ""], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.example = "" + self.alt_expression = "" + self.antonym = "" + self.attachments = "" + self.compounds = "" + self.related_words = "" - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "例文": "example", + "別表記": "alt_expression", + "対義語": "antonym", + "活用": "attachments", + "用例": "compounds", + "類語": "related_words", + } - def _set_headwords(self): + def _get_headwords(self): headwords = {} for reading in self.yomikata.split("・"): if reading not in headwords: headwords[reading] = [] for expression in self.expression.split("・"): headwords[reading].append(expression) - if self.betsuhyouki.strip() != "": - for expression in self.betsuhyouki.split("・"): + if self.alt_expression.strip() != "": + for expression in self.alt_expression.split("・"): headwords[reading].append(expression) - self._headwords = headwords + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py index 11ef7e6..2308893 100644 --- a/bot/entries/smk8.py +++ b/bot/entries/smk8.py @@ -1,4 +1,3 @@ -import re from bs4 import BeautifulSoup import bot.entries.expressions as Expressions @@ -9,19 +8,17 @@ from bot.entries.smk8_preprocess import preprocess_page class _BaseSmk8Entry(Entry): - ID_TO_ENTRY = {} - SUBENTRY_ID_TO_ENTRY_ID = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: {entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.children = [] self.phrases = [] self.kanjis = [] + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + def set_page(self, page): page = self.__decompose_subentries(page) self._page = page @@ -30,13 +27,6 @@ class _BaseSmk8Entry(Entry): soup = BeautifulSoup(self._page, "xml") return soup - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - def get_part_of_speech_tags(self): if self._part_of_speech_tags is not None: return self._part_of_speech_tags @@ -50,8 +40,8 @@ class _BaseSmk8Entry(Entry): self._part_of_speech_tags.append(tag.text) return self._part_of_speech_tags - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) @@ -87,7 +77,7 @@ class _BaseSmk8Entry(Entry): tag_soup.name = "項目" subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(subentry_id) + subentry = subentry_class(self.target, subentry_id) page = tag_soup.decode() subentry.set_page(page) subentry_list.append(subentry) @@ -106,6 +96,16 @@ class _BaseSmk8Entry(Entry): else: raise Exception(f"Invalid entry ID: {id_string}") + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + @staticmethod def _clean_expression(expression): for x in ["〈", "〉", "{", "}", "…", " "]: @@ -114,24 +114,24 @@ class _BaseSmk8Entry(Entry): @staticmethod def _fill_alts(soup): - for e in soup.find_all(["親見出仮名", "親見出表記"]): - e.string = e.attrs["alt"] + for el in soup.find_all(["親見出仮名", "親見出表記"]): + el.string = el.attrs["alt"] for gaiji in soup.find_all("外字"): gaiji.string = gaiji.img.attrs["alt"] class Smk8Entry(_BaseSmk8Entry): - def __init__(self, page_id): + def __init__(self, target, page_id): entry_id = (page_id, 0) - super().__init__(entry_id) + super().__init__(target, entry_id) def set_page(self, page): page = preprocess_page(page) super().set_page(page) - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() - Soup.delete_soup_nodes(soup, "表音表記") + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self._find_reading(soup) expressions = [] @@ -140,16 +140,14 @@ class Smk8Entry(_BaseSmk8Entry): for expression in self._find_expressions(soup): if expression not in expressions: expressions.append(expression) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords class Smk8ChildEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() - Soup.delete_soup_nodes(soup, "表音表記") + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self._find_reading(soup) expressions = [] @@ -158,19 +156,20 @@ class Smk8ChildEntry(_BaseSmk8Entry): for expression in self._find_expressions(soup): if expression not in expressions: expressions.append(expression) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords class Smk8PhraseEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.__phrase_readings = load_smk8_phrase_readings() def get_part_of_speech_tags(self): # phrases do not contain these tags return [] - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() headwords = {} expressions = self._find_expressions(soup) @@ -181,10 +180,10 @@ class Smk8PhraseEntry(_BaseSmk8Entry): headwords[reading].append(expression) else: headwords[reading] = [expression] - self._headwords = headwords + return headwords def _find_expressions(self, soup): - Soup.delete_soup_nodes(soup, "ルビG") + self._delete_unused_nodes(soup) self._fill_alts(soup) text = soup.find("標準表記").text text = self._clean_expression(text) @@ -206,15 +205,14 @@ class Smk8PhraseEntry(_BaseSmk8Entry): class Smk8KanjiEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self.__get_parent_reading() expressions = self._find_expressions(soup) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords def __get_parent_reading(self): parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8_preprocess.py index 2e480a8..5c9b924 100644 --- a/bot/entries/smk8_preprocess.py +++ b/bot/entries/smk8_preprocess.py @@ -15,6 +15,7 @@ def preprocess_page(page): page = __strip_page(page) page = __replace_glyph_codes(page) page = __format_hyougai_marks(page) + page = __remove_pronunciation_parentheses(page) return page @@ -64,6 +65,7 @@ def __format_hyougai_marks(page): for x in ["\n", "\t", " "]: text = text.replace(x, "") text = re.sub(r"〈([^〈]+)〉", r"\1", text) + page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page) for mark in re.findall(r"《.", text): if mark[1] == "〓": @@ -79,13 +81,29 @@ def __format_hyougai_marks(page): page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})", r"\1<表外字>\2", page, count=1) + page = page.replace("␂", "〈") page = page.replace("␃", "〉") soup = BeautifulSoup(page, features="xml") + for el in soup.find_all("表外音訓"): if el.text == "": el.append(el.next_sibling) + mark_xml = "<表外音訓マーク>︽" + mark_soup = BeautifulSoup(mark_xml, "xml") + el.append(mark_soup.表外音訓マーク) + for el in soup.find_all("表外字"): if el.text == "": el.append(el.next_sibling) + mark_xml = "<表外字マーク>︿" + mark_soup = BeautifulSoup(mark_xml, "xml") + el.append(mark_soup.表外字マーク) + return soup.decode() + + +def __remove_pronunciation_parentheses(page): + page = page.replace("<表音表記>(", "<表音表記>") + page = page.replace(")", "") + return page diff --git a/bot/mdict/exporters/export.py b/bot/mdict/exporters/export.py new file mode 100644 index 0000000..2d76f1d --- /dev/null +++ b/bot/mdict/exporters/export.py @@ -0,0 +1,204 @@ +# pylint: disable=too-few-public-methods + +import subprocess +import os +import shutil +from abc import ABC, abstractmethod +from pathlib import Path +from datetime import datetime +from platformdirs import user_documents_dir, user_cache_dir + +from bot.targets import Targets +from bot.mdict.terms.factory import new_terminator + + +class Exporter(ABC): + def __init__(self, target): + self._target = target + self._terminator = new_terminator(target) + self._build_dir = None + self._build_media_dir = None + self._description_file = None + self._out_dir = None + + def export(self, entries, media_dir, icon_file): + self._init_build_media_dir(media_dir) + self._init_description_file(entries) + terms = self._get_terms(entries) + print(f"Exporting {len(terms)} Mdict keys...") + self._write_mdx_file(terms) + self._write_mdd_file() + self._write_icon_file(icon_file) + self._rm_build_dir() + + def _get_build_dir(self): + if self._build_dir is not None: + return self._build_dir + cache_dir = user_cache_dir("jitenbot") + build_directory = os.path.join(cache_dir, "mdict_build") + if Path(build_directory).is_dir(): + shutil.rmtree(build_directory) + os.makedirs(build_directory) + self._build_dir = build_directory + return self._build_dir + + def _init_build_media_dir(self, media_dir): + build_dir = self._get_build_dir() + build_media_dir = os.path.join(build_dir, self._target.value) + if media_dir is not None: + print("Copying media files to build directory...") + shutil.copytree(media_dir, build_media_dir) + else: + os.makedirs(build_media_dir) + css_file = self._get_css_file() + shutil.copy(css_file, build_media_dir) + self._terminator.set_media_dir(build_media_dir) + self._build_media_dir = build_media_dir + + def _init_description_file(self, entries): + filename = f"{self._target.value}.mdx.description.html" + original_file = os.path.join( + "data", "mdict", "description", filename) + with open(original_file, "r", encoding="utf8") as f: + description = f.read() + description = description.replace( + "{{revision}}", self._get_revision(entries)) + description = description.replace( + "{{attribution}}", self._get_attribution(entries)) + build_dir = self._get_build_dir() + description_file = os.path.join(build_dir, filename) + with open(description_file, "w", encoding="utf8") as f: + f.write(description) + self._description_file = description_file + + def _get_terms(self, entries): + terms = [] + entries_len = len(entries) + for idx, entry in enumerate(entries): + update = f"Creating Mdict terms for entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + new_terms = self._terminator.make_terms(entry) + for term in new_terms: + terms.append(term) + print() + return terms + + def _write_mdx_file(self, terms): + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.mdx") + params = [ + "mdict", + "-a", self._get_term_file(terms), + "--title", self._get_title_file(), + "--description", self._description_file, + out_file + ] + subprocess.run(params, check=True) + + def _write_mdd_file(self): + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.mdd") + params = [ + "mdict", + "-a", self._build_media_dir, + "--title", self._get_title_file(), + "--description", self._description_file, + out_file + ] + subprocess.run(params, check=True) + + def _write_icon_file(self, icon_file): + premade_icon_file = f"data/mdict/icon/{self._target.value}.png" + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.png") + if icon_file is not None and Path(icon_file).is_file(): + shutil.copy(icon_file, out_file) + elif Path(premade_icon_file).is_file(): + shutil.copy(premade_icon_file, out_file) + + def _get_out_dir(self): + if self._out_dir is not None: + return self._out_dir + out_dir = os.path.join( + user_documents_dir(), "jitenbot", "mdict", self._target.value) + if Path(out_dir).is_dir(): + shutil.rmtree(out_dir) + os.makedirs(out_dir) + self._out_dir = out_dir + return out_dir + + def _get_term_file(self, terms): + build_dir = self._get_build_dir() + term_file = os.path.join(build_dir, f"{self._target.value}.mdx.txt") + with open(term_file, "w", encoding="utf8") as f: + for term in terms: + f.write("\n".join(term)) + f.write("\n\n") + return term_file + + def _get_title_file(self): + return os.path.join( + "data", "mdict", "title", + f"{self._target.value}.mdx.title.html") + + def _get_css_file(self): + return os.path.join( + "data", "mdict", "css", + f"{self._target.value}.css") + + def _rm_build_dir(self): + build_dir = self._get_build_dir() + shutil.rmtree(build_dir) + + @abstractmethod + def _get_revision(self, entries): + pass + + @abstractmethod + def _get_attribution(self, entries): + pass + + +class _JitenonExporter(Exporter): + def _get_revision(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + revision = modified_date.strftime("%Y年%m月%d日閲覧") + return revision + + def _get_attribution(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + attribution = entry.attribution + return attribution + + +class JitenonKokugoExporter(_JitenonExporter): + pass + + +class JitenonYojiExporter(_JitenonExporter): + pass + + +class JitenonKotowazaExporter(_JitenonExporter): + pass + + +class _MonokakidoExporter(Exporter): + def _get_revision(self, entries): + timestamp = datetime.now().strftime("%Y年%m月%d日作成") + return timestamp + + +class Smk8Exporter(_MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2020" + + +class Daijirin2Exporter(_MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2019" diff --git a/bot/mdict/exporters/factory.py b/bot/mdict/exporters/factory.py new file mode 100644 index 0000000..2c2015c --- /dev/null +++ b/bot/mdict/exporters/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.mdict.exporters.export import JitenonKokugoExporter +from bot.mdict.exporters.export import JitenonYojiExporter +from bot.mdict.exporters.export import JitenonKotowazaExporter +from bot.mdict.exporters.export import Smk8Exporter +from bot.mdict.exporters.export import Daijirin2Exporter + + +def new_mdict_exporter(target): + exporter_map = { + Targets.JITENON_KOKUGO: JitenonKokugoExporter, + Targets.JITENON_YOJI: JitenonYojiExporter, + Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter, + Targets.SMK8: Smk8Exporter, + Targets.DAIJIRIN2: Daijirin2Exporter, + } + return exporter_map[target](target) diff --git a/bot/mdict/glossary/daijirin2.py b/bot/mdict/glossary/daijirin2.py new file mode 100644 index 0000000..1a8b0d5 --- /dev/null +++ b/bot/mdict/glossary/daijirin2.py @@ -0,0 +1,77 @@ +import re +import os +from functools import cache +from pathlib import Path + +from bot.soup import delete_soup_nodes +from bot.data import load_mdict_name_conversion +from bot.name_conversion import convert_names + + +def make_glossary(entry, media_dir): + soup = entry.get_page_soup() + __add_rubies(soup) + __hyperlink_parent_expression(soup, entry) + __delete_unused_nodes(soup, media_dir) + __convert_links(soup, entry) + + name_conversion = load_mdict_name_conversion(entry.target) + convert_names(soup, name_conversion) + + glossary = soup.span.decode() + return glossary + + +def __add_rubies(soup): + for name in ["表外音訓", "表外字"]: + for ruby in soup.find_all(name): + ruby.name = "ruby" + rt = ruby.find("表外字マーク") + rt.name = "rt" + ruby.append(rt) # needs to positioned after the text + + +def __hyperlink_parent_expression(soup, entry): + if soup.find("親表記") is None: + return + parent_entry = entry.get_parent() + gid = parent_entry.get_global_identifier() + for el in soup.find_all("親表記"): + el.name = "a" + el.attrs["href"] = f"entry://{gid}" + + +def __delete_unused_nodes(soup, media_dir): + if not __graphics_directory_exists(media_dir): + delete_soup_nodes(soup, "カットG") + for el in soup.find_all("logo"): + next_sibling = el.next_sibling + if next_sibling is None: + continue + elif next_sibling.name in ["漢字見出G", "漢字音G"]: + el.decompose() + for el in soup.find_all("漢字音G"): + for child in el.find_all(string="・"): + child.replace_with("") + + +@cache +def __graphics_directory_exists(media_dir): + path = os.path.join(media_dir, "graphics") + return Path(path).is_dir() + + +def __convert_links(soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href): + ref_entry_id = entry.id_string_to_entry_id(href) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^entry:", href): + pass + elif re.match(r"^https?:[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") diff --git a/bot/mdict/glossary/jitenon.py b/bot/mdict/glossary/jitenon.py new file mode 100644 index 0000000..737ea59 --- /dev/null +++ b/bot/mdict/glossary/jitenon.py @@ -0,0 +1,141 @@ +# pylint: disable=too-few-public-methods + +import re + + +class JitenonGlossary(): + def __init__(self): + self._id_pattern = None + self._expression_header = None + + def _replace_punctuation(self, soup): + punctuation = { + "/": "/", + ",": "、", + } + for el in soup.find_all(string=True): + text = el.text + for old, new in punctuation.items(): + text = text.replace(old, new) + el.replace_with(text) + + def _add_internal_links(self, soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + m = re.search(self._id_pattern, href) + if m is not None: + ref_entry_id = int(m.group(1)) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^(?:https?:|\?)[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") + + def _decompose_table_rows(self, soup, entry): + for tr in soup.find_all("tr"): + if tr.find("th") is None: + continue + elif tr.th.text == self._expression_header: + tr.decompose() + elif tr.th.text == "読み方": + if self._do_display_yomikata_in_headword(entry): + tr.decompose() + elif tr.th.text == "意味": + definition = tr.td + definition.name = "div" + definition.attrs["class"] = "意味" + soup.body.insert(0, definition) + tr.decompose() + if soup.find("tr") is None: + soup.table.decompose() + + def _insert_headword_line(self, soup, entry): + headword_line = soup.new_tag("div") + headword_line.attrs["class"] = "見出し" + if self._do_display_yomikata_in_headword(entry): + reading = soup.new_tag("span") + reading.attrs["class"] = "読み方" + reading.string = entry.yomikata + headword_line.append(reading) + expression = soup.new_tag("span") + expression.attrs["class"] = self._expression_header + expression.string = f"【{entry.expression}】" + headword_line.append(expression) + soup.body.insert(0, headword_line) + + def _do_display_yomikata_in_headword(self, entry): + if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): + return False + elif len(entry.yomikata) > 10: + return False + else: + return True + + +class JitenonKokugoGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + self._remove_antonym_list_item(soup) + self._replace_number_icons(soup, media_dir) + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary + + def _remove_antonym_list_item(self, soup): + for el in soup.find_all("li"): + if el.text == "対義語辞典": + el.decompose() + + def _replace_number_icons(self, soup, media_dir): + for el in soup.find_all("img"): + alt = el.attrs["alt"] + text = re.search(r"[0-9]+", alt).group(0) + el.name = "span" + el.string = text + del el.attrs["src"] + del el.attrs["alt"] + + def _do_display_yomikata_in_headword(self, entry): + return len(entry.yomikata) <= 10 + + +class JitenonYojiGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "四字熟語" + self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary + + +class JitenonKotowazaGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary diff --git a/bot/mdict/glossary/smk8.py b/bot/mdict/glossary/smk8.py new file mode 100644 index 0000000..613fc1b --- /dev/null +++ b/bot/mdict/glossary/smk8.py @@ -0,0 +1,67 @@ +import re + +from bot.soup import delete_soup_nodes +from bot.data import load_mdict_name_conversion +from bot.name_conversion import convert_names + + +def make_glossary(entry, media_dir): + soup = entry.get_page_soup() + __fill_alts(soup, entry) + __delete_unused_nodes(soup) + __convert_links(soup, entry) + __convert_priority_markers(soup) + + name_conversion = load_mdict_name_conversion(entry.target) + convert_names(soup, name_conversion) + + glossary = soup.span.decode() + return glossary + + +def __fill_alts(soup, entry): + names = ["親見出仮名", "親見出表記"] + if soup.find(names) is None: + return + parent_entry = entry.get_parent() + gid = parent_entry.get_global_identifier() + for el in soup.find_all(names): + el.name = "a" + alt = el.attrs["alt"] + el.string = alt + el.attrs["href"] = f"entry://{gid}" + del el.attrs["alt"] + + +def __delete_unused_nodes(soup): + for name in ["連濁"]: + delete_soup_nodes(soup, name) + + +def __convert_links(soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + if href.startswith("$"): + el.unwrap() + elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href): + ref_entry_id = entry.id_string_to_entry_id(href) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^[0-9]+[ab]?\.aac$", href): + el.attrs["href"] = f"sound://audio/{href}" + elif re.match(r"^entry:", href): + pass + elif re.match(r"^https?:[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") + + +def __convert_priority_markers(soup): + for el in soup.find_all("img", attrs={"alt": "*"}): + el.name = "span" + el.string = "*" + for el in soup.find_all("img", attrs={"alt": "⁑"}): + el.name = "span" + el.string = "**" diff --git a/bot/mdict/terms/daijirin2.py b/bot/mdict/terms/daijirin2.py new file mode 100644 index 0000000..3b5ce68 --- /dev/null +++ b/bot/mdict/terms/daijirin2.py @@ -0,0 +1,23 @@ +from bot.mdict.terms.terminator import Terminator +from bot.mdict.glossary.daijirin2 import make_glossary + + +class Daijirin2Terminator(Terminator): + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [ + [entry.children, "子項目"], + [entry.phrases, "句項目"], + ] + + def _subentry_lists(self, entry): + return [ + entry.children, + entry.phrases, + ] diff --git a/bot/mdict/terms/factory.py b/bot/mdict/terms/factory.py new file mode 100644 index 0000000..78a05cd --- /dev/null +++ b/bot/mdict/terms/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.mdict.terms.jitenon import JitenonKokugoTerminator +from bot.mdict.terms.jitenon import JitenonYojiTerminator +from bot.mdict.terms.jitenon import JitenonKotowazaTerminator +from bot.mdict.terms.smk8 import Smk8Terminator +from bot.mdict.terms.daijirin2 import Daijirin2Terminator + + +def new_terminator(target): + terminator_map = { + Targets.JITENON_KOKUGO: JitenonKokugoTerminator, + Targets.JITENON_YOJI: JitenonYojiTerminator, + Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator, + Targets.SMK8: Smk8Terminator, + Targets.DAIJIRIN2: Daijirin2Terminator, + } + return terminator_map[target](target) diff --git a/bot/mdict/terms/jitenon.py b/bot/mdict/terms/jitenon.py new file mode 100644 index 0000000..3f9cfc1 --- /dev/null +++ b/bot/mdict/terms/jitenon.py @@ -0,0 +1,42 @@ +from bot.mdict.terms.terminator import Terminator + +from bot.mdict.glossary.jitenon import JitenonKokugoGlossary +from bot.mdict.glossary.jitenon import JitenonYojiGlossary +from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary + + +class JitenonTerminator(Terminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = None + + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = self._glossary_maker.make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [] + + def _subentry_lists(self, entry): + return [] + + +class JitenonKokugoTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKokugoGlossary() + + +class JitenonYojiTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonYojiGlossary() + + +class JitenonKotowazaTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKotowazaGlossary() diff --git a/bot/mdict/terms/smk8.py b/bot/mdict/terms/smk8.py new file mode 100644 index 0000000..22275d5 --- /dev/null +++ b/bot/mdict/terms/smk8.py @@ -0,0 +1,24 @@ +from bot.mdict.terms.terminator import Terminator +from bot.mdict.glossary.smk8 import make_glossary + + +class Smk8Terminator(Terminator): + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [ + [entry.children, "子項目"], + [entry.phrases, "句項目"], + ] + + def _subentry_lists(self, entry): + return [ + entry.children, + entry.phrases, + entry.kanjis, + ] diff --git a/bot/mdict/terms/terminator.py b/bot/mdict/terms/terminator.py new file mode 100644 index 0000000..e69d9fb --- /dev/null +++ b/bot/mdict/terms/terminator.py @@ -0,0 +1,73 @@ +from abc import abstractmethod, ABC + + +class Terminator(ABC): + def __init__(self, target): + self._target = target + self._glossary_cache = {} + self._media_dir = None + + def set_media_dir(self, media_dir): + self._media_dir = media_dir + + def make_terms(self, entry): + gid = entry.get_global_identifier() + glossary = self.__full_glossary(entry) + terms = [[gid, glossary]] + keys = set() + headwords = entry.get_headwords() + for reading, expressions in headwords.items(): + if len(expressions) == 0: + keys.add(reading) + for expression in expressions: + if expression.strip() == "": + keys.add(reading) + continue + keys.add(expression) + if reading.strip() == "": + continue + if reading != expression: + keys.add(f"{reading}【{expression}】") + else: + keys.add(reading) + link = f"@@@LINK={gid}" + for key in keys: + if key.strip() != "": + terms.append([key, link]) + for subentries in self._subentry_lists(entry): + for subentry in subentries: + for term in self.make_terms(subentry): + terms.append(term) + return terms + + def __full_glossary(self, entry): + glossary = [] + style_link = f"" + glossary.append(style_link) + glossary.append(self._glossary(entry)) + + for x in self._link_glossary_parameters(entry): + (subentries, list_title) = x + if len(subentries) == 0: + continue + items = [] + for subentry in subentries: + exp = subentry.get_first_expression() + gid = subentry.get_global_identifier() + item = f"
  • {exp}
  • " + items.append(item) + link_glossary = f"
    {list_title}
    " + glossary.append(link_glossary) + return "\n".join(glossary) + + @abstractmethod + def _glossary(self, entry): + pass + + @abstractmethod + def _link_glossary_parameters(self, entry): + pass + + @abstractmethod + def _subentry_lists(self, entry): + pass diff --git a/bot/yomichan/glossary/name_conversion.py b/bot/name_conversion.py similarity index 88% rename from bot/yomichan/glossary/name_conversion.py rename to bot/name_conversion.py index 776d65e..2c9b808 100644 --- a/bot/yomichan/glossary/name_conversion.py +++ b/bot/name_conversion.py @@ -30,7 +30,7 @@ def __apply_name_conversion_procedures(soup, procedures): "has_previous_sibling": __has_previous_sibling, "replace": __replace, "wrap": __wrap, - "add_ruby_text": __add_ruby_text, + "insert_span": __insert_span, } for procedure in procedures: function = functions[procedure["procedure_name"]] @@ -92,10 +92,9 @@ def __wrap(soup, l_wrap, r_wrap): soup.string = f"{l_wrap}{soup.text}{r_wrap}" -def __add_ruby_text(soup, mark, style): - if style.strip() != "": - markup = f"{mark}" - else: - markup = f"{mark}" - rt_soup = BeautifulSoup(markup, "xml") - soup.append(rt_soup.rt) +def __insert_span(soup, attr_name, attr_val): + span_markup = f"" + span_soup = BeautifulSoup(span_markup, "xml") + for content in reversed(soup.contents): + span_soup.span.insert(0, content.extract()) + soup.append(span_soup.span) diff --git a/bot/yomichan/exporters/export.py b/bot/yomichan/exporters/export.py index 4658030..03e1b95 100644 --- a/bot/yomichan/exporters/export.py +++ b/bot/yomichan/exporters/export.py @@ -1,15 +1,18 @@ +# pylint: disable=too-few-public-methods + import json import os import shutil from pathlib import Path from datetime import datetime +from abc import ABC, abstractmethod from platformdirs import user_documents_dir, user_cache_dir from bot.data import load_yomichan_metadata from bot.yomichan.terms.factory import new_terminator -class Exporter: +class Exporter(ABC): def __init__(self, target): self._target = target self._terminator = new_terminator(target) @@ -26,6 +29,14 @@ class Exporter: terms = self.__get_terms(entries) self.__make_dictionary(terms, index, tags) + @abstractmethod + def _get_revision(self, entries): + pass + + @abstractmethod + def _get_attribution(self, entries): + pass + def _get_build_dir(self): if self._build_dir is not None: return self._build_dir @@ -41,7 +52,7 @@ class Exporter: build_dir = self._get_build_dir() build_img_dir = os.path.join(build_dir, self._target.value) if image_dir is not None: - print("Copying image files to build directory...") + print("Copying media files to build directory...") shutil.copytree(image_dir, build_img_dir) else: os.makedirs(build_img_dir) @@ -93,7 +104,7 @@ class Exporter: def __write_archive(self, filename): archive_format = "zip" - out_dir = os.path.join(user_documents_dir(), "jitenbot") + out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan") if not Path(out_dir).is_dir(): os.makedirs(out_dir) out_file = f"{filename}.{archive_format}" @@ -110,10 +121,7 @@ class Exporter: shutil.rmtree(build_dir) -class JitenonExporter(Exporter): - def __init__(self, target): - super().__init__(target) - +class _JitenonExporter(Exporter): def _get_revision(self, entries): modified_date = None for entry in entries: @@ -130,25 +138,19 @@ class JitenonExporter(Exporter): return attribution -class JitenonKokugoExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonKokugoExporter(_JitenonExporter): + pass -class JitenonYojiExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonYojiExporter(_JitenonExporter): + pass -class JitenonKotowazaExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonKotowazaExporter(_JitenonExporter): + pass class Smk8Exporter(Exporter): - def __init__(self, target): - super().__init__(target) - def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") return f"{self._target.value};{timestamp}" @@ -158,9 +160,6 @@ class Smk8Exporter(Exporter): class Daijirin2Exporter(Exporter): - def __init__(self, target): - super().__init__(target) - def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") return f"{self._target.value};{timestamp}" diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py index 5ab9a6a..06568e3 100644 --- a/bot/yomichan/exporters/factory.py +++ b/bot/yomichan/exporters/factory.py @@ -7,7 +7,7 @@ from bot.yomichan.exporters.export import Smk8Exporter from bot.yomichan.exporters.export import Daijirin2Exporter -def new_exporter(target): +def new_yomi_exporter(target): exporter_map = { Targets.JITENON_KOKUGO: JitenonKokugoExporter, Targets.JITENON_YOJI: JitenonYojiExporter, diff --git a/bot/yomichan/glossary/daijirin2.py b/bot/yomichan/glossary/daijirin2.py index f2b6f2c..c42841c 100644 --- a/bot/yomichan/glossary/daijirin2.py +++ b/bot/yomichan/glossary/daijirin2.py @@ -6,9 +6,9 @@ from pathlib import Path import bot.icons as Icons from bot.soup import delete_soup_nodes -from bot.data import load_daijirin2_yomichan_name_conversion +from bot.data import load_yomichan_name_conversion from bot.yomichan.glossary.gloss import make_gloss -from bot.yomichan.glossary.name_conversion import convert_names +from bot.name_conversion import convert_names def make_glossary(entry, image_dir): @@ -26,7 +26,7 @@ def make_glossary(entry, image_dir): __convert_daigoginum(soup, image_dir) __convert_jundaigoginum(soup, image_dir) - name_conversion = load_daijirin2_yomichan_name_conversion() + name_conversion = load_yomichan_name_conversion(entry.target) convert_names(soup, name_conversion) gloss = make_gloss(soup.span) diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py index 6e3a192..ca76f19 100644 --- a/bot/yomichan/glossary/jitenon.py +++ b/bot/yomichan/glossary/jitenon.py @@ -58,9 +58,9 @@ class JitenonGlossary(): if self._do_display_yomikata_in_headword(entry): tr.decompose() elif tr.th.text == "意味": - imi = tr.td - imi.name = "div" - soup.body.insert(0, imi) + definition = tr.td + definition.name = "div" + soup.body.insert(0, definition) tr.decompose() if soup.find("tr") is None: soup.table.decompose() diff --git a/bot/yomichan/glossary/smk8.py b/bot/yomichan/glossary/smk8.py index 870c3fc..8754a02 100644 --- a/bot/yomichan/glossary/smk8.py +++ b/bot/yomichan/glossary/smk8.py @@ -4,9 +4,9 @@ from bs4 import BeautifulSoup import bot.icons as Icons from bot.soup import delete_soup_nodes -from bot.data import load_smk8_yomichan_name_conversion +from bot.data import load_yomichan_name_conversion from bot.yomichan.glossary.gloss import make_gloss -from bot.yomichan.glossary.name_conversion import convert_names +from bot.name_conversion import convert_names def make_glossary(entry, image_dir): @@ -20,7 +20,7 @@ def make_glossary(entry, image_dir): __convert_gaiji(soup, image_dir) __convert_rectangles(soup, image_dir) - name_conversion = load_smk8_yomichan_name_conversion() + name_conversion = load_yomichan_name_conversion(entry.target) convert_names(soup, name_conversion) gloss = make_gloss(soup.span) diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py index f74abaa..66bbed7 100644 --- a/bot/yomichan/terms/jitenon.py +++ b/bot/yomichan/terms/jitenon.py @@ -9,6 +9,7 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary class JitenonTerminator(Terminator): def __init__(self, target): super().__init__(target) + self._glossary_maker = None def _definition_tags(self, entry): return None @@ -51,7 +52,7 @@ class JitenonYojiTerminator(JitenonTerminator): return "" def _term_tags(self, entry): - tags = entry.kankenkyuu.split("/") + tags = entry.kanken_level.split("/") return " ".join(tags) diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py index d41a50a..dd0c02d 100644 --- a/bot/yomichan/terms/terminator.py +++ b/bot/yomichan/terms/terminator.py @@ -1,7 +1,8 @@ +from abc import abstractmethod, ABC from bot.data import load_yomichan_inflection_categories -class Terminator: +class Terminator(ABC): def __init__(self, target): self._target = target self._glossary_cache = {} @@ -62,3 +63,31 @@ class Terminator: } glossary.append(gloss) return glossary + + @abstractmethod + def _definition_tags(self, entry): + pass + + @abstractmethod + def _inflection_rules(self, entry, expression): + pass + + @abstractmethod + def _glossary(self, entry): + pass + + @abstractmethod + def _sequence(self, entry): + pass + + @abstractmethod + def _term_tags(self, entry): + pass + + @abstractmethod + def _link_glossary_parameters(self, entry): + pass + + @abstractmethod + def _subentry_lists(self, entry): + pass diff --git a/data/daijirin2/mdict_name_conversion.json b/data/daijirin2/mdict_name_conversion.json new file mode 100644 index 0000000..d783d28 --- /dev/null +++ b/data/daijirin2/mdict_name_conversion.json @@ -0,0 +1,12 @@ +{ + "a": {}, + "br": {}, + "img": {}, + "div": {}, + "span": {}, + "ruby": {}, + "rt": {}, + "p": {}, + "漢字音G": {"name": "ul"}, + "漢字音": {"name": "li"} +} diff --git a/data/mdict/css/daijirin2.css b/data/mdict/css/daijirin2.css new file mode 100644 index 0000000..703cb35 --- /dev/null +++ b/data/mdict/css/daijirin2.css @@ -0,0 +1,414 @@ + +body { + margin: 1em 44px 1em 1em; + line-height: 1.5em; + font-family: serif; + font-size: 1.2em; + color: black; +} + +body.ABC { + margin: 0.5em 0.5em 2em 0.5em; +} + +a { + text-decoration: none; +} + +img.gaiji { + height: 1em; +} + +img.cut { + max-height: 100px; + max-width: 600px; +} + +p { + margin: 0.5em 0 +} + +span[data-name="i"] { + font-style: italic; +} + +span[data-name="h1"] { + font-family: sans-serif; + font-size: 1em; + font-weight: bold; +} + +span[data-name="image"] { + display: block; +} + +span[data-name="ref"] a { + text-decoration: none; +} + +span[data-name="sl"] { + text-decoration: accent; +} + +span[data-name="sm"] { + font-size: 0.7em; +} + +span[data-name="small"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="sub"] { + font-size: 0.7em; + vertical-align: -0.35em; +} + +span[data-name="ty2"] span[data-name="sub"] { + vertical-align: 0em; +} + +span[data-name="ty2"] span[data-name="sup"] { + vertical-align: 0.5em; +} + +span[data-name="文語形"] { + display: block; +} + +span[data-name="用例"] { + display: block; +} + +span[data-name="補説G"] { + display: block; +} + +span[data-name="語義Gnum"] + span[data-name="補説G"] { + display: inline; +} + +span[data-name="アクセントG"] + span[data-name="補説G"] { + display: inline; +} + +span[data-name="補説G"] + span[data-name="語釈"] { + display: block; +} + +span[data-name="アクセントG"] { + font-size: 0.7em; + vertical-align: super; + margin-left: 0.25em; + margin-right: 0.25em; +} + +span[data-name="カット"] { + display: block; +} + +span[data-name="カットG"] { + display: block; + margin-top: 0.5em; + margin-bottom: 0.5em; + margin-left: 1em; +} + +span[data-name="キャプション"] { + display: block; +} + +span[data-name="ルビG"] { + font-family: sans-serif; + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; +} + +.warichu span[data-name="ルビG"] { + font-family: serif; + font-size: 0.5em; + font-weight: normal; + vertical-align: 0em; +} + +span[data-name="中語義"] { + display: block; +} + +span[data-name="付記"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="副義"] { + display: block; + margin-left: 1em; +} + +span[data-name="単位名"] { + font-size: 0.5em; +} + +span[data-name="原籍"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="句仮名"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="句項目"] { + margin-top: 0.5em; + margin-left: 1em; + display: block; +} + +span[data-name="和字"] { + font-family: sans-serif; +} + +span[data-name="品詞行"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="品詞用法"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="大語義"] { + display: block; +} + +span[data-name="大語義num"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + background-color: black; +} + +span[data-name="子項目"] { + display: block; + margin-top: 0.5em; + margin-left: 1em; +} + +span[data-name="慣用G"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="欧字"] { + font-family: sans-serif; +} + +span[data-name="歴史仮名"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="派生G"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="準大語義"] { + display: block; +} + +span[data-name="準大語義num"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border: solid 1px black; +} + +span[data-name="漢字音logo"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border: solid 0.5px black; + border-radius: 1em; +} + +span[data-name="漢字音G"] { + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; +} + +span[data-name="生没年"] { + margin-left: 0.25em; + margin-right: 0.25em; + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="生没年"]:first-child { + margin-left: 0; +} + +span[data-name="用法"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="異字同訓"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="異字同訓仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="異字同訓漢字"] { + font-family: serif; + font-weight: normal; +} + +span[data-name="異字同訓表記"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="異字同訓解説"] { + display: block; +} + +span[data-name="異字同訓語義G"] { + display: block; +} + +span[data-name="細義"] { + display: block; +} + +span[data-name="表外字マーク"] { + font-size: 0.5em; + vertical-align: 0.5em; +} + +span[data-name="見出仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="見出相当部"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="見出部"] { + display: block; +} + +span[data-name="解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="語義G"] { + display: block; +} + +span[data-name="語義区切"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="返り点"] { + font-size: 0.5em; + font-weight: normal; + vertical-align: 1em; +} + +span[data-name="返り点"].熟語記号 { + vertical-align: 0em; +} + +span[data-name="項目"] { + display: block; +} + +span[data-name="logo"] { + margin: 0.025em 0.25em; + padding: 0.1em; + font-size: 0.8em; + border: solid 1px black; + border-radius: 0.2em; +} + +.gothic { + font-family: sans-serif; + font-weight: bold; +} + +.warichu { + font-size: 1em; +} + +.refnum { + font-size: 0.7em; + vertical-align: 0.35em; +} + +#index { + display: none; +} + +span[data-name="歴史仮名"]:before, +span[data-name="ルビG"]:before, +span[data-name="品詞行"]:before, +span[data-name="原籍"]:before, +span[data-name="品詞用法"]:before, +span[data-name="付記"]:before { + content: "("; +} + +span[data-name="歴史仮名"]:after, +span[data-name="ルビG"]:after, +span[data-name="品詞行"]:after, +span[data-name="原籍"]:after, +span[data-name="品詞用法"]:after, +span[data-name="付記"]:after { + content: ")"; +} + +div[data-child-links] { + padding-top: 1em; +} + +div[data-child-links] ul { + margin: 0; + padding-left: 2em; +} + +div[data-child-links] span { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + border-width: 0.05em; + border-style: none; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.2em; +} + +div[data-child-links="子項目"] span { + background-color: rgb(153, 42, 103); +} + +div[data-child-links="句項目"] span { + background-color: rgb(176, 127, 57); +} diff --git a/data/mdict/css/jitenon-kokugo.css b/data/mdict/css/jitenon-kokugo.css new file mode 100644 index 0000000..687ae14 --- /dev/null +++ b/data/mdict/css/jitenon-kokugo.css @@ -0,0 +1,56 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +td ul { + margin: -0.1em 0em -0.1em -1em; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + +.num_icon { + font-family: sans-serif; + padding-left: 0.25em; + margin-right: 0.5em; + font-size: 0.8em; + word-break: keep-all; + color: white; + background-color: gray; + border-style: none; + -webkit-border-radius: 0.1em; +} + diff --git a/data/mdict/css/jitenon-kotowaza.css b/data/mdict/css/jitenon-kotowaza.css new file mode 100644 index 0000000..2dfb1be --- /dev/null +++ b/data/mdict/css/jitenon-kotowaza.css @@ -0,0 +1,40 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + diff --git a/data/mdict/css/jitenon-yoji.css b/data/mdict/css/jitenon-yoji.css new file mode 100644 index 0000000..2dfb1be --- /dev/null +++ b/data/mdict/css/jitenon-yoji.css @@ -0,0 +1,40 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + diff --git a/data/mdict/css/smk8.css b/data/mdict/css/smk8.css new file mode 100644 index 0000000..e88da1c --- /dev/null +++ b/data/mdict/css/smk8.css @@ -0,0 +1,449 @@ + +body { + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-family: serif; + font-size: 1.2em; + color: black; +} + +span[data-name="項目"] { + display: block; +} + +span[data-name="見出部"] { + display: block; +} + +span[data-name="見出部"].pri { + margin-left: -0.4em; +} + +span[data-name="見出仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +rt[data-name="表音表記"] { + font-size: 0.65em; +} + +rt[data-name="表外音訓マーク"] { + font-size: 0.65em; +} + +rt[data-name="表外字マーク"] { + font-size: 0.65em; +} + +span[data-name="解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="大語義"] { + display: block; +} + +span[data-name="語義"] { + display: block; +} + +span[data-name="副義"] { + display: block; +} + +span[data-name="用例G"] { + display: block; +} + +span[data-name="注記"] span[data-name="用例G"] { + display: inline; +} + +span[data-name="用例"] { + display: block; +} + +span[data-name="注記"] span[data-name="用例"] { + display: inline; +} + +span[data-name="見出語省略"] { + margin-left: 0.125em; + margin-right: 0.125em; +} + +span[data-name="教育漢字"] { + color: green; +} + +span[data-name="ルビ"] { + font-size: 0.7em; + vertical-align: 0.5em; +} + +span[data-name="ルビ区切"] { + font-size: 0.7em; + vertical-align: 0.65em; +} + +span[data-name="名詞形G"] { + display: block; +} + +span[data-name="可能形G"] { + display: block; +} + +span[data-name="参照G"] { + display: block; +} + +span[data-name="参照"] { + color: blue; +} + +span[data-name="子項目"], +span[data-name="句項目"] { + display: block; + margin-bottom: 0.5em; +} + +span[data-name="子項目F"], +span[data-name="句項目F"] { + display: block; + margin-bottom: 0.5em; + margin-top: 0.5em; +} + +span[data-name="子見出部"] { + display: block; +} + +span[data-name="子解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="句見出部"] { + display: block; +} + +span[data-name="句解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="運用解説"] { + display: block; +} + +span[data-name="表記解説"] { + display: block; +} + +span[data-name="文法解説"] { + display: block; +} + +span[data-name="かぞえ方解説"] { + display: block; +} + +span[data-name="派生"] { + display: block; + margin-left: 1.25em; +} + +span[data-name="派生SubGF"] { + display: block; + text-indent: -1.25em; +} + +span[data-name="派生SubG"] { + display: block; +} + +span[data-name="派生SubGF"] span[data-name="用例G"] { + text-indent: 0; +} + +span[data-name="派生見出"] { + font-weight: bold; +} + +span[data-name="派生見出"].normal { + font-weight: normal +} + +span[data-name="造語成分項目"] { + display: block; + margin-top: 1em; +} + +span[data-name="造語成分見出"] { + font-size:1.4em; +} + +span[data-name="EM"] { + font-weight: bold; +} + +span[data-name="アクセント"] { + font-size: 0.7em; + vertical-align: super; +} + +span[data-name="アクセント組M"] { + vertical-align: 0.1em; +} + + +span[data-name="反意語M"], +span[data-name="同意語M"] { + vertical-align: 0.15em; +} + +span[data-name="B"] { + font-weight: bold; +} + +span[data-name="IT"] { + font-family: "Times New Roman"; + font-style: italic; +} + +span[data-name="EXCLAMATION"] { + font-family: "Times New Roman"; + font-style: italic; + font-size: 1.2em; +} + +span[data-name="歴史仮名"] { + font-family: serif; + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; + -webkit-user-select: nocopy; +} + +span[data-name="出現形"] { + font-weight: bold; +} + +span[data-name="品詞用法"] { + font-size: 0.7em; +} + +span[data-name="品詞用法"] span[data-name="品詞G"] { + font-size: 1.2em; +} + +span[data-name="基本構文型"] { + font-size: 0.8em; +} + +span[data-name="基本構文em"] { + font-weight: bold; +} + +span[data-name="ウ濁音参照"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="rect"] { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border-width: 0.05em; + border-style: solid; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.1em; +} + +span[data-name="rect"].fill { + color: white; + border-style: none; + background-color: gray; +} + +span[data-name="rect"].red { + color: red; + border-color: red; +} + +span[data-name="rect"].redfill { + color: white; + border-style: none; + background-color: red; +} + +span[data-name="red"] { + color: red; +} + +span[data-name="大語義番号"], +span[data-name="語義番号"], +span[data-name="副義番号"] { + margin-right: 0.25em; + font-family: sans-serif; +} + +span[data-name="ref"] span[data-name="大語義番号"], +span[data-name="ref"] span[data-name="語義番号"], +span[data-name="ref"] span[data-name="副義番号"] { + font-size: 0.8em; + margin-right: 0; +} + +span[data-name="表外字マーク"] { + vertical-align: 0.5em; +} + +span[data-name="表外音訓マーク"] { + font-size: 0.5em; + vertical-align: 0.5em; +} + +span[data-name="言換M"] { + font-size: 0.5em; +} + +span[data-name="字音語参照項目"] { + display: block; +} + +span[data-name="本文項目M"] { + font-size: 0.7em; +} + +span[data-name="運用解説M"], +span[data-name="表記解説M"], +span[data-name="文法解説M"], +span[data-name="かぞえ方解説M"], +span[data-name="派生M"] { + margin-right: 0.25em; + font-family: sans-serif; +} + +span[data-name="派生ロゴ"] { + margin-left: 0.1em; + margin-right: 0.1em; +} + +span[data-name="文字"] { + margin: 0 0.2em; +} + +span[data-name="二分"] { + font-size: 0.5em; +} + +span[data-name="四分"] { + font-size: 0.25em; +} + +span[data-name="ref"] { + margin-left: 0.1em; + margin-right: 0.1em; +} + +span[data-name="ref-small"] { + font-size: 0.7em; +} + +span[data-name="sup"] { + font-size: 0.6em; +} + +span[data-name="外字"] img { + height: 1em; +} + +img.audio { + height: 1em; + margin: 0 0.25em; +} + +img.外字 { + height: 1em; +} + +img.外字欧 { + height: 1em; +} + +span[data-name="レ点M"] { + font-size: 0.6em; + vertical-align: -0.7em; +} + +a { + text-decoration: none; +} + +span[data-name="audio"] a { + padding-bottom: 0; + border-bottom: none; +} + +span[data-name="アクセント"] a, +span[data-name="古語M"] a, +span[data-name="雅語M"] a, +span[data-name="派生M"] a, +span[data-name="原籍M"] a, +span[data-name="品詞M"] a { + color: black; + border-bottom-style: none; +} + + +span[data-name="歴史仮名"]:before, +span[data-name="ルビ"]:before { + content: "("; +} + +span[data-name="歴史仮名"]:after, +span[data-name="ルビ"]:after { + content: ")"; +} + +div[data-child-links] { + padding-top: 1em; +} + +div[data-child-links] ul { + margin: 0; + padding-left: 2em; +} + +div[data-child-links] span { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + border-width: 0.05em; + border-style: none; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.2em; +} + +div[data-child-links="子項目"] span { + background-color: rgb(153, 42, 103); +} + +div[data-child-links="句項目"] span { + background-color: rgb(176, 127, 57); +} + +span.pri > span.外字 { + font-size: 0.65em; + vertical-align: super; +} + + + diff --git a/data/mdict/description/daijirin2.mdx.description.html b/data/mdict/description/daijirin2.mdx.description.html new file mode 100644 index 0000000..c1eb401 --- /dev/null +++ b/data/mdict/description/daijirin2.mdx.description.html @@ -0,0 +1,7 @@ +大辞林 第四版 +

    +https://www.monokakido.jp/ja/dictionaries/daijirin2/index.html +

    +{{revision}} +

    +{{attribution}} diff --git a/data/mdict/description/jitenon-kokugo.mdx.description.html b/data/mdict/description/jitenon-kokugo.mdx.description.html new file mode 100644 index 0000000..a1c7489 --- /dev/null +++ b/data/mdict/description/jitenon-kokugo.mdx.description.html @@ -0,0 +1,7 @@ +国語辞典オンライン +

    +https://kokugo.jitenon.jp/ +

    +{{revision}} +

    +{{attribution}} diff --git a/data/mdict/description/jitenon-kotowaza.mdx.description.html b/data/mdict/description/jitenon-kotowaza.mdx.description.html new file mode 100644 index 0000000..b6d3c99 --- /dev/null +++ b/data/mdict/description/jitenon-kotowaza.mdx.description.html @@ -0,0 +1,7 @@ +故事・ことわざ・慣用句オンライン +

    +https://kotowaza.jitenon.jp/ +

    +{{revision}} +

    +{{attribution}} diff --git a/data/mdict/description/jitenon-yoji.mdx.description.html b/data/mdict/description/jitenon-yoji.mdx.description.html new file mode 100644 index 0000000..d7e3729 --- /dev/null +++ b/data/mdict/description/jitenon-yoji.mdx.description.html @@ -0,0 +1,7 @@ +四字熟語辞典オンライン +

    +https://yoji.jitenon.jp/ +

    +{{revision}} +

    +{{attribution}} diff --git a/data/mdict/description/smk8.mdx.description.html b/data/mdict/description/smk8.mdx.description.html new file mode 100644 index 0000000..7486250 --- /dev/null +++ b/data/mdict/description/smk8.mdx.description.html @@ -0,0 +1,7 @@ +新明解国語辞典 第八版 +

    +https://www.monokakido.jp/ja/dictionaries/smk8/index.html +

    +{{revision}} +

    +{{attribution}} diff --git a/data/mdict/icon/jitenon-kokugo.png b/data/mdict/icon/jitenon-kokugo.png new file mode 100644 index 0000000000000000000000000000000000000000..1ef1eb397db315fbb6ccab87da122fc515a2c2dd GIT binary patch literal 2374 zcmeHIc{CIX7aztFW64${CdOc_<$1zbo@EBZ6C#Riy=-GCp&4VTtQizTb`_c7X)I-D zjA`^Wro|hn$i9scBB379csjj*zCXVIzdyR?-0hru&OP^cH-d|kow(=`Q2+oSZjVN} z?rY9>J^z2v8W`A5Oq%jH{CyfR7(2ASfh!KtvQI zCN3c>k*IVCkMJ%f;$ zm3=EGH}7^qVNr2OY5Bd1%KKF$a&^t04=D8wjSn9+HMg|3wRd!OQM+k9bVl!!r_4Uq z^A|7s|6&ik8hric?a=VMkpk87XSH#R?i`Fm^o>(1^s zp2*+^4ge6$v`1N;jvZYY-|@?LR}@}r4lV;Rm8GM>Gs&J|qGYsmx=oOgMwSLuL&e&t zJ5a^*^k9Jw%t5=UtTcp#RntA`>L{);oc{0*y&8HLhmb5XpDTZYn_}n0_m~EM9B$h% zVw(3DekJ5f1DSFYY121xakY*5>+NGj@8^xU9kmu8OJIq6{Vg=tFH_WS zinQVZJs}! zsF1hE1fNWoWiC@SP5sou?Dk`wL2@%pd3dcCFVdE8WXcI|TRITHvwS<(WvYc9X?7y; zsZJ1Of#}ho96r?*uYGH&K7&t{>>P?5{g0nmvF(XA)nvOkqo-2S=t~#cxUbx!0>+}{ zPj!HEkKhx$UxV{y@wv*<#0-?Gik$;iO!-Wl_LTu5=&MGe1)t$yTs6JsUPDs0!Ky;} zf-WF?Tt5N%NLr`KxS}05>#qW7c)Z%vYK>BdhV9i4gF+ikDC;x9kFA_W8i&H{4uLr@mYO3MKuO9<;L9dgIuCG!CYKf1%UiJM&c@kp z^EA`JfqtKDzP&vTc9j*%W8H{VD3jTO)qAJWc8`i+<_Bq)JW0oA4JJ1lKJ%oTrG zDme#*{_dXn#aBXjF_Q9m>sx_KF4ypLq63{Rd&{6xJ;7+G*0Y!XEC+s-JyNoag1s>Q zP-nV)FuS|3kH=83ku+NwZ`%+>UA(hK=a{5Kl4v2f!#yyRx)2>tgTCt<4c~iHbx+$iUr@E&{Fn!i@G0^&YZ~W|nbwpj&8%x>DEi<8AZtBA zX&Eh8L8T~nq&G-}s%>5%DTxuKp5KVjOQz0hidanMiHT$U9V_JM z9$dcA$43Qq-gMs(YB{GnbjH32;Fk*NC_K???ZpdpHx~;JnwIl;d(oe@U8m)zr9&EI zXiUM=9YjJrbeyPo6;wez;LSFvl9%aQ-%;5()yF3dx4BD0WKTa0}N@ zsfSwOjB{(-2ojSDn0_$i!(bm?*5>bwF^a}bSGWPLBu32?A=yGJ{pv2nRPW1EOr_%< zF_ZjvAHY^1FbKRHX<98gAl%scSM|-^)a)_+V zXF(A5rS)Rv`7)tftRev@251*hslf@og>JUXel+qjI%Y!6@?HDhW#F(Iy6I-Tt8587 z|CGf4oI%z*D{i%jGdC4R{t@KlrRpp8@AgU6X2v{ykv%RC34jp6c;gdVESfr=*i%2Z zTTrX4AVX+j0i*lIBxDwS$*?AszsF5@*t0BVj5P;!_nJ9858InN$=psw6E+Qs&Z>mL z1@QNowi&6cyF*=L{Zj2R*LNEO6Ag)4>2cS5>yBxcM!b3R*9Yfa%!9^SucZMQF`<+n ecQJ+F#S@nP_$2}I1kL;Y3GHp1Pz~1p$^QVgrz&^= literal 0 HcmV?d00001 diff --git a/data/mdict/icon/jitenon-kotowaza.png b/data/mdict/icon/jitenon-kotowaza.png new file mode 100644 index 0000000000000000000000000000000000000000..15ccb9264b6dd65af3dcb51e847fcb7d8a1a7cd7 GIT binary patch literal 5473 zcmeI0*Ebvhx5Y;tz0EMlU<47p_ufbEy)$}ZLWqdogTcs1bfPC(q7y`Cln_J@61_(6 zB{$!Fxqrm{&RYANb@toY>pcD94fHg~Nf=20006lrT-E5`T>FoR@c#9V_?I030PeMc zj)~g8^}qGs1pXf;;2*4k`FHl6pOKz100$QjpMVfZL`*_TMovLVMNI<&)6zlc85kcx znV4B%tPk1PIUaF7=Hlkz<>MC+6cQE@6%&_`l#-T_m6KOcR8m$^Ra4i1Yieoh=<4Yk z8X23InweWzT3OrJ+SxleIyoa;T;1F~JiVSg^+rDP@qO<1BH(4!38%)~Ucj+0KS=l+cdHL@P3X6(MO3OZcEU&1ns{Zu3rnauWp|PpC zdm0oHiXc@bV@C1eDqR^zo$ zR_*QPq(d=!i8j8c?~8nRZh`wkt%$g=AD@=CD4tOp<q!GBtcry{z)(t z`YcGMPNvS~&+Ci(z}BFH+dnVdmID_I(hN~f;3E3I$Lc}}kK^fC7Xzl6Bt+o-N}^1T z+*9dtCEKVVw{vxYT=6#Z?CZuygx#};OuCamZf)R4zAKL=y*~+7qJi!bpQLHe2HZp1 z-9?*r2XP0fOoyf}pN5>_v>1v+n(w>B@xHNp#zMr`7!*Q2iG`Pt%&|US zz_?ZVr3ghcy%Sa`Jsa=&_QlqX%55JJ9eg(f$F|Qq7ot$BmUx<;k0EQ)z%4QUZ9i0r z4T{fnuLj|VCiay-rAZ`>i4%|<>~X?xf~OrX6Ik`<6Vz#=CWgk&97In>sP8-EBO z#U!m#Da1jvooD$cT3c5&A*E2M8R4HtIj>bj`SC6h zmm1|mq#L(OC}6dna^zvmK>y%l`lldnK?TvkkDR?4iA)HGdTxpY8=vVww^l7~9`{%F z4>GIxDdMMZqsOdpmf8BW=q91h5pQtO1Dgc?D-ACO~JKtH9YE3Cf zC1tng@2p1frnaD4gRV&Pdc%uS*|O8+>i4f`h)F^#QiSYo<}X(;1T$YLXs(400>T3d zGDE!21pSv#!ca%utfo;024N#lkwD5HsdhYj)ZJ%e0Xi+wrJ7R&w6)o5i4o3+kkY~v z@wd@S6l*UFrhtQ70zOs1a@{t5tF8r`hAR<*)%I3~xuoFneTYJ^G{1d8!nG^qEu0q{e znbS zZ*kPO{a8O*(vV`eC0JJDwcwXF?#at)iKT|2E+|FLPn9CE=ZynG6j~CvD6@11g*#E` zjKi!*bA>lD`~{-}OY^#XG5~=}9!>hdxv0!izYJNUDeePFP;|yu{Lyo50P!|QLlU=KY;rPOu1mi(pN6! zMbRjKrFAnF0~vl3NZ#Pq2z=$xb>QnNp_WTzW>oI@RTtRuh%jlzKTUM1B~`M-wwQJ5 zL?z?$wJysg2;|~Pg!Ua~*n_A{O=x zO5IwV_Eml*{IiXvpT_wKmW(AY08LrjT0Q*YfD=Lg)O+U|pK+Lgp?CC9lNA(s_qH8+ zjv+9@c|v9KIa;yc&fNg*5KV@r>rg>8CDsBR_cD3O$@S4nLs(>JgLhPN>^ zij6uOwY^QmRf!GCTtN{KNZ&l1SV}hAZE=f@toAF`f7H@?5cam>(meTjbYK

    *6*; zm51;kc7Y2I2rIgxx7zZP7Tz`nRr=>l5eL24r_BErcY&0fDBoX zahNPOh=oPM-ahv=;3gqt(E1%s1yA9*jh?e!Fd}5TL=Vl~hOOof5}GE(l5Y>*NVp<} z>gl2~(57Zbv_(-gURNBl%-IFcj8;< zf7jrU^VAKFe%k5mN^6y=|LgNgUe%Kd(E5=%U!y1#&T{zKzmOq8NzoNc8S#M6L(NlXJ#ceKmB_8F$JoYc)tcj1fd0fYeBNra1 zi>+!uM0Srt!?pzo-;^pmS^0)$Dzx29?bLFOIlbgc4khP%-o%u%K)Vh((*wS$%k5Px zXK8R4O>Sz>a2W^$ekITcHKUR@sX&V9R;f|NlO2!m z%jY_aS_K*8FsZc5&)I;QkP!i4J31W7>K^C4zs$A=X2y|d(GZJV-BY{(1f@r+2vBM6 z=6e_EMNaba#8mV{l8-w&U0RxD&Vp74ir*V%VrZU{fd^BAx?L<1b1OngBlK3UK%*t5NM(&WG1hRX>!HW3eP*FW^-$pp# zCv;{pp_B!PG6|yY|DOJ-39*6eb29@6TGC!+XE^-@E~kA_ej=(Wjs4PzTv_ax`b(RV z8!^sr9g3y#4c#23f!Vk9OE`efknBBs@8;=QNAZXMM7S&shT=3*onskltbN z71@%kJhP7Tf_fdQ*yEtG&D4@-14Q`FNCIj6ptrHle_KdA8Ut2CywdR1-(2s2Xx8e3 z0irAK6pf#poS);jit^UCSr5pa5_%A3sf*WjlpfnyT-@J+oE@KzfKmPyp=r5!Fs~?z zx}A0I5uG^M*JaAkag8h2@)kQ-ga`Gv40}g0g4($KWsD@7Z+H@zCTJVam+0|>@NEW4 zJ%Ss$(&-J$62Oc^TsAuHmg1RT`}BH#bOfCV*Y5SPtM!fI#>Y0(DY!~C9`vr9_qHQ; zUvyMEBSKg|lrR8P8kdn8BTIJqQM+(MzSUCV=VHU%xKpjJ6c4$#KL-gh{L$nd!K6NC zBqmI%mc@5?SUuP-YL;*fRso>!6!-oy+C3x%T4LfX|I@!LBe*%hT?|u_a?7eV?2}m< zZ!s?%Xy`vBp9#@o!zRLvh0&Pz0X%BC=C&xtRgTVD@0>{b#FpuJA_Uoug$HKM6IZU( zj($p4h#fYkT_#&2-*`@MZPM|U!8PlP6u zdkFCRLrl49(XN%SBXw%j644{Yb5#^V(*tHrOd*KJPUKiORADjb?|?lEZ=^;_!rZn( znFVfrP)cy9RczB#GZqXXXZI55o>peqWd;A0-+vt6L;*6&0 zb>xB3qtYzuBn;U1=C_HfHWhm zXKg?IKGo78b>Tdx&thIvYn#h1JSCCJVMy#+wCeWZJNSB-5*xqk&yTYv^hc2{g%j_7Oxq{+d+x8`{s0!zKg$fKnCOH+x$_uN7v5LuYm;!Ti_co z-)Ntr{8rihZET~&1GgKq3k&()UPTY-W74=bNHKzsgR_9gyC&(=*-Q)0BXEg6c4S{p zjBU2?Y}*^rV7nSfWSVVl^d_DPd#c;6{S5)EN?)^8#i(wFB>{yvOeXB;GtimwVIw=m z`%iMJ)}#HDCMdcpV+it)}9XyuK5J(z7yuBhLzL>BA}}SZ!07$LJ^rZun+wy7D_e zYQePRl+zP?i;8obiM?Ej>3N(edHsTIR>}XCPl78naahPVVN)KpVLEjefz3=|MP oCOPyMFZn+_@xP=Z_~&sL(Vh_h_6@A`KafmQO;5F2*&*_O0A>u{Q2+n{ literal 0 HcmV?d00001 diff --git a/data/mdict/icon/jitenon-yoji.png b/data/mdict/icon/jitenon-yoji.png new file mode 100644 index 0000000000000000000000000000000000000000..0603db0b0e1751713d7d997e8405fb56a4b1fd0c GIT binary patch literal 2628 zcmai0c~sKd7Dj(YW*}ubfJ54|yecZ0IHVbcx4@yn3bULiMA0ZSHOwHnaT_#pAm{1@ znuVICq~*Y+LPFEDG!e%f(i}5q%O2hP-db%q1CDcqAearSobQ+wOH4UQXL^?}V}Y$?|;b zHSZ7KQ~AZlfA0zQ#C0O7VN%-tI(|0!-~ z>>h8lZFa|fAM@&i-qr1ZxtfI=h=2hy?DK&@+U*ono>wQ$o~96Nx_?dR8V!&~dMW6$ z2NBeKUL~+Tcq6KsOdmf%GZRDzLS@%1tvAG0Gvy&Z$aj@xt!^8WK{r9E5jGNW4no-B z!#@+uy%JCjieIbcukG$!+sj0;!@N94v3pe&qv$T=Ltz3)pOY@#p$eHC=jB=WA+>CwwzWlVUUNBfFRKr6(WK>6tblalZ0v7{CTRfJE|bu9en_TW7SgEsJ&DzQmY^ z3wx!i-@qMYZL@TeV9xCub!|%B0%tepx{D**Rg?}~j7tC9)P!`;TN|#=6?v^EC}(VolRxtp)=M=Z(^y*1&ei#td<{@ zCd3J{_4iBj(dy8g!BQsAw>@=3^2v|;q_B9+MI_}xPdbF3JXmqQ9(l%~dS9j2^iu|Y zqAK#(JPT}TU9Y?)${`&&mV#~DSKT1PUJxU);{=U7!lYKKp1WOuzAjdR>6E)o08gG? zS(}f#R0}Z=9FG5mhhJYAi<0Q>0RC(n*bO|Km9&N zuk0n*eQDjGqYGI-SZ+`bJ&oQNqrY(KFADKKM3~&&(1O(y*@LxkL)%$Yvl9BT1jVIi zLui(z+`z3E2?l|Kiyz#ulEbAP+h0W2g~UNQVK&NI!E(J!*nLu8X&ZX+&Nge-!|o2q zhi=93@}Ig|dm<~U|I~M3e~`2MrWW`dnZVO{z5-e7bxsfI-AP+}5uVtEfX_DUP(?`-qI9WBR@7myvr-+k6kxGV`dSWM1?J*vsp}@q?#Fz zuni+UU$pWjI!Uo6Uoqz8yyI>~Gg3FG}4p zry{Fioy~nd^Vm_quD6UZR~6}F!;&Cb2k2aO#Y1PC5~|n)B)YY#)4oSxEiyuL)9C78 z=5CIqz1R1|;G)l8cgHtlRBWR!Mshu)-l);`^o_xwVo(qfr=j}VC zOk~x%b2T6DZZuuD@Y~f_cQ=2SD?%iQFzhZghYD-=ywyewavrl=U-?>oDLq}RkI&ek zW+%cO7-xe-L69hzMkJgkj}PDgX$@Y$H7**nrBE4YglZ$;=-gYvuwzoxJ~hvNlLj8p z1WwUPM4cnm7U2sa`IW^?+M8SLB$Kuh??#CoV`tmD)QjcKdX@=^N8>demAZoQJK?=0 zZmNOMpuv0)v7o%z7xWxS+HStpOKANTBbvxhYkvZ*G+HZrr}3rsS<0-@X)Uo%746(` zG2bXKhpj_CI&RaFMbaOb16r5Hzc>ovFCA<(mK+~}{s9S!-_B}!o%2Qjiz@dnU0@~B zW3Llb%_1| z#kj_|uqX9=U$`J&=XBuu>3uI(D0nqz08W=$%~uWNZXQ_DG2Pn3j0%*p%GS>8x=60SqXYxtWXP$H!(W>R+frwfxb4F2TOL?)t;kQ z-?F(_Il}OC#eNR8s86*iwHCHnm@%jj zU8NqS9{t9>-!1wlRV31w9Z**CuV$X7f{WP@x zP#LpPWYu>I@pQ^x2#tZ>A2--ZQJ=0y@FR@;+a;T<0Vy_QV1QATUvsf~$DQf8Y&l`0 zcu9~dc&wKYV&iW<2obg^rZ9Q%st3#|-8GYI5?nL3ZH0>y%4pNEw1~+_xOnx+#!B@Q zpCZ|GaxI!sFqU^PwpbL%uovRNWn0D#9F7LU`W*{X=;K6LH`l~+s=)+~!-L%|St$@s zkp4C}?ls0)4(Mre02u!rfZgRY5RM^W{GX_0H`f^RE#T^U?~`Hj_n+YU$APBTSi*l( zikHolz?Eqisy_UU`cJk$bqc-c4*iqIHkE6MJ9+$~Ujnmfp5U^WF&Z<%LV`1;z)a(?|jtUz^L*Jpt)bB}YS~&sS)sv>LuHcSLzwAGI!{hlkRrVt!joF`d?w zCA-gh^zHCVxK#66(~R{)F(GerQTP1)%@>3P^mKuHU;64Q^V$Y6!OxDe=ds9|v#Y|8 zPSsm9>c26%T%AkRddsOhs=%J>|FmE^bMA*c. """ import os +import sys import argparse +import subprocess from bot.targets import Targets from bot.crawlers.factory import new_crawler +def filename(f): + if not os.path.isfile(f): + raise argparse.ArgumentTypeError(f"`{f}` is not a valid filename") + elif not os.access(f, os.R_OK): + raise argparse.ArgumentTypeError(f"Cannot access file `{f}`") + else: + return f + + def directory(d): if not os.path.isdir(d): raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory") @@ -35,34 +46,71 @@ def parse_args(target_names): parser = argparse.ArgumentParser( prog="jitenbot", description="Convert Japanese dictionary files to new formats.", + epilog="See README.md for details regarding media directory structures", ) parser.add_argument( "target", choices=target_names, - help="name of dictionary to convert" + help="name of dictionary to convert", ) parser.add_argument( "-p", "--page-dir", help="path to directory containing XML page files", - type=directory + type=directory, ) parser.add_argument( - "-i", "--image-dir", - help="path to directory containing image folders (gaiji, graphics, etc.)", - type=directory + "-m", "--media-dir", + help="path to directory containing media folders (gaiji, graphics, audio, etc.)", + type=directory, + ) + parser.add_argument( + "-i", "--mdict-icon", + help="path to icon file to be used with MDict", + type=filename, + ) + parser.add_argument( + "--no-yomichan-export", + help="skip export of dictionary data to Yomichan format", + action='store_true', + ) + parser.add_argument( + "--no-mdict-export", + help="skip export of dictionary data to MDict format", + action='store_true', ) args = parser.parse_args() return args +def test_mdict(): + try: + subprocess.run( + ["mdict", "--version"], + check=True, + stdout=subprocess.DEVNULL, + ) + except FileNotFoundError: + print("Could not find `mdict` pack tool.") + print("Ensure that mdict-utils is installed and") + print("included in the environment PATH.\n") + print("Mdict export functionality may also be") + print("disabled with the --no-mdict-export flag.") + sys.exit() + + def main(): target_names = [x.value for x in Targets] args = parse_args(target_names) + if not args.no_mdict_export: + test_mdict() selected_target = Targets(args.target) crawler = new_crawler(selected_target) crawler.collect_pages(args.page_dir) crawler.read_pages() - crawler.make_yomichan_dictionary(args.image_dir) + if not args.no_yomichan_export: + crawler.make_yomichan_dictionary(args.media_dir) + if not args.no_mdict_export: + crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 1c111af..8802356 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ css-parser==1.0.8 html5lib==1.1 idna==3.4 lxml==4.9.2 +mdict-utils==1.3.12 Pillow==9.5.0 platformdirs==3.5.0 requests==2.29.0 @@ -13,5 +14,7 @@ six==1.16.0 soupsieve==2.4.1 SudachiDict-full==20230110 SudachiPy==0.6.7 +tqdm==4.65.0 urllib3==1.26.15 webencodings==0.5.1 +xxhash==3.2.0 diff --git a/run_all.sh b/run_all.sh new file mode 100644 index 0000000..2bdd31e --- /dev/null +++ b/run_all.sh @@ -0,0 +1,13 @@ +python jitenbot.py jitenon-kokugo +python jitenbot.py jitenon-yoji +python jitenbot.py jitenon-kotowaza + +python jitenbot.py smk8 \ + --media-dir monokakido/SMK8/media \ + --page-dir monokakido/SMK8/pages \ + --mdict-icon monokakido/SMK8/SMK8-76@3x.png + +python jitenbot.py daijirin2 \ + --media-dir monokakido/DAIJIRIN2/media \ + --page-dir monokakido/DAIJIRIN2/pages \ + --mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png