diff --git a/.gitignore b/.gitignore index b009cb5..4c7985d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ webcache/ output/ notes/ +monokakido/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 88d0f2b..5a872ea 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ compiling the scraped data into compact dictionary file formats. # Usage ``` -usage: jitenbot [-h] [-p PAGE_DIR] [-i IMAGE_DIR] +usage: jitenbot [-h] [-p PAGE_DIR] [-m MEDIA_DIR] [-i MDICT_ICON] + [--no-yomichan-export] [--no-mdict-export] {jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2} Convert Japanese dictionary files to new formats. @@ -62,9 +63,15 @@ options: -h, --help show this help message and exit -p PAGE_DIR, --page-dir PAGE_DIR path to directory containing XML page files - -i IMAGE_DIR, --image-dir IMAGE_DIR - path to directory containing image folders (gaiji, - graphics, etc.) + -m MEDIA_DIR, --media-dir MEDIA_DIR + path to directory containing media folders (gaiji, + graphics, audio, etc.) + -i MDICT_ICON, --mdict-icon MDICT_ICON + path to icon file to be used with MDict + --no-yomichan-export skip export of dictionary data to Yomichan format + --no-mdict-export skip export of dictionary data to MDict format + +See README.md for details regarding media directory structures ``` ### Online Targets Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/). @@ -75,8 +82,55 @@ HTTP request headers (user agent string, etc.) may be customized by editing the [user config directory](https://pypi.org/project/platformdirs/). ### Offline Targets -Page data and image data must be procured by the user +Page data and media data must be [procured by the user](https://github.com/golddranks/monokakido/) and passed to jitenbot via the appropriate command line flags. +
+<details>
+<summary>smk8 media directory</summary>
+
+Since Yomichan does not support audio files from imported
+dictionaries, the `audio/` directory may be omitted to reduce the
+size of the output ZIP file.
+
+```
+media
+├── Audio.png
+├── audio
+│   ├── 00001.aac
+│   ├── 00002.aac
+│   ├── 00003.aac
+│   │  ...
+│   └── 82682.aac
+└── gaiji
+    ├── 1d110.svg
+    ├── 1d15d.svg
+    ├── 1d15e.svg
+    │  ...
+    └── xbunnoa.svg
+```
+</details>
+
+<details>
+<summary>daijirin2 media directory</summary>
+
+The `graphics/` directory may be omitted to save space if desired.
+
+```
+media
+├── gaiji
+│   ├── 1D10B.svg
+│   ├── 1D110.svg
+│   ├── 1D12A.svg
+│   │  ...
+│   └── vectorOB.svg
+└── graphics
+    ├── 3djr_0002.png
+    ├── 3djr_0004.png
+    ├── 3djr_0005.png
+    │  ...
+    └── 4djr_yahazu.png
+```
+</details>
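+
+For example, a local copy of the smk8 data could be converted with a
+command along the following lines (the directory paths shown here are
+illustrative, not required):
+
+```
+jitenbot smk8 \
+    --page-dir monokakido/smk8/pages \
+    --media-dir monokakido/smk8/media \
+    --mdict-icon monokakido/smk8/icon.png
+```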
+ # Attribution `Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1). diff --git a/TODO.md b/TODO.md index 30c860d..2f2a5d5 100644 --- a/TODO.md +++ b/TODO.md @@ -1,11 +1,11 @@ ### Todo +- [x] Add factory classes to reduce the amount of class import statements +- [x] Support exporting to MDict (.MDX) dictionary format - [ ] Add test suite - [ ] Add documentation (docstrings, etc.) - [ ] Validate JSON schema of Yomichan terms during export -- [ ] Add factory classes to reduce the amount of class import statements - [ ] Add build scripts for producing program binaries -- [ ] Support exporting to MDict (.MDX) dictionary format - [ ] Validate scraped webpages after downloading - [ ] Log non-fatal failures to a log file instead of raising exceptions - [ ] Support more dictionary websites diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py index c7bf8ea..97b3794 100644 --- a/bot/crawlers/crawlers.py +++ b/bot/crawlers/crawlers.py @@ -5,7 +5,8 @@ from bs4 import BeautifulSoup import bot.scraper as Scraper from bot.entries.factory import new_entry -from bot.yomichan.exporters.factory import new_exporter +from bot.yomichan.exporters.factory import new_yomi_exporter +from bot.mdict.exporters.factory import new_mdict_exporter class Crawler(ABC): @@ -38,9 +39,13 @@ class Crawler(ABC): self._entries.append(entry) print() - def make_yomichan_dictionary(self, image_dir): - exporter = new_exporter(self._target) - exporter.export(self._entries, image_dir) + def make_yomichan_dictionary(self, media_dir): + exporter = new_yomi_exporter(self._target) + exporter.export(self._entries, media_dir) + + def make_mdict_dictionary(self, media_dir, icon_file): + exporter = new_mdict_exporter(self._target) + exporter.export(self._entries, media_dir, icon_file) def _parse_page_id(self, page_link): m = re.search(self._page_id_pattern, page_link) @@ -142,10 +147,8 @@ class _MonokakidoCrawler(Crawler): class Smk8Crawler(_MonokakidoCrawler): - def __init__(self, target): - super().__init__(target) + pass class Daijirin2Crawler(_MonokakidoCrawler): - def __init__(self, target): - super().__init__(target) + pass diff --git a/bot/data.py b/bot/data.py index 5d68769..3b1effd 100644 --- a/bot/data.py +++ b/bot/data.py @@ -99,15 +99,15 @@ def load_daijirin2_kana_abbreviations(): @cache -def load_smk8_yomichan_name_conversion(): - file_name = os.path.join("smk8", "yomichan_name_conversion.json") +def load_yomichan_name_conversion(target): + file_name = os.path.join(target.value, "yomichan_name_conversion.json") data = __load_json(file_name) return data @cache -def load_daijirin2_yomichan_name_conversion(): - file_name = os.path.join("daijirin2", "yomichan_name_conversion.json") +def load_mdict_name_conversion(target): + file_name = os.path.join(target.value, "mdict_name_conversion.json") data = __load_json(file_name) return data diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py index 1463442..196bd0c 100644 --- a/bot/entries/daijirin2.py +++ b/bot/entries/daijirin2.py @@ -1,4 +1,3 @@ -import re from bs4 import BeautifulSoup import bot.entries.expressions as Expressions @@ -10,19 +9,17 @@ from bot.entries.daijirin2_preprocess import preprocess_page class _BaseDaijirin2Entry(Entry): - ID_TO_ENTRY = {} - SUBENTRY_ID_TO_ENTRY_ID = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: 
{entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.children = [] self.phrases = [] self._kana_abbreviations = load_daijirin2_kana_abbreviations() + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + def set_page(self, page): page = self.__decompose_subentries(page) self._page = page @@ -57,14 +54,7 @@ class _BaseDaijirin2Entry(Entry): else: self._part_of_speech_tags.append(pos) - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - - def _set_regular_headwords(self, soup): + def _get_regular_headwords(self, soup): self._fill_alts(soup) reading = soup.find("見出仮名").text expressions = [] @@ -78,10 +68,11 @@ class _BaseDaijirin2Entry(Entry): expressions = Expressions.expand_abbreviation_list(expressions) if len(expressions) == 0: expressions.append(reading) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) @@ -101,7 +92,7 @@ class _BaseDaijirin2Entry(Entry): tag_soup.name = "項目" subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(subentry_id) + subentry = subentry_class(self.target, subentry_id) page = tag_soup.decode() subentry.set_page(page) subentry_list.append(subentry) @@ -122,6 +113,8 @@ class _BaseDaijirin2Entry(Entry): @staticmethod def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" unused_nodes = [ "漢字音logo", "活用分節", "連語句活用分節", "語構成", "表外字マーク", "表外字マーク", "ルビG" @@ -144,25 +137,26 @@ class _BaseDaijirin2Entry(Entry): class Daijirin2Entry(_BaseDaijirin2Entry): - def __init__(self, page_id): + def __init__(self, target, page_id): entry_id = (page_id, 0) - super().__init__(entry_id) + super().__init__(target, entry_id) def set_page(self, page): page = preprocess_page(page) super().set_page(page) - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() self._delete_unused_nodes(soup) if soup.find("漢字見出") is not None: - self._set_kanji_headwords(soup) + headwords = self._get_kanji_headwords(soup) elif soup.find("略語G") is not None: - self._set_acronym_headwords(soup) + headwords = self._get_acronym_headwords(soup) else: - self._set_regular_headwords(soup) + headwords = self._get_regular_headwords(soup) + return headwords - def _set_kanji_headwords(self, soup): + def _get_kanji_headwords(self, soup): readings = [] for el in soup.find_all("漢字音"): hira = Expressions.kata_to_hira(el.text) @@ -172,11 +166,12 @@ class Daijirin2Entry(_BaseDaijirin2Entry): expressions = [] for el in soup.find_all("漢字見出"): expressions.append(el.text) - self._headwords = {} + headwords = {} for reading in readings: - self._headwords[reading] = expressions + headwords[reading] = expressions + return headwords - def _set_acronym_headwords(self, soup): + def _get_acronym_headwords(self, soup): expressions = [] for el in soup.find_all("略語"): 
expression_parts = [] @@ -184,29 +179,24 @@ class Daijirin2Entry(_BaseDaijirin2Entry): expression_parts.append(part.text) expression = "".join(expression_parts) expressions.append(expression) - self._headwords = {"": expressions} + headwords = {"": expressions} + return headwords class Daijirin2ChildEntry(_BaseDaijirin2Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() self._delete_unused_nodes(soup) - self._set_regular_headwords(soup) + headwords = self._get_regular_headwords(soup) + return headwords class Daijirin2PhraseEntry(_BaseDaijirin2Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - self.__phrase_readings = load_daijirin2_phrase_readings() - def get_part_of_speech_tags(self): # phrases do not contain these tags return [] - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() headwords = {} expressions = self._find_expressions(soup) @@ -217,7 +207,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): headwords[reading].append(expression) else: headwords[reading] = [expression] - self._headwords = headwords + return headwords def _find_expressions(self, soup): self._delete_unused_nodes(soup) @@ -231,7 +221,8 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): return expressions def _find_readings(self): - text = self.__phrase_readings[self.entry_id] + phrase_readings = load_daijirin2_phrase_readings() + text = phrase_readings[self.entry_id] alternatives = Expressions.expand_daijirin_alternatives(text) readings = [] for alt in alternatives: diff --git a/bot/entries/entry.py b/bot/entries/entry.py index 57316f6..3811a77 100644 --- a/bot/entries/entry.py +++ b/bot/entries/entry.py @@ -2,12 +2,24 @@ from abc import ABC, abstractmethod class Entry(ABC): - def __init__(self, entry_id): + ID_TO_ENTRY = {} + SUBENTRY_ID_TO_ENTRY_ID = {} + + def __init__(self, target, entry_id): + if entry_id not in self.ID_TO_ENTRY: + self.ID_TO_ENTRY[entry_id] = self + else: + raise Exception(f"Duplicate entry ID: {entry_id}") + self.target = target self.entry_id = entry_id self._page = None self._headwords = None self._part_of_speech_tags = None + @abstractmethod + def get_global_identifier(self): + pass + @abstractmethod def set_page(self, page): pass @@ -16,14 +28,34 @@ class Entry(ABC): def get_page_soup(self): pass - @abstractmethod def get_headwords(self): + if self._headwords is not None: + return self._headwords + headwords = self._get_headwords() + self._add_variant_expressions(headwords) + self._headwords = headwords + return headwords + + @abstractmethod + def _get_headwords(self): + pass + + @abstractmethod + def _add_variant_expressions(self, headwords): pass @abstractmethod def get_part_of_speech_tags(self): pass + def get_parent(self): + if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID: + parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] + parent = self.ID_TO_ENTRY[parent_id] + else: + parent = None + return parent + def get_first_expression(self): headwords = self.get_headwords() expressions = next(iter(headwords.values())) diff --git a/bot/entries/factory.py b/bot/entries/factory.py index 23ca066..a3dec69 100644 --- a/bot/entries/factory.py +++ b/bot/entries/factory.py @@ -15,4 +15,4 @@ def new_entry(target, page_id): Targets.SMK8: Smk8Entry, Targets.DAIJIRIN2: Daijirin2Entry, } - return entry_map[target](page_id) + return entry_map[target](target, page_id) diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index 
fd9fcd2..65c4d2e 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -1,4 +1,5 @@ import re +from abc import abstractmethod from datetime import datetime, date from bs4 import BeautifulSoup @@ -7,18 +8,17 @@ import bot.entries.expressions as Expressions class _JitenonEntry(Entry): - ID_TO_ENTRY = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: {entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.expression = "" + self.yomikata = "" + self.definition = "" + self.other_forms = [] self.modified_date = date(1970, 1, 1) self.attribution = "" - for column in self._COLUMNS.values(): - setattr(self, column[0], column[1]) + + def get_global_identifier(self): + return f"@{self.target.value}-{format(self.entry_id, '06')}" def set_page(self, page): soup = BeautifulSoup(page, features="html5lib") @@ -39,36 +39,33 @@ class _JitenonEntry(Entry): soup = BeautifulSoup(self._page, "html5lib") return soup - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - def get_part_of_speech_tags(self): # Jitenon doesn't have any return [] - def _set_headwords(self): + def _get_headwords(self): headwords = {} - for yomikata in self._yomikatas(): - headwords[yomikata] = [self.expression] - ikei_headwords = self._ikei_headwords() - for reading, expressions in ikei_headwords.items(): + for reading in self._get_readings(): + headwords[reading] = [self.expression] + other_form_headwords = self._other_form_headwords() + for reading, expressions in other_form_headwords.items(): if reading not in headwords: headwords[reading] = [] for expression in expressions: if expression not in headwords[reading]: headwords[reading].append(expression) - self._headwords = headwords + return headwords + + @abstractmethod + def _get_column_map(self): + pass def __set_modified_date(self, page): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page) if m is None: return - date = datetime.strptime(m.group(1), '%Y-%m-%d').date() - self.modified_date = date + modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date() + self.modified_date = modified_date def __set_attribution(self, soup): attribution = soup.find(class_="copyright") @@ -78,7 +75,8 @@ class _JitenonEntry(Entry): self.attribution = "" def __set_column(self, colname, colval): - attr_name = self._COLUMNS[colname][0] + column_map = self._get_column_map() + attr_name = column_map[colname] attr_value = getattr(self, attr_name) if isinstance(attr_value, str): setattr(self, attr_name, colval) @@ -88,7 +86,7 @@ class _JitenonEntry(Entry): else: attr_value.append(colval) - def _yomikatas(self): + def _get_readings(self): yomikata = self.yomikata m = re.search(r"^[ぁ-ヿ、]+$", yomikata) if m: @@ -109,20 +107,20 @@ class _JitenonEntry(Entry): print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") return [""] - def _ikei_headwords(self): - ikei_headwords = {} - for val in self.ikei: + def _other_form_headwords(self): + other_form_headwords = {} + for val in self.other_forms: m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val) if not m: print(f"Invalid 異形 format: {val}\n{self}\n") continue expression = m.group(1) reading = m.group(2) - if reading not in ikei_headwords: - ikei_headwords[reading] = [] - if expression not in ikei_headwords[reading]: - 
ikei_headwords[reading].append(expression) - return ikei_headwords + if reading not in other_form_headwords: + other_form_headwords[reading] = [] + if expression not in other_form_headwords[reading]: + other_form_headwords[reading].append(expression) + return other_form_headwords @staticmethod def __clean_text(text): @@ -133,9 +131,10 @@ class _JitenonEntry(Entry): return text def __str__(self): + column_map = self._get_column_map() colvals = [str(self.entry_id)] - for attr in self._COLUMNS.values(): - attr_val = getattr(self, attr[0]) + for attr_name in column_map.values(): + attr_val = getattr(self, attr_name) if isinstance(attr_val, str): colvals.append(attr_val) elif isinstance(attr_val, list): @@ -144,83 +143,100 @@ class _JitenonEntry(Entry): class JitenonYojiEntry(_JitenonEntry): - _COLUMNS = { - "四字熟語": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "漢検級": ["kankenkyuu", ""], - "場面用途": ["bamenyouto", ""], - "異形": ["ikei", []], - "類義語": ["ruigigo", []], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.kanken_level = "" + self.category = "" + self.related_expressions = [] - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "四字熟語": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "漢検級": "kanken_level", + "場面用途": "category", + "類義語": "related_expressions", + } - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) class JitenonKotowazaEntry(_JitenonEntry): - _COLUMNS = { - "言葉": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "例文": ["reibun", ""], - "異形": ["ikei", []], - "類句": ["ruiku", []], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.example = "" + self.related_expressions = [] - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "例文": "example", + "類句": "related_expressions", + } - def _set_headwords(self): + def _get_headwords(self): if self.expression == "金棒引き・鉄棒引き": - self._headwords = { + headwords = { "かなぼうひき": ["金棒引き", "鉄棒引き"] } else: - super()._set_headwords() + headwords = super()._get_headwords() + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) class JitenonKokugoEntry(_JitenonEntry): - _COLUMNS = { - "言葉": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "例文": ["reibun", ""], - "別表記": ["betsuhyouki", ""], - "対義語": ["taigigo", ""], - "活用": ["katsuyou", ""], - "用例": ["yourei", ""], - "類語": ["ruigo", ""], - } + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.example = "" + self.alt_expression = "" + self.antonym = "" + self.attachments = "" + self.compounds = "" + self.related_words = "" - def __init__(self, entry_id): - super().__init__(entry_id) + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "例文": "example", + "別表記": 
"alt_expression", + "対義語": "antonym", + "活用": "attachments", + "用例": "compounds", + "類語": "related_words", + } - def _set_headwords(self): + def _get_headwords(self): headwords = {} for reading in self.yomikata.split("・"): if reading not in headwords: headwords[reading] = [] for expression in self.expression.split("・"): headwords[reading].append(expression) - if self.betsuhyouki.strip() != "": - for expression in self.betsuhyouki.split("・"): + if self.alt_expression.strip() != "": + for expression in self.alt_expression.split("・"): headwords[reading].append(expression) - self._headwords = headwords + return headwords - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py index 11ef7e6..2308893 100644 --- a/bot/entries/smk8.py +++ b/bot/entries/smk8.py @@ -1,4 +1,3 @@ -import re from bs4 import BeautifulSoup import bot.entries.expressions as Expressions @@ -9,19 +8,17 @@ from bot.entries.smk8_preprocess import preprocess_page class _BaseSmk8Entry(Entry): - ID_TO_ENTRY = {} - SUBENTRY_ID_TO_ENTRY_ID = {} - - def __init__(self, entry_id): - super().__init__(entry_id) - if entry_id not in self.ID_TO_ENTRY: - self.ID_TO_ENTRY[entry_id] = self - else: - raise Exception(f"Duplicate entry ID: {entry_id}") + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.children = [] self.phrases = [] self.kanjis = [] + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + def set_page(self, page): page = self.__decompose_subentries(page) self._page = page @@ -30,13 +27,6 @@ class _BaseSmk8Entry(Entry): soup = BeautifulSoup(self._page, "xml") return soup - def get_headwords(self): - if self._headwords is not None: - return self._headwords - self._set_headwords() - self._set_variant_headwords() - return self._headwords - def get_part_of_speech_tags(self): if self._part_of_speech_tags is not None: return self._part_of_speech_tags @@ -50,8 +40,8 @@ class _BaseSmk8Entry(Entry): self._part_of_speech_tags.append(tag.text) return self._part_of_speech_tags - def _set_variant_headwords(self): - for expressions in self._headwords.values(): + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) @@ -87,7 +77,7 @@ class _BaseSmk8Entry(Entry): tag_soup.name = "項目" subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(subentry_id) + subentry = subentry_class(self.target, subentry_id) page = tag_soup.decode() subentry.set_page(page) subentry_list.append(subentry) @@ -106,6 +96,16 @@ class _BaseSmk8Entry(Entry): else: raise Exception(f"Invalid entry ID: {id_string}") + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + @staticmethod def 
_clean_expression(expression): for x in ["〈", "〉", "{", "}", "…", " "]: @@ -114,24 +114,24 @@ class _BaseSmk8Entry(Entry): @staticmethod def _fill_alts(soup): - for e in soup.find_all(["親見出仮名", "親見出表記"]): - e.string = e.attrs["alt"] + for el in soup.find_all(["親見出仮名", "親見出表記"]): + el.string = el.attrs["alt"] for gaiji in soup.find_all("外字"): gaiji.string = gaiji.img.attrs["alt"] class Smk8Entry(_BaseSmk8Entry): - def __init__(self, page_id): + def __init__(self, target, page_id): entry_id = (page_id, 0) - super().__init__(entry_id) + super().__init__(target, entry_id) def set_page(self, page): page = preprocess_page(page) super().set_page(page) - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() - Soup.delete_soup_nodes(soup, "表音表記") + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self._find_reading(soup) expressions = [] @@ -140,16 +140,14 @@ class Smk8Entry(_BaseSmk8Entry): for expression in self._find_expressions(soup): if expression not in expressions: expressions.append(expression) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords class Smk8ChildEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() - Soup.delete_soup_nodes(soup, "表音表記") + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self._find_reading(soup) expressions = [] @@ -158,19 +156,20 @@ class Smk8ChildEntry(_BaseSmk8Entry): for expression in self._find_expressions(soup): if expression not in expressions: expressions.append(expression) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords class Smk8PhraseEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) + def __init__(self, target, entry_id): + super().__init__(target, entry_id) self.__phrase_readings = load_smk8_phrase_readings() def get_part_of_speech_tags(self): # phrases do not contain these tags return [] - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() headwords = {} expressions = self._find_expressions(soup) @@ -181,10 +180,10 @@ class Smk8PhraseEntry(_BaseSmk8Entry): headwords[reading].append(expression) else: headwords[reading] = [expression] - self._headwords = headwords + return headwords def _find_expressions(self, soup): - Soup.delete_soup_nodes(soup, "ルビG") + self._delete_unused_nodes(soup) self._fill_alts(soup) text = soup.find("標準表記").text text = self._clean_expression(text) @@ -206,15 +205,14 @@ class Smk8PhraseEntry(_BaseSmk8Entry): class Smk8KanjiEntry(_BaseSmk8Entry): - def __init__(self, entry_id): - super().__init__(entry_id) - - def _set_headwords(self): + def _get_headwords(self): soup = self.get_page_soup() + self._delete_unused_nodes(soup) self._fill_alts(soup) reading = self.__get_parent_reading() expressions = self._find_expressions(soup) - self._headwords = {reading: expressions} + headwords = {reading: expressions} + return headwords def __get_parent_reading(self): parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8_preprocess.py index 2e480a8..5c9b924 100644 --- a/bot/entries/smk8_preprocess.py +++ b/bot/entries/smk8_preprocess.py @@ -15,6 +15,7 @@ def preprocess_page(page): page = __strip_page(page) page = __replace_glyph_codes(page) page = __format_hyougai_marks(page) + page = __remove_pronunciation_parentheses(page) return page @@ -64,6 
+65,7 @@ def __format_hyougai_marks(page): for x in ["\n", "\t", " "]: text = text.replace(x, "") text = re.sub(r"〈([^〈]+)〉", r"\1", text) + page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page) for mark in re.findall(r"《.", text): if mark[1] == "〓": @@ -79,13 +81,29 @@ def __format_hyougai_marks(page): page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})", r"\1<表外字>\2", page, count=1) + page = page.replace("␂", "〈") page = page.replace("␃", "〉") soup = BeautifulSoup(page, features="xml") + for el in soup.find_all("表外音訓"): if el.text == "": el.append(el.next_sibling) + mark_xml = "<表外音訓マーク>︽" + mark_soup = BeautifulSoup(mark_xml, "xml") + el.append(mark_soup.表外音訓マーク) + for el in soup.find_all("表外字"): if el.text == "": el.append(el.next_sibling) + mark_xml = "<表外字マーク>︿" + mark_soup = BeautifulSoup(mark_xml, "xml") + el.append(mark_soup.表外字マーク) + return soup.decode() + + +def __remove_pronunciation_parentheses(page): + page = page.replace("<表音表記>(", "<表音表記>") + page = page.replace(")", "") + return page diff --git a/bot/mdict/exporters/export.py b/bot/mdict/exporters/export.py new file mode 100644 index 0000000..2d76f1d --- /dev/null +++ b/bot/mdict/exporters/export.py @@ -0,0 +1,204 @@ +# pylint: disable=too-few-public-methods + +import subprocess +import os +import shutil +from abc import ABC, abstractmethod +from pathlib import Path +from datetime import datetime +from platformdirs import user_documents_dir, user_cache_dir + +from bot.targets import Targets +from bot.mdict.terms.factory import new_terminator + + +class Exporter(ABC): + def __init__(self, target): + self._target = target + self._terminator = new_terminator(target) + self._build_dir = None + self._build_media_dir = None + self._description_file = None + self._out_dir = None + + def export(self, entries, media_dir, icon_file): + self._init_build_media_dir(media_dir) + self._init_description_file(entries) + terms = self._get_terms(entries) + print(f"Exporting {len(terms)} Mdict keys...") + self._write_mdx_file(terms) + self._write_mdd_file() + self._write_icon_file(icon_file) + self._rm_build_dir() + + def _get_build_dir(self): + if self._build_dir is not None: + return self._build_dir + cache_dir = user_cache_dir("jitenbot") + build_directory = os.path.join(cache_dir, "mdict_build") + if Path(build_directory).is_dir(): + shutil.rmtree(build_directory) + os.makedirs(build_directory) + self._build_dir = build_directory + return self._build_dir + + def _init_build_media_dir(self, media_dir): + build_dir = self._get_build_dir() + build_media_dir = os.path.join(build_dir, self._target.value) + if media_dir is not None: + print("Copying media files to build directory...") + shutil.copytree(media_dir, build_media_dir) + else: + os.makedirs(build_media_dir) + css_file = self._get_css_file() + shutil.copy(css_file, build_media_dir) + self._terminator.set_media_dir(build_media_dir) + self._build_media_dir = build_media_dir + + def _init_description_file(self, entries): + filename = f"{self._target.value}.mdx.description.html" + original_file = os.path.join( + "data", "mdict", "description", filename) + with open(original_file, "r", encoding="utf8") as f: + description = f.read() + description = description.replace( + "{{revision}}", self._get_revision(entries)) + description = description.replace( + "{{attribution}}", self._get_attribution(entries)) + build_dir = self._get_build_dir() + description_file = os.path.join(build_dir, filename) + with open(description_file, "w", encoding="utf8") as f: + f.write(description) + self._description_file = 
description_file + + def _get_terms(self, entries): + terms = [] + entries_len = len(entries) + for idx, entry in enumerate(entries): + update = f"Creating Mdict terms for entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + new_terms = self._terminator.make_terms(entry) + for term in new_terms: + terms.append(term) + print() + return terms + + def _write_mdx_file(self, terms): + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.mdx") + params = [ + "mdict", + "-a", self._get_term_file(terms), + "--title", self._get_title_file(), + "--description", self._description_file, + out_file + ] + subprocess.run(params, check=True) + + def _write_mdd_file(self): + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.mdd") + params = [ + "mdict", + "-a", self._build_media_dir, + "--title", self._get_title_file(), + "--description", self._description_file, + out_file + ] + subprocess.run(params, check=True) + + def _write_icon_file(self, icon_file): + premade_icon_file = f"data/mdict/icon/{self._target.value}.png" + out_dir = self._get_out_dir() + out_file = os.path.join(out_dir, f"{self._target.value}.png") + if icon_file is not None and Path(icon_file).is_file(): + shutil.copy(icon_file, out_file) + elif Path(premade_icon_file).is_file(): + shutil.copy(premade_icon_file, out_file) + + def _get_out_dir(self): + if self._out_dir is not None: + return self._out_dir + out_dir = os.path.join( + user_documents_dir(), "jitenbot", "mdict", self._target.value) + if Path(out_dir).is_dir(): + shutil.rmtree(out_dir) + os.makedirs(out_dir) + self._out_dir = out_dir + return out_dir + + def _get_term_file(self, terms): + build_dir = self._get_build_dir() + term_file = os.path.join(build_dir, f"{self._target.value}.mdx.txt") + with open(term_file, "w", encoding="utf8") as f: + for term in terms: + f.write("\n".join(term)) + f.write("\n\n") + return term_file + + def _get_title_file(self): + return os.path.join( + "data", "mdict", "title", + f"{self._target.value}.mdx.title.html") + + def _get_css_file(self): + return os.path.join( + "data", "mdict", "css", + f"{self._target.value}.css") + + def _rm_build_dir(self): + build_dir = self._get_build_dir() + shutil.rmtree(build_dir) + + @abstractmethod + def _get_revision(self, entries): + pass + + @abstractmethod + def _get_attribution(self, entries): + pass + + +class _JitenonExporter(Exporter): + def _get_revision(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + revision = modified_date.strftime("%Y年%m月%d日閲覧") + return revision + + def _get_attribution(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + attribution = entry.attribution + return attribution + + +class JitenonKokugoExporter(_JitenonExporter): + pass + + +class JitenonYojiExporter(_JitenonExporter): + pass + + +class JitenonKotowazaExporter(_JitenonExporter): + pass + + +class _MonokakidoExporter(Exporter): + def _get_revision(self, entries): + timestamp = datetime.now().strftime("%Y年%m月%d日作成") + return timestamp + + +class Smk8Exporter(_MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2020" + + +class Daijirin2Exporter(_MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 
2019" diff --git a/bot/mdict/exporters/factory.py b/bot/mdict/exporters/factory.py new file mode 100644 index 0000000..2c2015c --- /dev/null +++ b/bot/mdict/exporters/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.mdict.exporters.export import JitenonKokugoExporter +from bot.mdict.exporters.export import JitenonYojiExporter +from bot.mdict.exporters.export import JitenonKotowazaExporter +from bot.mdict.exporters.export import Smk8Exporter +from bot.mdict.exporters.export import Daijirin2Exporter + + +def new_mdict_exporter(target): + exporter_map = { + Targets.JITENON_KOKUGO: JitenonKokugoExporter, + Targets.JITENON_YOJI: JitenonYojiExporter, + Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter, + Targets.SMK8: Smk8Exporter, + Targets.DAIJIRIN2: Daijirin2Exporter, + } + return exporter_map[target](target) diff --git a/bot/mdict/glossary/daijirin2.py b/bot/mdict/glossary/daijirin2.py new file mode 100644 index 0000000..1a8b0d5 --- /dev/null +++ b/bot/mdict/glossary/daijirin2.py @@ -0,0 +1,77 @@ +import re +import os +from functools import cache +from pathlib import Path + +from bot.soup import delete_soup_nodes +from bot.data import load_mdict_name_conversion +from bot.name_conversion import convert_names + + +def make_glossary(entry, media_dir): + soup = entry.get_page_soup() + __add_rubies(soup) + __hyperlink_parent_expression(soup, entry) + __delete_unused_nodes(soup, media_dir) + __convert_links(soup, entry) + + name_conversion = load_mdict_name_conversion(entry.target) + convert_names(soup, name_conversion) + + glossary = soup.span.decode() + return glossary + + +def __add_rubies(soup): + for name in ["表外音訓", "表外字"]: + for ruby in soup.find_all(name): + ruby.name = "ruby" + rt = ruby.find("表外字マーク") + rt.name = "rt" + ruby.append(rt) # needs to positioned after the text + + +def __hyperlink_parent_expression(soup, entry): + if soup.find("親表記") is None: + return + parent_entry = entry.get_parent() + gid = parent_entry.get_global_identifier() + for el in soup.find_all("親表記"): + el.name = "a" + el.attrs["href"] = f"entry://{gid}" + + +def __delete_unused_nodes(soup, media_dir): + if not __graphics_directory_exists(media_dir): + delete_soup_nodes(soup, "カットG") + for el in soup.find_all("logo"): + next_sibling = el.next_sibling + if next_sibling is None: + continue + elif next_sibling.name in ["漢字見出G", "漢字音G"]: + el.decompose() + for el in soup.find_all("漢字音G"): + for child in el.find_all(string="・"): + child.replace_with("") + + +@cache +def __graphics_directory_exists(media_dir): + path = os.path.join(media_dir, "graphics") + return Path(path).is_dir() + + +def __convert_links(soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href): + ref_entry_id = entry.id_string_to_entry_id(href) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^entry:", href): + pass + elif re.match(r"^https?:[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") diff --git a/bot/mdict/glossary/jitenon.py b/bot/mdict/glossary/jitenon.py new file mode 100644 index 0000000..737ea59 --- /dev/null +++ b/bot/mdict/glossary/jitenon.py @@ -0,0 +1,141 @@ +# pylint: disable=too-few-public-methods + +import re + + +class JitenonGlossary(): + def __init__(self): + self._id_pattern = None + self._expression_header = None + + def _replace_punctuation(self, soup): + punctuation = { + "/": "/", + ",": "、", + } + for el in 
soup.find_all(string=True): + text = el.text + for old, new in punctuation.items(): + text = text.replace(old, new) + el.replace_with(text) + + def _add_internal_links(self, soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + m = re.search(self._id_pattern, href) + if m is not None: + ref_entry_id = int(m.group(1)) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^(?:https?:|\?)[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") + + def _decompose_table_rows(self, soup, entry): + for tr in soup.find_all("tr"): + if tr.find("th") is None: + continue + elif tr.th.text == self._expression_header: + tr.decompose() + elif tr.th.text == "読み方": + if self._do_display_yomikata_in_headword(entry): + tr.decompose() + elif tr.th.text == "意味": + definition = tr.td + definition.name = "div" + definition.attrs["class"] = "意味" + soup.body.insert(0, definition) + tr.decompose() + if soup.find("tr") is None: + soup.table.decompose() + + def _insert_headword_line(self, soup, entry): + headword_line = soup.new_tag("div") + headword_line.attrs["class"] = "見出し" + if self._do_display_yomikata_in_headword(entry): + reading = soup.new_tag("span") + reading.attrs["class"] = "読み方" + reading.string = entry.yomikata + headword_line.append(reading) + expression = soup.new_tag("span") + expression.attrs["class"] = self._expression_header + expression.string = f"【{entry.expression}】" + headword_line.append(expression) + soup.body.insert(0, headword_line) + + def _do_display_yomikata_in_headword(self, entry): + if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): + return False + elif len(entry.yomikata) > 10: + return False + else: + return True + + +class JitenonKokugoGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + self._remove_antonym_list_item(soup) + self._replace_number_icons(soup, media_dir) + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary + + def _remove_antonym_list_item(self, soup): + for el in soup.find_all("li"): + if el.text == "対義語辞典": + el.decompose() + + def _replace_number_icons(self, soup, media_dir): + for el in soup.find_all("img"): + alt = el.attrs["alt"] + text = re.search(r"[0-9]+", alt).group(0) + el.name = "span" + el.string = text + del el.attrs["src"] + del el.attrs["alt"] + + def _do_display_yomikata_in_headword(self, entry): + return len(entry.yomikata) <= 10 + + +class JitenonYojiGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "四字熟語" + self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary + + +class JitenonKotowazaGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$" + + def make_glossary(self, entry, media_dir): + soup = entry.get_page_soup() + 
self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + glossary = soup.body.prettify() + return glossary diff --git a/bot/mdict/glossary/smk8.py b/bot/mdict/glossary/smk8.py new file mode 100644 index 0000000..613fc1b --- /dev/null +++ b/bot/mdict/glossary/smk8.py @@ -0,0 +1,67 @@ +import re + +from bot.soup import delete_soup_nodes +from bot.data import load_mdict_name_conversion +from bot.name_conversion import convert_names + + +def make_glossary(entry, media_dir): + soup = entry.get_page_soup() + __fill_alts(soup, entry) + __delete_unused_nodes(soup) + __convert_links(soup, entry) + __convert_priority_markers(soup) + + name_conversion = load_mdict_name_conversion(entry.target) + convert_names(soup, name_conversion) + + glossary = soup.span.decode() + return glossary + + +def __fill_alts(soup, entry): + names = ["親見出仮名", "親見出表記"] + if soup.find(names) is None: + return + parent_entry = entry.get_parent() + gid = parent_entry.get_global_identifier() + for el in soup.find_all(names): + el.name = "a" + alt = el.attrs["alt"] + el.string = alt + el.attrs["href"] = f"entry://{gid}" + del el.attrs["alt"] + + +def __delete_unused_nodes(soup): + for name in ["連濁"]: + delete_soup_nodes(soup, name) + + +def __convert_links(soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + if href.startswith("$"): + el.unwrap() + elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href): + ref_entry_id = entry.id_string_to_entry_id(href) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + gid = ref_entry.get_global_identifier() + el.attrs["href"] = f"entry://{gid}" + elif re.match(r"^[0-9]+[ab]?\.aac$", href): + el.attrs["href"] = f"sound://audio/{href}" + elif re.match(r"^entry:", href): + pass + elif re.match(r"^https?:[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") + + +def __convert_priority_markers(soup): + for el in soup.find_all("img", attrs={"alt": "*"}): + el.name = "span" + el.string = "*" + for el in soup.find_all("img", attrs={"alt": "⁑"}): + el.name = "span" + el.string = "**" diff --git a/bot/mdict/terms/daijirin2.py b/bot/mdict/terms/daijirin2.py new file mode 100644 index 0000000..3b5ce68 --- /dev/null +++ b/bot/mdict/terms/daijirin2.py @@ -0,0 +1,23 @@ +from bot.mdict.terms.terminator import Terminator +from bot.mdict.glossary.daijirin2 import make_glossary + + +class Daijirin2Terminator(Terminator): + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [ + [entry.children, "子項目"], + [entry.phrases, "句項目"], + ] + + def _subentry_lists(self, entry): + return [ + entry.children, + entry.phrases, + ] diff --git a/bot/mdict/terms/factory.py b/bot/mdict/terms/factory.py new file mode 100644 index 0000000..78a05cd --- /dev/null +++ b/bot/mdict/terms/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.mdict.terms.jitenon import JitenonKokugoTerminator +from bot.mdict.terms.jitenon import JitenonYojiTerminator +from bot.mdict.terms.jitenon import JitenonKotowazaTerminator +from bot.mdict.terms.smk8 import Smk8Terminator +from bot.mdict.terms.daijirin2 import Daijirin2Terminator + + +def new_terminator(target): + terminator_map = { + Targets.JITENON_KOKUGO: JitenonKokugoTerminator, + 
Targets.JITENON_YOJI: JitenonYojiTerminator, + Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator, + Targets.SMK8: Smk8Terminator, + Targets.DAIJIRIN2: Daijirin2Terminator, + } + return terminator_map[target](target) diff --git a/bot/mdict/terms/jitenon.py b/bot/mdict/terms/jitenon.py new file mode 100644 index 0000000..3f9cfc1 --- /dev/null +++ b/bot/mdict/terms/jitenon.py @@ -0,0 +1,42 @@ +from bot.mdict.terms.terminator import Terminator + +from bot.mdict.glossary.jitenon import JitenonKokugoGlossary +from bot.mdict.glossary.jitenon import JitenonYojiGlossary +from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary + + +class JitenonTerminator(Terminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = None + + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = self._glossary_maker.make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [] + + def _subentry_lists(self, entry): + return [] + + +class JitenonKokugoTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKokugoGlossary() + + +class JitenonYojiTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonYojiGlossary() + + +class JitenonKotowazaTerminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKotowazaGlossary() diff --git a/bot/mdict/terms/smk8.py b/bot/mdict/terms/smk8.py new file mode 100644 index 0000000..22275d5 --- /dev/null +++ b/bot/mdict/terms/smk8.py @@ -0,0 +1,24 @@ +from bot.mdict.terms.terminator import Terminator +from bot.mdict.glossary.smk8 import make_glossary + + +class Smk8Terminator(Terminator): + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [ + [entry.children, "子項目"], + [entry.phrases, "句項目"], + ] + + def _subentry_lists(self, entry): + return [ + entry.children, + entry.phrases, + entry.kanjis, + ] diff --git a/bot/mdict/terms/terminator.py b/bot/mdict/terms/terminator.py new file mode 100644 index 0000000..e69d9fb --- /dev/null +++ b/bot/mdict/terms/terminator.py @@ -0,0 +1,73 @@ +from abc import abstractmethod, ABC + + +class Terminator(ABC): + def __init__(self, target): + self._target = target + self._glossary_cache = {} + self._media_dir = None + + def set_media_dir(self, media_dir): + self._media_dir = media_dir + + def make_terms(self, entry): + gid = entry.get_global_identifier() + glossary = self.__full_glossary(entry) + terms = [[gid, glossary]] + keys = set() + headwords = entry.get_headwords() + for reading, expressions in headwords.items(): + if len(expressions) == 0: + keys.add(reading) + for expression in expressions: + if expression.strip() == "": + keys.add(reading) + continue + keys.add(expression) + if reading.strip() == "": + continue + if reading != expression: + keys.add(f"{reading}【{expression}】") + else: + keys.add(reading) + link = f"@@@LINK={gid}" + for key in keys: + if key.strip() != "": + terms.append([key, link]) + for subentries in self._subentry_lists(entry): + for subentry in subentries: + for term in 
self.make_terms(subentry):
+                    terms.append(term)
+        return terms
+
+    def __full_glossary(self, entry):
+        glossary = []
+        style_link = f"<link rel='stylesheet' href='{self._target.value}.css' type='text/css'>"
+        glossary.append(style_link)
+        glossary.append(self._glossary(entry))
+
+        for x in self._link_glossary_parameters(entry):
+            (subentries, list_title) = x
+            if len(subentries) == 0:
+                continue
+            items = []
+            for subentry in subentries:
+                exp = subentry.get_first_expression()
+                gid = subentry.get_global_identifier()
+                item = f"<li><a href='entry://{gid}'>{exp}</a></li>"
+                items.append(item)
+            link_glossary = f"<div data-child-links='{list_title}'><span>{list_title}</span><ul>{''.join(items)}</ul></div>
    " + glossary.append(link_glossary) + return "\n".join(glossary) + + @abstractmethod + def _glossary(self, entry): + pass + + @abstractmethod + def _link_glossary_parameters(self, entry): + pass + + @abstractmethod + def _subentry_lists(self, entry): + pass diff --git a/bot/yomichan/glossary/name_conversion.py b/bot/name_conversion.py similarity index 88% rename from bot/yomichan/glossary/name_conversion.py rename to bot/name_conversion.py index 776d65e..2c9b808 100644 --- a/bot/yomichan/glossary/name_conversion.py +++ b/bot/name_conversion.py @@ -30,7 +30,7 @@ def __apply_name_conversion_procedures(soup, procedures): "has_previous_sibling": __has_previous_sibling, "replace": __replace, "wrap": __wrap, - "add_ruby_text": __add_ruby_text, + "insert_span": __insert_span, } for procedure in procedures: function = functions[procedure["procedure_name"]] @@ -92,10 +92,9 @@ def __wrap(soup, l_wrap, r_wrap): soup.string = f"{l_wrap}{soup.text}{r_wrap}" -def __add_ruby_text(soup, mark, style): - if style.strip() != "": - markup = f"{mark}" - else: - markup = f"{mark}" - rt_soup = BeautifulSoup(markup, "xml") - soup.append(rt_soup.rt) +def __insert_span(soup, attr_name, attr_val): + span_markup = f"" + span_soup = BeautifulSoup(span_markup, "xml") + for content in reversed(soup.contents): + span_soup.span.insert(0, content.extract()) + soup.append(span_soup.span) diff --git a/bot/yomichan/exporters/export.py b/bot/yomichan/exporters/export.py index 4658030..03e1b95 100644 --- a/bot/yomichan/exporters/export.py +++ b/bot/yomichan/exporters/export.py @@ -1,15 +1,18 @@ +# pylint: disable=too-few-public-methods + import json import os import shutil from pathlib import Path from datetime import datetime +from abc import ABC, abstractmethod from platformdirs import user_documents_dir, user_cache_dir from bot.data import load_yomichan_metadata from bot.yomichan.terms.factory import new_terminator -class Exporter: +class Exporter(ABC): def __init__(self, target): self._target = target self._terminator = new_terminator(target) @@ -26,6 +29,14 @@ class Exporter: terms = self.__get_terms(entries) self.__make_dictionary(terms, index, tags) + @abstractmethod + def _get_revision(self, entries): + pass + + @abstractmethod + def _get_attribution(self, entries): + pass + def _get_build_dir(self): if self._build_dir is not None: return self._build_dir @@ -41,7 +52,7 @@ class Exporter: build_dir = self._get_build_dir() build_img_dir = os.path.join(build_dir, self._target.value) if image_dir is not None: - print("Copying image files to build directory...") + print("Copying media files to build directory...") shutil.copytree(image_dir, build_img_dir) else: os.makedirs(build_img_dir) @@ -93,7 +104,7 @@ class Exporter: def __write_archive(self, filename): archive_format = "zip" - out_dir = os.path.join(user_documents_dir(), "jitenbot") + out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan") if not Path(out_dir).is_dir(): os.makedirs(out_dir) out_file = f"{filename}.{archive_format}" @@ -110,10 +121,7 @@ class Exporter: shutil.rmtree(build_dir) -class JitenonExporter(Exporter): - def __init__(self, target): - super().__init__(target) - +class _JitenonExporter(Exporter): def _get_revision(self, entries): modified_date = None for entry in entries: @@ -130,25 +138,19 @@ class JitenonExporter(Exporter): return attribution -class JitenonKokugoExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonKokugoExporter(_JitenonExporter): + pass -class 
JitenonYojiExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonYojiExporter(_JitenonExporter): + pass -class JitenonKotowazaExporter(JitenonExporter): - def __init__(self, target): - super().__init__(target) +class JitenonKotowazaExporter(_JitenonExporter): + pass class Smk8Exporter(Exporter): - def __init__(self, target): - super().__init__(target) - def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") return f"{self._target.value};{timestamp}" @@ -158,9 +160,6 @@ class Smk8Exporter(Exporter): class Daijirin2Exporter(Exporter): - def __init__(self, target): - super().__init__(target) - def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") return f"{self._target.value};{timestamp}" diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py index 5ab9a6a..06568e3 100644 --- a/bot/yomichan/exporters/factory.py +++ b/bot/yomichan/exporters/factory.py @@ -7,7 +7,7 @@ from bot.yomichan.exporters.export import Smk8Exporter from bot.yomichan.exporters.export import Daijirin2Exporter -def new_exporter(target): +def new_yomi_exporter(target): exporter_map = { Targets.JITENON_KOKUGO: JitenonKokugoExporter, Targets.JITENON_YOJI: JitenonYojiExporter, diff --git a/bot/yomichan/glossary/daijirin2.py b/bot/yomichan/glossary/daijirin2.py index f2b6f2c..c42841c 100644 --- a/bot/yomichan/glossary/daijirin2.py +++ b/bot/yomichan/glossary/daijirin2.py @@ -6,9 +6,9 @@ from pathlib import Path import bot.icons as Icons from bot.soup import delete_soup_nodes -from bot.data import load_daijirin2_yomichan_name_conversion +from bot.data import load_yomichan_name_conversion from bot.yomichan.glossary.gloss import make_gloss -from bot.yomichan.glossary.name_conversion import convert_names +from bot.name_conversion import convert_names def make_glossary(entry, image_dir): @@ -26,7 +26,7 @@ def make_glossary(entry, image_dir): __convert_daigoginum(soup, image_dir) __convert_jundaigoginum(soup, image_dir) - name_conversion = load_daijirin2_yomichan_name_conversion() + name_conversion = load_yomichan_name_conversion(entry.target) convert_names(soup, name_conversion) gloss = make_gloss(soup.span) diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py index 6e3a192..ca76f19 100644 --- a/bot/yomichan/glossary/jitenon.py +++ b/bot/yomichan/glossary/jitenon.py @@ -58,9 +58,9 @@ class JitenonGlossary(): if self._do_display_yomikata_in_headword(entry): tr.decompose() elif tr.th.text == "意味": - imi = tr.td - imi.name = "div" - soup.body.insert(0, imi) + definition = tr.td + definition.name = "div" + soup.body.insert(0, definition) tr.decompose() if soup.find("tr") is None: soup.table.decompose() diff --git a/bot/yomichan/glossary/smk8.py b/bot/yomichan/glossary/smk8.py index 870c3fc..8754a02 100644 --- a/bot/yomichan/glossary/smk8.py +++ b/bot/yomichan/glossary/smk8.py @@ -4,9 +4,9 @@ from bs4 import BeautifulSoup import bot.icons as Icons from bot.soup import delete_soup_nodes -from bot.data import load_smk8_yomichan_name_conversion +from bot.data import load_yomichan_name_conversion from bot.yomichan.glossary.gloss import make_gloss -from bot.yomichan.glossary.name_conversion import convert_names +from bot.name_conversion import convert_names def make_glossary(entry, image_dir): @@ -20,7 +20,7 @@ def make_glossary(entry, image_dir): __convert_gaiji(soup, image_dir) __convert_rectangles(soup, image_dir) - name_conversion = load_smk8_yomichan_name_conversion() + name_conversion 
= load_yomichan_name_conversion(entry.target) convert_names(soup, name_conversion) gloss = make_gloss(soup.span) diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py index f74abaa..66bbed7 100644 --- a/bot/yomichan/terms/jitenon.py +++ b/bot/yomichan/terms/jitenon.py @@ -9,6 +9,7 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary class JitenonTerminator(Terminator): def __init__(self, target): super().__init__(target) + self._glossary_maker = None def _definition_tags(self, entry): return None @@ -51,7 +52,7 @@ class JitenonYojiTerminator(JitenonTerminator): return "" def _term_tags(self, entry): - tags = entry.kankenkyuu.split("/") + tags = entry.kanken_level.split("/") return " ".join(tags) diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py index d41a50a..dd0c02d 100644 --- a/bot/yomichan/terms/terminator.py +++ b/bot/yomichan/terms/terminator.py @@ -1,7 +1,8 @@ +from abc import abstractmethod, ABC from bot.data import load_yomichan_inflection_categories -class Terminator: +class Terminator(ABC): def __init__(self, target): self._target = target self._glossary_cache = {} @@ -62,3 +63,31 @@ class Terminator: } glossary.append(gloss) return glossary + + @abstractmethod + def _definition_tags(self, entry): + pass + + @abstractmethod + def _inflection_rules(self, entry, expression): + pass + + @abstractmethod + def _glossary(self, entry): + pass + + @abstractmethod + def _sequence(self, entry): + pass + + @abstractmethod + def _term_tags(self, entry): + pass + + @abstractmethod + def _link_glossary_parameters(self, entry): + pass + + @abstractmethod + def _subentry_lists(self, entry): + pass diff --git a/data/daijirin2/mdict_name_conversion.json b/data/daijirin2/mdict_name_conversion.json new file mode 100644 index 0000000..d783d28 --- /dev/null +++ b/data/daijirin2/mdict_name_conversion.json @@ -0,0 +1,12 @@ +{ + "a": {}, + "br": {}, + "img": {}, + "div": {}, + "span": {}, + "ruby": {}, + "rt": {}, + "p": {}, + "漢字音G": {"name": "ul"}, + "漢字音": {"name": "li"} +} diff --git a/data/mdict/css/daijirin2.css b/data/mdict/css/daijirin2.css new file mode 100644 index 0000000..703cb35 --- /dev/null +++ b/data/mdict/css/daijirin2.css @@ -0,0 +1,414 @@ + +body { + margin: 1em 44px 1em 1em; + line-height: 1.5em; + font-family: serif; + font-size: 1.2em; + color: black; +} + +body.ABC { + margin: 0.5em 0.5em 2em 0.5em; +} + +a { + text-decoration: none; +} + +img.gaiji { + height: 1em; +} + +img.cut { + max-height: 100px; + max-width: 600px; +} + +p { + margin: 0.5em 0 +} + +span[data-name="i"] { + font-style: italic; +} + +span[data-name="h1"] { + font-family: sans-serif; + font-size: 1em; + font-weight: bold; +} + +span[data-name="image"] { + display: block; +} + +span[data-name="ref"] a { + text-decoration: none; +} + +span[data-name="sl"] { + text-decoration: accent; +} + +span[data-name="sm"] { + font-size: 0.7em; +} + +span[data-name="small"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="sub"] { + font-size: 0.7em; + vertical-align: -0.35em; +} + +span[data-name="ty2"] span[data-name="sub"] { + vertical-align: 0em; +} + +span[data-name="ty2"] span[data-name="sup"] { + vertical-align: 0.5em; +} + +span[data-name="文語形"] { + display: block; +} + +span[data-name="用例"] { + display: block; +} + +span[data-name="補説G"] { + display: block; +} + +span[data-name="語義Gnum"] + span[data-name="補説G"] { + display: inline; +} + +span[data-name="アクセントG"] + span[data-name="補説G"] { + display: inline; +} + 
+span[data-name="補説G"] + span[data-name="語釈"] { + display: block; +} + +span[data-name="アクセントG"] { + font-size: 0.7em; + vertical-align: super; + margin-left: 0.25em; + margin-right: 0.25em; +} + +span[data-name="カット"] { + display: block; +} + +span[data-name="カットG"] { + display: block; + margin-top: 0.5em; + margin-bottom: 0.5em; + margin-left: 1em; +} + +span[data-name="キャプション"] { + display: block; +} + +span[data-name="ルビG"] { + font-family: sans-serif; + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; +} + +.warichu span[data-name="ルビG"] { + font-family: serif; + font-size: 0.5em; + font-weight: normal; + vertical-align: 0em; +} + +span[data-name="中語義"] { + display: block; +} + +span[data-name="付記"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="副義"] { + display: block; + margin-left: 1em; +} + +span[data-name="単位名"] { + font-size: 0.5em; +} + +span[data-name="原籍"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="句仮名"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="句項目"] { + margin-top: 0.5em; + margin-left: 1em; + display: block; +} + +span[data-name="和字"] { + font-family: sans-serif; +} + +span[data-name="品詞行"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="品詞用法"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="大語義"] { + display: block; +} + +span[data-name="大語義num"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + background-color: black; +} + +span[data-name="子項目"] { + display: block; + margin-top: 0.5em; + margin-left: 1em; +} + +span[data-name="慣用G"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="欧字"] { + font-family: sans-serif; +} + +span[data-name="歴史仮名"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="派生G"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="準大語義"] { + display: block; +} + +span[data-name="準大語義num"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border: solid 1px black; +} + +span[data-name="漢字音logo"] { + margin: 0.025em; + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border: solid 0.5px black; + border-radius: 1em; +} + +span[data-name="漢字音G"] { + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; +} + +span[data-name="生没年"] { + margin-left: 0.25em; + margin-right: 0.25em; + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="生没年"]:first-child { + margin-left: 0; +} + +span[data-name="用法"] { + font-size: 0.7em; + vertical-align: 0.35em; +} + +span[data-name="異字同訓"] { + display: block; + margin-top: 0.5em; +} + +span[data-name="異字同訓仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="異字同訓漢字"] { + font-family: serif; + font-weight: normal; +} + +span[data-name="異字同訓表記"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="異字同訓解説"] { + display: block; +} + +span[data-name="異字同訓語義G"] { + display: block; +} + +span[data-name="細義"] { + display: block; +} + +span[data-name="表外字マーク"] { + font-size: 0.5em; + vertical-align: 0.5em; +} + +span[data-name="見出仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="見出相当部"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="見出部"] { + display: block; +} + +span[data-name="解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="語義G"] { + display: block; +} + +span[data-name="語義区切"] { + font-size: 0.7em; + 
vertical-align: 0.35em; +} + +span[data-name="返り点"] { + font-size: 0.5em; + font-weight: normal; + vertical-align: 1em; +} + +span[data-name="返り点"].熟語記号 { + vertical-align: 0em; +} + +span[data-name="項目"] { + display: block; +} + +span[data-name="logo"] { + margin: 0.025em 0.25em; + padding: 0.1em; + font-size: 0.8em; + border: solid 1px black; + border-radius: 0.2em; +} + +.gothic { + font-family: sans-serif; + font-weight: bold; +} + +.warichu { + font-size: 1em; +} + +.refnum { + font-size: 0.7em; + vertical-align: 0.35em; +} + +#index { + display: none; +} + +span[data-name="歴史仮名"]:before, +span[data-name="ルビG"]:before, +span[data-name="品詞行"]:before, +span[data-name="原籍"]:before, +span[data-name="品詞用法"]:before, +span[data-name="付記"]:before { + content: "("; +} + +span[data-name="歴史仮名"]:after, +span[data-name="ルビG"]:after, +span[data-name="品詞行"]:after, +span[data-name="原籍"]:after, +span[data-name="品詞用法"]:after, +span[data-name="付記"]:after { + content: ")"; +} + +div[data-child-links] { + padding-top: 1em; +} + +div[data-child-links] ul { + margin: 0; + padding-left: 2em; +} + +div[data-child-links] span { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + border-width: 0.05em; + border-style: none; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.2em; +} + +div[data-child-links="子項目"] span { + background-color: rgb(153, 42, 103); +} + +div[data-child-links="句項目"] span { + background-color: rgb(176, 127, 57); +} diff --git a/data/mdict/css/jitenon-kokugo.css b/data/mdict/css/jitenon-kokugo.css new file mode 100644 index 0000000..687ae14 --- /dev/null +++ b/data/mdict/css/jitenon-kokugo.css @@ -0,0 +1,56 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +td ul { + margin: -0.1em 0em -0.1em -1em; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + +.num_icon { + font-family: sans-serif; + padding-left: 0.25em; + margin-right: 0.5em; + font-size: 0.8em; + word-break: keep-all; + color: white; + background-color: gray; + border-style: none; + -webkit-border-radius: 0.1em; +} + diff --git a/data/mdict/css/jitenon-kotowaza.css b/data/mdict/css/jitenon-kotowaza.css new file mode 100644 index 0000000..2dfb1be --- /dev/null +++ b/data/mdict/css/jitenon-kotowaza.css @@ -0,0 +1,40 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + diff --git a/data/mdict/css/jitenon-yoji.css b/data/mdict/css/jitenon-yoji.css new file mode 100644 index 0000000..2dfb1be --- /dev/null +++ b/data/mdict/css/jitenon-yoji.css @@ -0,0 +1,40 @@ + +body { + font-family: serif; + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-size: 1.2em; + color: black; +} + +table, th, td { + border: 1px 
solid; + border-collapse: collapse; + padding: 0.5em; +} + +th { + font-family: sans-serif; + color: black; + background-color: lightgray; + font-weight: normal; + white-space: nowrap; +} + +a { + text-decoration: none; +} + +.見出し { +} + +.読み方 { + font-family: sans-serif; + font-weight: bold; +} + +.意味 { + margin-left: 1.0em; + margin-bottom: 0.5em; +} + diff --git a/data/mdict/css/smk8.css b/data/mdict/css/smk8.css new file mode 100644 index 0000000..e88da1c --- /dev/null +++ b/data/mdict/css/smk8.css @@ -0,0 +1,449 @@ + +body { + margin: 1em 44px 1em 1.5em; + line-height: 1.5em; + font-family: serif; + font-size: 1.2em; + color: black; +} + +span[data-name="項目"] { + display: block; +} + +span[data-name="見出部"] { + display: block; +} + +span[data-name="見出部"].pri { + margin-left: -0.4em; +} + +span[data-name="見出仮名"] { + font-family: sans-serif; + font-weight: bold; +} + +rt[data-name="表音表記"] { + font-size: 0.65em; +} + +rt[data-name="表外音訓マーク"] { + font-size: 0.65em; +} + +rt[data-name="表外字マーク"] { + font-size: 0.65em; +} + +span[data-name="解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="大語義"] { + display: block; +} + +span[data-name="語義"] { + display: block; +} + +span[data-name="副義"] { + display: block; +} + +span[data-name="用例G"] { + display: block; +} + +span[data-name="注記"] span[data-name="用例G"] { + display: inline; +} + +span[data-name="用例"] { + display: block; +} + +span[data-name="注記"] span[data-name="用例"] { + display: inline; +} + +span[data-name="見出語省略"] { + margin-left: 0.125em; + margin-right: 0.125em; +} + +span[data-name="教育漢字"] { + color: green; +} + +span[data-name="ルビ"] { + font-size: 0.7em; + vertical-align: 0.5em; +} + +span[data-name="ルビ区切"] { + font-size: 0.7em; + vertical-align: 0.65em; +} + +span[data-name="名詞形G"] { + display: block; +} + +span[data-name="可能形G"] { + display: block; +} + +span[data-name="参照G"] { + display: block; +} + +span[data-name="参照"] { + color: blue; +} + +span[data-name="子項目"], +span[data-name="句項目"] { + display: block; + margin-bottom: 0.5em; +} + +span[data-name="子項目F"], +span[data-name="句項目F"] { + display: block; + margin-bottom: 0.5em; + margin-top: 0.5em; +} + +span[data-name="子見出部"] { + display: block; +} + +span[data-name="子解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="句見出部"] { + display: block; +} + +span[data-name="句解説部"] { + display: block; + margin-left: 1em; +} + +span[data-name="運用解説"] { + display: block; +} + +span[data-name="表記解説"] { + display: block; +} + +span[data-name="文法解説"] { + display: block; +} + +span[data-name="かぞえ方解説"] { + display: block; +} + +span[data-name="派生"] { + display: block; + margin-left: 1.25em; +} + +span[data-name="派生SubGF"] { + display: block; + text-indent: -1.25em; +} + +span[data-name="派生SubG"] { + display: block; +} + +span[data-name="派生SubGF"] span[data-name="用例G"] { + text-indent: 0; +} + +span[data-name="派生見出"] { + font-weight: bold; +} + +span[data-name="派生見出"].normal { + font-weight: normal +} + +span[data-name="造語成分項目"] { + display: block; + margin-top: 1em; +} + +span[data-name="造語成分見出"] { + font-size:1.4em; +} + +span[data-name="EM"] { + font-weight: bold; +} + +span[data-name="アクセント"] { + font-size: 0.7em; + vertical-align: super; +} + +span[data-name="アクセント組M"] { + vertical-align: 0.1em; +} + + +span[data-name="反意語M"], +span[data-name="同意語M"] { + vertical-align: 0.15em; +} + +span[data-name="B"] { + font-weight: bold; +} + +span[data-name="IT"] { + font-family: "Times New Roman"; + font-style: italic; +} + +span[data-name="EXCLAMATION"] { + 
font-family: "Times New Roman"; + font-style: italic; + font-size: 1.2em; +} + +span[data-name="歴史仮名"] { + font-family: serif; + font-size: 0.7em; + font-weight: normal; + vertical-align: 0.35em; + -webkit-user-select: nocopy; +} + +span[data-name="出現形"] { + font-weight: bold; +} + +span[data-name="品詞用法"] { + font-size: 0.7em; +} + +span[data-name="品詞用法"] span[data-name="品詞G"] { + font-size: 1.2em; +} + +span[data-name="基本構文型"] { + font-size: 0.8em; +} + +span[data-name="基本構文em"] { + font-weight: bold; +} + +span[data-name="ウ濁音参照"] { + font-family: sans-serif; + font-weight: bold; +} + +span[data-name="rect"] { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + border-width: 0.05em; + border-style: solid; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.1em; +} + +span[data-name="rect"].fill { + color: white; + border-style: none; + background-color: gray; +} + +span[data-name="rect"].red { + color: red; + border-color: red; +} + +span[data-name="rect"].redfill { + color: white; + border-style: none; + background-color: red; +} + +span[data-name="red"] { + color: red; +} + +span[data-name="大語義番号"], +span[data-name="語義番号"], +span[data-name="副義番号"] { + margin-right: 0.25em; + font-family: sans-serif; +} + +span[data-name="ref"] span[data-name="大語義番号"], +span[data-name="ref"] span[data-name="語義番号"], +span[data-name="ref"] span[data-name="副義番号"] { + font-size: 0.8em; + margin-right: 0; +} + +span[data-name="表外字マーク"] { + vertical-align: 0.5em; +} + +span[data-name="表外音訓マーク"] { + font-size: 0.5em; + vertical-align: 0.5em; +} + +span[data-name="言換M"] { + font-size: 0.5em; +} + +span[data-name="字音語参照項目"] { + display: block; +} + +span[data-name="本文項目M"] { + font-size: 0.7em; +} + +span[data-name="運用解説M"], +span[data-name="表記解説M"], +span[data-name="文法解説M"], +span[data-name="かぞえ方解説M"], +span[data-name="派生M"] { + margin-right: 0.25em; + font-family: sans-serif; +} + +span[data-name="派生ロゴ"] { + margin-left: 0.1em; + margin-right: 0.1em; +} + +span[data-name="文字"] { + margin: 0 0.2em; +} + +span[data-name="二分"] { + font-size: 0.5em; +} + +span[data-name="四分"] { + font-size: 0.25em; +} + +span[data-name="ref"] { + margin-left: 0.1em; + margin-right: 0.1em; +} + +span[data-name="ref-small"] { + font-size: 0.7em; +} + +span[data-name="sup"] { + font-size: 0.6em; +} + +span[data-name="外字"] img { + height: 1em; +} + +img.audio { + height: 1em; + margin: 0 0.25em; +} + +img.外字 { + height: 1em; +} + +img.外字欧 { + height: 1em; +} + +span[data-name="レ点M"] { + font-size: 0.6em; + vertical-align: -0.7em; +} + +a { + text-decoration: none; +} + +span[data-name="audio"] a { + padding-bottom: 0; + border-bottom: none; +} + +span[data-name="アクセント"] a, +span[data-name="古語M"] a, +span[data-name="雅語M"] a, +span[data-name="派生M"] a, +span[data-name="原籍M"] a, +span[data-name="品詞M"] a { + color: black; + border-bottom-style: none; +} + + +span[data-name="歴史仮名"]:before, +span[data-name="ルビ"]:before { + content: "("; +} + +span[data-name="歴史仮名"]:after, +span[data-name="ルビ"]:after { + content: ")"; +} + +div[data-child-links] { + padding-top: 1em; +} + +div[data-child-links] ul { + margin: 0; + padding-left: 2em; +} + +div[data-child-links] span { + padding: 0.1em; + font-family: sans-serif; + font-size: 0.8em; + color: white; + border-width: 0.05em; + border-style: none; + border-color: black; + word-break: keep-all; + -webkit-border-radius: 0.2em; +} + +div[data-child-links="子項目"] span { + background-color: rgb(153, 42, 103); +} + +div[data-child-links="句項目"] span { + background-color: 
rgb(176, 127, 57); +} + +span.pri > span.外字 { + font-size: 0.65em; + vertical-align: super; +} + + + diff --git a/data/mdict/description/daijirin2.mdx.description.html b/data/mdict/description/daijirin2.mdx.description.html new file mode 100644 index 0000000..c1eb401 --- /dev/null +++ b/data/mdict/description/daijirin2.mdx.description.html @@ -0,0 +1,7 @@ +大辞林 第四版 +
+<br>
+    https://www.monokakido.jp/ja/dictionaries/daijirin2/index.html
+<br>
+    {{revision}}
+<br>
+    {{attribution}}
diff --git a/data/mdict/description/jitenon-kokugo.mdx.description.html b/data/mdict/description/jitenon-kokugo.mdx.description.html
new file mode 100644
index 0000000..a1c7489
--- /dev/null
+++ b/data/mdict/description/jitenon-kokugo.mdx.description.html
@@ -0,0 +1,7 @@
+国語辞典オンライン
+<br>
+    https://kokugo.jitenon.jp/
+<br>
+    {{revision}}
+<br>
+    {{attribution}}
diff --git a/data/mdict/description/jitenon-kotowaza.mdx.description.html b/data/mdict/description/jitenon-kotowaza.mdx.description.html
new file mode 100644
index 0000000..b6d3c99
--- /dev/null
+++ b/data/mdict/description/jitenon-kotowaza.mdx.description.html
@@ -0,0 +1,7 @@
+故事・ことわざ・慣用句オンライン
+<br>
+    https://kotowaza.jitenon.jp/
+<br>
+    {{revision}}
+<br>
+    {{attribution}}
diff --git a/data/mdict/description/jitenon-yoji.mdx.description.html b/data/mdict/description/jitenon-yoji.mdx.description.html
new file mode 100644
index 0000000..d7e3729
--- /dev/null
+++ b/data/mdict/description/jitenon-yoji.mdx.description.html
@@ -0,0 +1,7 @@
+四字熟語辞典オンライン
+<br>
+    https://yoji.jitenon.jp/
+<br>
+    {{revision}}
+<br>
+    {{attribution}}
diff --git a/data/mdict/description/smk8.mdx.description.html b/data/mdict/description/smk8.mdx.description.html
new file mode 100644
index 0000000..7486250
--- /dev/null
+++ b/data/mdict/description/smk8.mdx.description.html
@@ -0,0 +1,7 @@
+新明解国語辞典 第八版
+<br>
+    https://www.monokakido.jp/ja/dictionaries/smk8/index.html
+<br>
+    {{revision}}
+<br>
    +{{attribution}} diff --git a/data/mdict/icon/jitenon-kokugo.png b/data/mdict/icon/jitenon-kokugo.png new file mode 100644 index 0000000..1ef1eb3 Binary files /dev/null and b/data/mdict/icon/jitenon-kokugo.png differ diff --git a/data/mdict/icon/jitenon-kotowaza.png b/data/mdict/icon/jitenon-kotowaza.png new file mode 100644 index 0000000..15ccb92 Binary files /dev/null and b/data/mdict/icon/jitenon-kotowaza.png differ diff --git a/data/mdict/icon/jitenon-yoji.png b/data/mdict/icon/jitenon-yoji.png new file mode 100644 index 0000000..0603db0 Binary files /dev/null and b/data/mdict/icon/jitenon-yoji.png differ diff --git a/data/mdict/title/daijirin2.mdx.title.html b/data/mdict/title/daijirin2.mdx.title.html new file mode 100644 index 0000000..43fdfd7 --- /dev/null +++ b/data/mdict/title/daijirin2.mdx.title.html @@ -0,0 +1 @@ +大辞林 第四版 diff --git a/data/mdict/title/jitenon-kokugo.mdx.title.html b/data/mdict/title/jitenon-kokugo.mdx.title.html new file mode 100644 index 0000000..3fee892 --- /dev/null +++ b/data/mdict/title/jitenon-kokugo.mdx.title.html @@ -0,0 +1 @@ +国語辞典オンライン diff --git a/data/mdict/title/jitenon-kotowaza.mdx.title.html b/data/mdict/title/jitenon-kotowaza.mdx.title.html new file mode 100644 index 0000000..438ffaf --- /dev/null +++ b/data/mdict/title/jitenon-kotowaza.mdx.title.html @@ -0,0 +1 @@ +故事・ことわざ・慣用句オンライン diff --git a/data/mdict/title/jitenon-yoji.mdx.title.html b/data/mdict/title/jitenon-yoji.mdx.title.html new file mode 100644 index 0000000..456dc99 --- /dev/null +++ b/data/mdict/title/jitenon-yoji.mdx.title.html @@ -0,0 +1 @@ +四字熟語辞典オンライン diff --git a/data/mdict/title/smk8.mdx.title.html b/data/mdict/title/smk8.mdx.title.html new file mode 100644 index 0000000..9f41cd1 --- /dev/null +++ b/data/mdict/title/smk8.mdx.title.html @@ -0,0 +1 @@ +新明解国語辞典 第八版 diff --git a/data/smk8/mdict_name_conversion.json b/data/smk8/mdict_name_conversion.json new file mode 100644 index 0000000..b9a4387 --- /dev/null +++ b/data/smk8/mdict_name_conversion.json @@ -0,0 +1,25 @@ +{ + "a": {}, + "br": {}, + "img": {}, + "div": {}, + "span": {}, + "表外字": { + "name": "ruby" + }, + "表外字マーク": { + "name": "rt" + }, + "表外音訓": { + "name": "ruby" + }, + "表外音訓マーク": { + "name": "rt" + }, + "表音式": { + "name": "ruby" + }, + "表音表記": { + "name": "rt" + } +} diff --git a/data/smk8/yomichan_name_conversion.json b/data/smk8/yomichan_name_conversion.json index 82c491f..91a6593 100644 --- a/data/smk8/yomichan_name_conversion.json +++ b/data/smk8/yomichan_name_conversion.json @@ -121,25 +121,31 @@ "style": "font-weight: bold;" }, "表外字": { - "name": "ruby", + "name": "ruby" + }, + "表外字マーク": { + "name": "rt", "procedures": [ { - "procedure_name": "add_ruby_text", + "procedure_name": "insert_span", "parameters": { - "mark": "︿", - "style": "font-size: 2em;" + "attr_name": "style", + "attr_val": "font-size: 2em;" } } ] }, "表外音訓": { - "name": "ruby", + "name": "ruby" + }, + "表外音訓マーク": { + "name": "rt", "procedures": [ { - "procedure_name": "add_ruby_text", + "procedure_name": "insert_span", "parameters": { - "mark": "︽", - "style": "font-size: 2em;" + "attr_name": "style", + "attr_val": "font-size: 2em;" } } ] @@ -148,23 +154,7 @@ "name": "ruby" }, "表音表記": { - "name": "rt", - "procedures": [ - { - "procedure_name": "replace", - "parameters": { - "old": "(", - "new": "" - } - }, - { - "procedure_name": "replace", - "parameters": { - "old": ")", - "new": "" - } - } - ] + "name": "rt" }, "派生見出": { "name": "span", diff --git a/jitenbot.py b/jitenbot.py index 0a25b96..e988df9 100644 --- a/jitenbot.py +++ 
b/jitenbot.py @@ -17,11 +17,22 @@ along with this program. If not, see . """ import os +import sys import argparse +import subprocess from bot.targets import Targets from bot.crawlers.factory import new_crawler +def filename(f): + if not os.path.isfile(f): + raise argparse.ArgumentTypeError(f"`{f}` is not a valid filename") + elif not os.access(f, os.R_OK): + raise argparse.ArgumentTypeError(f"Cannot access file `{f}`") + else: + return f + + def directory(d): if not os.path.isdir(d): raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory") @@ -35,34 +46,71 @@ def parse_args(target_names): parser = argparse.ArgumentParser( prog="jitenbot", description="Convert Japanese dictionary files to new formats.", + epilog="See README.md for details regarding media directory structures", ) parser.add_argument( "target", choices=target_names, - help="name of dictionary to convert" + help="name of dictionary to convert", ) parser.add_argument( "-p", "--page-dir", help="path to directory containing XML page files", - type=directory + type=directory, ) parser.add_argument( - "-i", "--image-dir", - help="path to directory containing image folders (gaiji, graphics, etc.)", - type=directory + "-m", "--media-dir", + help="path to directory containing media folders (gaiji, graphics, audio, etc.)", + type=directory, + ) + parser.add_argument( + "-i", "--mdict-icon", + help="path to icon file to be used with MDict", + type=filename, + ) + parser.add_argument( + "--no-yomichan-export", + help="skip export of dictionary data to Yomichan format", + action='store_true', + ) + parser.add_argument( + "--no-mdict-export", + help="skip export of dictionary data to MDict format", + action='store_true', ) args = parser.parse_args() return args +def test_mdict(): + try: + subprocess.run( + ["mdict", "--version"], + check=True, + stdout=subprocess.DEVNULL, + ) + except FileNotFoundError: + print("Could not find `mdict` pack tool.") + print("Ensure that mdict-utils is installed and") + print("included in the environment PATH.\n") + print("Mdict export functionality may also be") + print("disabled with the --no-mdict-export flag.") + sys.exit() + + def main(): target_names = [x.value for x in Targets] args = parse_args(target_names) + if not args.no_mdict_export: + test_mdict() selected_target = Targets(args.target) crawler = new_crawler(selected_target) crawler.collect_pages(args.page_dir) crawler.read_pages() - crawler.make_yomichan_dictionary(args.image_dir) + if not args.no_yomichan_export: + crawler.make_yomichan_dictionary(args.media_dir) + if not args.no_mdict_export: + crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 1c111af..8802356 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ css-parser==1.0.8 html5lib==1.1 idna==3.4 lxml==4.9.2 +mdict-utils==1.3.12 Pillow==9.5.0 platformdirs==3.5.0 requests==2.29.0 @@ -13,5 +14,7 @@ six==1.16.0 soupsieve==2.4.1 SudachiDict-full==20230110 SudachiPy==0.6.7 +tqdm==4.65.0 urllib3==1.26.15 webencodings==0.5.1 +xxhash==3.2.0 diff --git a/run_all.sh b/run_all.sh new file mode 100644 index 0000000..2bdd31e --- /dev/null +++ b/run_all.sh @@ -0,0 +1,13 @@ +python jitenbot.py jitenon-kokugo +python jitenbot.py jitenon-yoji +python jitenbot.py jitenon-kotowaza + +python jitenbot.py smk8 \ + --media-dir monokakido/SMK8/media \ + --page-dir monokakido/SMK8/pages \ + --mdict-icon monokakido/SMK8/SMK8-76@3x.png + +python jitenbot.py daijirin2 \ + --media-dir 
monokakido/DAIJIRIN2/media \ + --page-dir monokakido/DAIJIRIN2/pages \ + --mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png