Add export support for the MDict dictionary format

This commit is contained in:
stephenmk 2023-07-08 16:49:03 -05:00
parent e4a2e75d82
commit 4c837cd72d
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
53 changed files with 2227 additions and 269 deletions

1
.gitignore vendored
View file

@ -1,6 +1,7 @@
webcache/
output/
notes/
monokakido/
# Byte-compiled / optimized / DLL files
__pycache__/

View file

@ -49,7 +49,8 @@ compiling the scraped data into compact dictionary file formats.
# Usage
```
usage: jitenbot [-h] [-p PAGE_DIR] [-i IMAGE_DIR]
usage: jitenbot [-h] [-p PAGE_DIR] [-m MEDIA_DIR] [-i MDICT_ICON]
[--no-yomichan-export] [--no-mdict-export]
{jitenon-kokugo,jitenon-yoji,jitenon-kotowaza,smk8,daijirin2}
Convert Japanese dictionary files to new formats.
@ -62,9 +63,15 @@ options:
-h, --help show this help message and exit
-p PAGE_DIR, --page-dir PAGE_DIR
path to directory containing XML page files
-i IMAGE_DIR, --image-dir IMAGE_DIR
path to directory containing image folders (gaiji,
graphics, etc.)
-m MEDIA_DIR, --media-dir MEDIA_DIR
path to directory containing media folders (gaiji,
graphics, audio, etc.)
-i MDICT_ICON, --mdict-icon MDICT_ICON
path to icon file to be used with MDict
--no-yomichan-export skip export of dictionary data to Yomichan format
--no-mdict-export skip export of dictionary data to MDict format
See README.md for details regarding media directory structures
```
### Online Targets
Jitenbot will scrape the target website and save the pages to the [user cache directory](https://pypi.org/project/platformdirs/).
@ -75,8 +82,55 @@ HTTP request headers (user agent string, etc.) may be customized by editing the
[user config directory](https://pypi.org/project/platformdirs/).
### Offline Targets
Page data and image data must be procured by the user
Page data and media data must be [procured by the user](https://github.com/golddranks/monokakido/)
and passed to jitenbot via the appropriate command line flags.
<details>
<summary>smk8 media directory</summary>
Since Yomichan does not support audio files from imported
dictionaries, the `audio/` directory may be omitted to reduce the
size of the output ZIP file if desired.
```
media
├── Audio.png
├── audio
│   ├── 00001.aac
│   ├── 00002.aac
│   ├── 00003.aac
│   │  ...
│   └── 82682.aac
└── gaiji
├── 1d110.svg
├── 1d15d.svg
├── 1d15e.svg
   │  ...
└── xbunnoa.svg
```
</details>
<details>
<summary>daijirin2 media directory</summary>
The `graphics/` directory may be omitted to save space if desired.
```
media
├── gaiji
│   ├── 1D10B.svg
│   ├── 1D110.svg
│   ├── 1D12A.svg
│   │  ...
│   └── vectorOB.svg
└── graphics
├── 3djr_0002.png
├── 3djr_0004.png
├── 3djr_0005.png
   │  ...
└── 4djr_yahazu.png
```
</details>
# Attribution
`Adobe-Japan1_sequences.txt` is provided by [The Adobe-Japan1-7 Character Collection](https://github.com/adobe-type-tools/Adobe-Japan1).

View file

@ -1,11 +1,11 @@
### Todo
- [x] Add factory classes to reduce the amount of class import statements
- [x] Support exporting to MDict (.MDX) dictionary format
- [ ] Add test suite
- [ ] Add documentation (docstrings, etc.)
- [ ] Validate JSON schema of Yomichan terms during export
- [ ] Add factory classes to reduce the amount of class import statements
- [ ] Add build scripts for producing program binaries
- [ ] Support exporting to MDict (.MDX) dictionary format
- [ ] Validate scraped webpages after downloading
- [ ] Log non-fatal failures to a log file instead of raising exceptions
- [ ] Support more dictionary websites

View file

@ -5,7 +5,8 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_exporter
from bot.yomichan.exporters.factory import new_yomi_exporter
from bot.mdict.exporters.factory import new_mdict_exporter
class Crawler(ABC):
@ -38,9 +39,13 @@ class Crawler(ABC):
self._entries.append(entry)
print()
def make_yomichan_dictionary(self, image_dir):
exporter = new_exporter(self._target)
exporter.export(self._entries, image_dir)
def make_yomichan_dictionary(self, media_dir):
exporter = new_yomi_exporter(self._target)
exporter.export(self._entries, media_dir)
def make_mdict_dictionary(self, media_dir, icon_file):
exporter = new_mdict_exporter(self._target)
exporter.export(self._entries, media_dir, icon_file)
def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link)
@ -142,10 +147,8 @@ class _MonokakidoCrawler(Crawler):
class Smk8Crawler(_MonokakidoCrawler):
def __init__(self, target):
super().__init__(target)
pass
class Daijirin2Crawler(_MonokakidoCrawler):
def __init__(self, target):
super().__init__(target)
pass

View file

@ -99,15 +99,15 @@ def load_daijirin2_kana_abbreviations():
@cache
def load_smk8_yomichan_name_conversion():
file_name = os.path.join("smk8", "yomichan_name_conversion.json")
def load_yomichan_name_conversion(target):
file_name = os.path.join(target.value, "yomichan_name_conversion.json")
data = __load_json(file_name)
return data
@cache
def load_daijirin2_yomichan_name_conversion():
file_name = os.path.join("daijirin2", "yomichan_name_conversion.json")
def load_mdict_name_conversion(target):
file_name = os.path.join(target.value, "mdict_name_conversion.json")
data = __load_json(file_name)
return data

View file

@ -1,4 +1,3 @@
import re
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
@ -10,19 +9,17 @@ from bot.entries.daijirin2_preprocess import preprocess_page
class _BaseDaijirin2Entry(Entry):
ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations()
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
@ -57,14 +54,7 @@ class _BaseDaijirin2Entry(Entry):
else:
self._part_of_speech_tags.append(pos)
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def _set_regular_headwords(self, soup):
def _get_regular_headwords(self, soup):
self._fill_alts(soup)
reading = soup.find("見出仮名").text
expressions = []
@ -78,10 +68,11 @@ class _BaseDaijirin2Entry(Entry):
expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0:
expressions.append(reading)
self._headwords = {reading: expressions}
headwords = {reading: expressions}
return headwords
def _set_variant_headwords(self):
for expressions in self._headwords.values():
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
@ -101,7 +92,7 @@ class _BaseDaijirin2Entry(Entry):
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id)
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
@ -122,6 +113,8 @@ class _BaseDaijirin2Entry(Entry):
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG"
@ -144,25 +137,26 @@ class _BaseDaijirin2Entry(Entry):
class Daijirin2Entry(_BaseDaijirin2Entry):
def __init__(self, page_id):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(entry_id)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None:
self._set_kanji_headwords(soup)
headwords = self._get_kanji_headwords(soup)
elif soup.find("略語G") is not None:
self._set_acronym_headwords(soup)
headwords = self._get_acronym_headwords(soup)
else:
self._set_regular_headwords(soup)
headwords = self._get_regular_headwords(soup)
return headwords
def _set_kanji_headwords(self, soup):
def _get_kanji_headwords(self, soup):
readings = []
for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text)
@ -172,11 +166,12 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expressions = []
for el in soup.find_all("漢字見出"):
expressions.append(el.text)
self._headwords = {}
headwords = {}
for reading in readings:
self._headwords[reading] = expressions
headwords[reading] = expressions
return headwords
def _set_acronym_headwords(self, soup):
def _get_acronym_headwords(self, soup):
expressions = []
for el in soup.find_all("略語"):
expression_parts = []
@ -184,29 +179,24 @@ class Daijirin2Entry(_BaseDaijirin2Entry):
expression_parts.append(part.text)
expression = "".join(expression_parts)
expressions.append(expression)
self._headwords = {"": expressions}
headwords = {"": expressions}
return headwords
class Daijirin2ChildEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._set_regular_headwords(soup)
headwords = self._get_regular_headwords(soup)
return headwords
class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
self.__phrase_readings = load_daijirin2_phrase_readings()
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
@ -217,7 +207,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
self._headwords = headwords
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
@ -231,7 +221,8 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
phrase_readings = load_daijirin2_phrase_readings()
text = phrase_readings[self.entry_id]
alternatives = Expressions.expand_daijirin_alternatives(text)
readings = []
for alt in alternatives:

View file

@ -2,12 +2,24 @@ from abc import ABC, abstractmethod
class Entry(ABC):
def __init__(self, entry_id):
ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, target, entry_id):
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.target = target
self.entry_id = entry_id
self._page = None
self._headwords = None
self._part_of_speech_tags = None
@abstractmethod
def get_global_identifier(self):
pass
@abstractmethod
def set_page(self, page):
pass
@ -16,14 +28,34 @@ class Entry(ABC):
def get_page_soup(self):
pass
@abstractmethod
def get_headwords(self):
if self._headwords is not None:
return self._headwords
headwords = self._get_headwords()
self._add_variant_expressions(headwords)
self._headwords = headwords
return headwords
@abstractmethod
def _get_headwords(self):
pass
@abstractmethod
def _add_variant_expressions(self, headwords):
pass
@abstractmethod
def get_part_of_speech_tags(self):
pass
def get_parent(self):
if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
else:
parent = None
return parent
def get_first_expression(self):
headwords = self.get_headwords()
expressions = next(iter(headwords.values()))

View file

@ -15,4 +15,4 @@ def new_entry(target, page_id):
Targets.SMK8: Smk8Entry,
Targets.DAIJIRIN2: Daijirin2Entry,
}
return entry_map[target](page_id)
return entry_map[target](target, page_id)

View file

@ -1,4 +1,5 @@
import re
from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
@ -7,18 +8,17 @@ import bot.entries.expressions as Expressions
class _JitenonEntry(Entry):
ID_TO_ENTRY = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.expression = ""
self.yomikata = ""
self.definition = ""
self.other_forms = []
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self._COLUMNS.values():
setattr(self, column[0], column[1])
def get_global_identifier(self):
return f"@{self.target.value}-{format(self.entry_id, '06')}"
def set_page(self, page):
soup = BeautifulSoup(page, features="html5lib")
@ -39,36 +39,33 @@ class _JitenonEntry(Entry):
soup = BeautifulSoup(self._page, "html5lib")
return soup
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_part_of_speech_tags(self):
# Jitenon doesn't have any
return []
def _set_headwords(self):
def _get_headwords(self):
headwords = {}
for yomikata in self._yomikatas():
headwords[yomikata] = [self.expression]
ikei_headwords = self._ikei_headwords()
for reading, expressions in ikei_headwords.items():
for reading in self._get_readings():
headwords[reading] = [self.expression]
other_form_headwords = self._other_form_headwords()
for reading, expressions in other_form_headwords.items():
if reading not in headwords:
headwords[reading] = []
for expression in expressions:
if expression not in headwords[reading]:
headwords[reading].append(expression)
self._headwords = headwords
return headwords
@abstractmethod
def _get_column_map(self):
pass
def __set_modified_date(self, page):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
if m is None:
return
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date
modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = modified_date
def __set_attribution(self, soup):
attribution = soup.find(class_="copyright")
@ -78,7 +75,8 @@ class _JitenonEntry(Entry):
self.attribution = ""
def __set_column(self, colname, colval):
attr_name = self._COLUMNS[colname][0]
column_map = self._get_column_map()
attr_name = column_map[colname]
attr_value = getattr(self, attr_name)
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
@ -88,7 +86,7 @@ class _JitenonEntry(Entry):
else:
attr_value.append(colval)
def _yomikatas(self):
def _get_readings(self):
yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m:
@ -109,20 +107,20 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""]
def _ikei_headwords(self):
ikei_headwords = {}
for val in self.ikei:
def _other_form_headwords(self):
other_form_headwords = {}
for val in self.other_forms:
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if not m:
print(f"Invalid 異形 format: {val}\n{self}\n")
continue
expression = m.group(1)
reading = m.group(2)
if reading not in ikei_headwords:
ikei_headwords[reading] = []
if expression not in ikei_headwords[reading]:
ikei_headwords[reading].append(expression)
return ikei_headwords
if reading not in other_form_headwords:
other_form_headwords[reading] = []
if expression not in other_form_headwords[reading]:
other_form_headwords[reading].append(expression)
return other_form_headwords
@staticmethod
def __clean_text(text):
@ -133,9 +131,10 @@ class _JitenonEntry(Entry):
return text
def __str__(self):
column_map = self._get_column_map()
colvals = [str(self.entry_id)]
for attr in self._COLUMNS.values():
attr_val = getattr(self, attr[0])
for attr_name in column_map.values():
attr_val = getattr(self, attr_name)
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
@ -144,83 +143,100 @@ class _JitenonEntry(Entry):
class JitenonYojiEntry(_JitenonEntry):
_COLUMNS = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.kanken_level = ""
self.category = ""
self.related_expressions = []
def _get_column_map(self):
return {
"四字熟語": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"漢検級": "kanken_level",
"場面用途": "category",
"類義語": "related_expressions",
}
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_variant_headwords(self):
for expressions in self._headwords.values():
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry):
_COLUMNS = {
"言葉": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"例文": ["reibun", ""],
"異形": ["ikei", []],
"類句": ["ruiku", []],
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.example = ""
self.related_expressions = []
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"例文": "example",
"類句": "related_expressions",
}
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
def _get_headwords(self):
if self.expression == "金棒引き・鉄棒引き":
self._headwords = {
headwords = {
"かなぼうひき": ["金棒引き", "鉄棒引き"]
}
else:
super()._set_headwords()
headwords = super()._get_headwords()
return headwords
def _set_variant_headwords(self):
for expressions in self._headwords.values():
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry):
_COLUMNS = {
"言葉": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"例文": ["reibun", ""],
"別表記": ["betsuhyouki", ""],
"対義語": ["taigigo", ""],
"活用": ["katsuyou", ""],
"用例": ["yourei", ""],
"類語": ["ruigo", ""],
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.example = ""
self.alt_expression = ""
self.antonym = ""
self.attachments = ""
self.compounds = ""
self.related_words = ""
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"例文": "example",
"別表記": "alt_expression",
"対義語": "antonym",
"活用": "attachments",
"用例": "compounds",
"類語": "related_words",
}
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
def _get_headwords(self):
headwords = {}
for reading in self.yomikata.split(""):
if reading not in headwords:
headwords[reading] = []
for expression in self.expression.split(""):
headwords[reading].append(expression)
if self.betsuhyouki.strip() != "":
for expression in self.betsuhyouki.split(""):
if self.alt_expression.strip() != "":
for expression in self.alt_expression.split(""):
headwords[reading].append(expression)
self._headwords = headwords
return headwords
def _set_variant_headwords(self):
for expressions in self._headwords.values():
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)

View file

@ -1,4 +1,3 @@
import re
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
@ -9,19 +8,17 @@ from bot.entries.smk8_preprocess import preprocess_page
class _BaseSmk8Entry(Entry):
ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self.kanjis = []
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
@ -30,13 +27,6 @@ class _BaseSmk8Entry(Entry):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
@ -50,8 +40,8 @@ class _BaseSmk8Entry(Entry):
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _set_variant_headwords(self):
for expressions in self._headwords.values():
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
@ -87,7 +77,7 @@ class _BaseSmk8Entry(Entry):
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id)
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
@ -106,6 +96,16 @@ class _BaseSmk8Entry(Entry):
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", "", " "]:
@ -114,24 +114,24 @@ class _BaseSmk8Entry(Entry):
@staticmethod
def _fill_alts(soup):
for e in soup.find_all(["親見出仮名", "親見出表記"]):
e.string = e.attrs["alt"]
for el in soup.find_all(["親見出仮名", "親見出表記"]):
el.string = el.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]
class Smk8Entry(_BaseSmk8Entry):
def __init__(self, page_id):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(entry_id)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記")
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
@ -140,16 +140,14 @@ class Smk8Entry(_BaseSmk8Entry):
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
self._headwords = {reading: expressions}
headwords = {reading: expressions}
return headwords
class Smk8ChildEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記")
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
@ -158,19 +156,20 @@ class Smk8ChildEntry(_BaseSmk8Entry):
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
self._headwords = {reading: expressions}
headwords = {reading: expressions}
return headwords
class Smk8PhraseEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.__phrase_readings = load_smk8_phrase_readings()
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
@ -181,10 +180,10 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
self._headwords = headwords
return headwords
def _find_expressions(self, soup):
Soup.delete_soup_nodes(soup, "ルビG")
self._delete_unused_nodes(soup)
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
@ -206,15 +205,14 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
class Smk8KanjiEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
self._headwords = {reading: expressions}
headwords = {reading: expressions}
return headwords
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]

View file

@ -15,6 +15,7 @@ def preprocess_page(page):
page = __strip_page(page)
page = __replace_glyph_codes(page)
page = __format_hyougai_marks(page)
page = __remove_pronunciation_parentheses(page)
return page
@ -64,6 +65,7 @@ def __format_hyougai_marks(page):
for x in ["\n", "\t", " "]:
text = text.replace(x, "")
text = re.sub(r"〈([^〈]+)〉", r"\1", text)
page = re.sub(r"〈([^〈]+)〉", r"\1␃", page)
for mark in re.findall(r"《.", text):
if mark[1] == "":
@ -79,13 +81,29 @@ def __format_hyougai_marks(page):
page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
r"\1<表外字>\2</表外字>",
page, count=1)
page = page.replace("", "")
page = page.replace("", "")
soup = BeautifulSoup(page, features="xml")
for el in soup.find_all("表外音訓"):
if el.text == "":
el.append(el.next_sibling)
mark_xml = "<表外音訓マーク>︽</表外音訓マーク>"
mark_soup = BeautifulSoup(mark_xml, "xml")
el.append(mark_soup.表外音訓マーク)
for el in soup.find_all("表外字"):
if el.text == "":
el.append(el.next_sibling)
mark_xml = "<表外字マーク>︿</表外字マーク>"
mark_soup = BeautifulSoup(mark_xml, "xml")
el.append(mark_soup.表外字マーク)
return soup.decode()
def __remove_pronunciation_parentheses(page):
page = page.replace("<表音表記>", "<表音表記>")
page = page.replace("</表音表記>", "</表音表記>")
return page

View file

@ -0,0 +1,204 @@
# pylint: disable=too-few-public-methods
import subprocess
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir
from bot.targets import Targets
from bot.mdict.terms.factory import new_terminator
class Exporter(ABC):
    """Base class for exporters that write MDict (.mdx/.mdd) files.

    Subclasses supply the revision and attribution strings that are
    substituted into the dictionary description. Intermediate files
    are staged in a build directory that is removed when the export
    finishes.
    """

    def __init__(self, target):
        self._target = target
        self._terminator = new_terminator(target)
        # Paths below are resolved lazily and cached on first use.
        self._build_dir = None
        self._build_media_dir = None
        self._description_file = None
        self._out_dir = None

    def export(self, entries, media_dir, icon_file):
        """Build the .mdx, .mdd and icon files for ``entries``.

        ``media_dir`` may be None, in which case only the CSS file is
        staged as media. ``icon_file`` may be None, in which case a
        premade icon is used when one exists.
        """
        try:
            self._init_build_media_dir(media_dir)
            self._init_description_file(entries)
            terms = self._get_terms(entries)
            print(f"Exporting {len(terms)} Mdict keys...")
            self._write_mdx_file(terms)
            self._write_mdd_file()
            self._write_icon_file(icon_file)
        finally:
            # The build directory holds a full copy of the media
            # files; remove it even when a step above fails. Skip the
            # cleanup if the directory was never created so that it is
            # not created just to be deleted.
            if self._build_dir is not None:
                self._rm_build_dir()

    def _get_build_dir(self):
        """Return the build directory, creating it on first use.

        Any directory already present at the build path is deleted
        before the new one is created.
        """
        if self._build_dir is not None:
            return self._build_dir
        cache_dir = user_cache_dir("jitenbot")
        build_directory = os.path.join(cache_dir, "mdict_build")
        if Path(build_directory).is_dir():
            shutil.rmtree(build_directory)
        os.makedirs(build_directory)
        self._build_dir = build_directory
        return self._build_dir

    def _init_build_media_dir(self, media_dir):
        """Stage media files and the CSS file inside the build directory."""
        build_dir = self._get_build_dir()
        build_media_dir = os.path.join(build_dir, self._target.value)
        if media_dir is not None:
            print("Copying media files to build directory...")
            shutil.copytree(media_dir, build_media_dir)
        else:
            os.makedirs(build_media_dir)
        css_file = self._get_css_file()
        shutil.copy(css_file, build_media_dir)
        self._terminator.set_media_dir(build_media_dir)
        self._build_media_dir = build_media_dir

    def _init_description_file(self, entries):
        """Write the description file with its placeholders filled in.

        The ``{{revision}}`` and ``{{attribution}}`` placeholders of
        the description template are replaced with the values provided
        by the subclass.
        """
        filename = f"{self._target.value}.mdx.description.html"
        original_file = os.path.join(
            "data", "mdict", "description", filename)
        with open(original_file, "r", encoding="utf8") as f:
            description = f.read()
        description = description.replace(
            "{{revision}}", self._get_revision(entries))
        description = description.replace(
            "{{attribution}}", self._get_attribution(entries))
        build_dir = self._get_build_dir()
        description_file = os.path.join(build_dir, filename)
        with open(description_file, "w", encoding="utf8") as f:
            f.write(description)
        self._description_file = description_file

    def _get_terms(self, entries):
        """Return the terms produced by the terminator for all entries."""
        terms = []
        entries_len = len(entries)
        for idx, entry in enumerate(entries):
            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
            terms.extend(self._terminator.make_terms(entry))
        print()
        return terms

    def _write_mdx_file(self, terms):
        """Run the external ``mdict`` command to write the .mdx file."""
        out_dir = self._get_out_dir()
        out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
        params = [
            "mdict",
            "-a", self._get_term_file(terms),
            "--title", self._get_title_file(),
            "--description", self._description_file,
            out_file
        ]
        subprocess.run(params, check=True)

    def _write_mdd_file(self):
        """Run the external ``mdict`` command to write the .mdd file."""
        out_dir = self._get_out_dir()
        out_file = os.path.join(out_dir, f"{self._target.value}.mdd")
        params = [
            "mdict",
            "-a", self._build_media_dir,
            "--title", self._get_title_file(),
            "--description", self._description_file,
            out_file
        ]
        subprocess.run(params, check=True)

    def _write_icon_file(self, icon_file):
        """Copy an icon into the output directory, if one is available.

        The user-supplied ``icon_file`` takes precedence over the
        premade icon. Nothing is written when neither file exists.
        """
        premade_icon_file = f"data/mdict/icon/{self._target.value}.png"
        out_dir = self._get_out_dir()
        out_file = os.path.join(out_dir, f"{self._target.value}.png")
        if icon_file is not None and Path(icon_file).is_file():
            shutil.copy(icon_file, out_file)
        elif Path(premade_icon_file).is_file():
            shutil.copy(premade_icon_file, out_file)

    def _get_out_dir(self):
        """Return the output directory, creating it on first use.

        Any directory already present at the output path is deleted
        before the new one is created.
        """
        if self._out_dir is not None:
            return self._out_dir
        out_dir = os.path.join(
            user_documents_dir(), "jitenbot", "mdict", self._target.value)
        if Path(out_dir).is_dir():
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)
        self._out_dir = out_dir
        return out_dir

    def _get_term_file(self, terms):
        """Write ``terms`` to a text file in the build directory.

        Each term is a sequence of strings written one per line and
        followed by a ``</>`` separator line. Returns the file path.
        """
        build_dir = self._get_build_dir()
        term_file = os.path.join(build_dir, f"{self._target.value}.mdx.txt")
        with open(term_file, "w", encoding="utf8") as f:
            for term in terms:
                f.write("\n".join(term))
                f.write("\n</>\n")
        return term_file

    def _get_title_file(self):
        """Return the path of the title file for this target."""
        return os.path.join(
            "data", "mdict", "title",
            f"{self._target.value}.mdx.title.html")

    def _get_css_file(self):
        """Return the path of the CSS file for this target."""
        return os.path.join(
            "data", "mdict", "css",
            f"{self._target.value}.css")

    def _rm_build_dir(self):
        """Delete the build directory and everything inside it."""
        build_dir = self._get_build_dir()
        shutil.rmtree(build_dir)

    @abstractmethod
    def _get_revision(self, entries):
        """Return the string substituted for ``{{revision}}``."""

    @abstractmethod
    def _get_attribution(self, entries):
        """Return the string substituted for ``{{attribution}}``."""
class _JitenonExporter(Exporter):
    """Exporter base for the Jitenon targets.

    Revision and attribution are both taken from the entry with the
    most recent ``modified_date``.
    """

    @staticmethod
    def _latest_entry(entries):
        """Return the most recently modified entry, or None if empty.

        When several entries share the latest date, the first one in
        ``entries`` is returned.
        """
        return max(
            entries, key=lambda entry: entry.modified_date, default=None)

    def _get_revision(self, entries):
        """Return the latest modification date as a revision string.

        Returns an empty string when ``entries`` is empty.
        """
        latest = self._latest_entry(entries)
        if latest is None:
            return ""
        return latest.modified_date.strftime("%Y年%m月%d日閲覧")

    def _get_attribution(self, entries):
        """Return the attribution of the most recently modified entry.

        Returns an empty string when ``entries`` is empty.
        """
        latest = self._latest_entry(entries)
        if latest is None:
            return ""
        return latest.attribution
class JitenonKokugoExporter(_JitenonExporter):
    """MDict exporter for the jitenon-kokugo target."""
class JitenonYojiExporter(_JitenonExporter):
    """MDict exporter for the jitenon-yoji target."""
class JitenonKotowazaExporter(_JitenonExporter):
    """MDict exporter for the jitenon-kotowaza target."""
class _MonokakidoExporter(Exporter):
    """Exporter base for the Monokakido targets."""

    def _get_revision(self, entries):
        """Return today's date formatted as the revision string."""
        return datetime.now().strftime("%Y年%m月%d日作成")
class Smk8Exporter(_MonokakidoExporter):
    """MDict exporter for the smk8 target."""

    def _get_attribution(self, entries):
        """Return the fixed attribution string for this dictionary."""
        attribution = "© Sanseido Co., LTD. 2020"
        return attribution
class Daijirin2Exporter(_MonokakidoExporter):
    """MDict exporter for the daijirin2 target."""

    def _get_attribution(self, entries):
        """Return the fixed attribution string for this dictionary."""
        attribution = "© Sanseido Co., LTD. 2019"
        return attribution

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.mdict.exporters.export import JitenonKokugoExporter
from bot.mdict.exporters.export import JitenonYojiExporter
from bot.mdict.exporters.export import JitenonKotowazaExporter
from bot.mdict.exporters.export import Smk8Exporter
from bot.mdict.exporters.export import Daijirin2Exporter
def new_mdict_exporter(target):
    """Create the MDict exporter registered for ``target``.

    Raises:
        KeyError: if ``target`` has no registered exporter class.
    """
    exporter_class = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
    }[target]
    return exporter_class(target)

View file

@ -0,0 +1,77 @@
import re
import os
from functools import cache
from pathlib import Path
from bot.soup import delete_soup_nodes
from bot.data import load_mdict_name_conversion
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Convert an entry's page markup into the glossary HTML string.

    The soup from ``entry.get_page_soup()`` is modified in place by the
    helper steps below, its element names are converted using the MDict
    name-conversion table for ``entry.target``, and the first ``span``
    element is serialized as the result.
    """
    soup = entry.get_page_soup()
    __add_rubies(soup)
    __hyperlink_parent_expression(soup, entry)
    __delete_unused_nodes(soup, media_dir)
    __convert_links(soup, entry)
    convert_names(soup, load_mdict_name_conversion(entry.target))
    return soup.span.decode()
def __add_rubies(soup):
    """Turn 表外音訓 / 表外字 elements into ``ruby`` elements with an ``rt``.

    The 表外字マーク descendant of each element is renamed to ``rt`` and
    moved to the end of the ruby element.
    """
    for tag_name in ("表外音訓", "表外字"):
        for ruby_node in soup.find_all(tag_name):
            ruby_node.name = "ruby"
            mark = ruby_node.find("表外字マーク")
            mark.name = "rt"
            # The rt needs to be positioned after the base text.
            ruby_node.append(mark)
def __hyperlink_parent_expression(soup, entry):
    """Turn every 親表記 element into a link to the entry's parent.

    The parent entry is only looked up when at least one such element
    exists.
    """
    parent_refs = soup.find_all("親表記")
    if not parent_refs:
        return
    gid = entry.get_parent().get_global_identifier()
    href = f"entry://{gid}"
    for ref in parent_refs:
        ref.name = "a"
        ref.attrs["href"] = href
def __delete_unused_nodes(soup, media_dir):
    """Strip nodes that should not appear in the exported glossary.

    * カットG nodes are deleted when ``media_dir`` has no graphics directory.
    * A ``logo`` element is removed when its next sibling is named
      漢字見出G or 漢字音G.
    * Inside 漢字音G, strings matching ``""`` are replaced with ``""``.
    """
    if not __graphics_directory_exists(media_dir):
        delete_soup_nodes(soup, "カットG")
    for logo in soup.find_all("logo"):
        following = logo.next_sibling
        if following is not None and following.name in ("漢字見出G", "漢字音G"):
            logo.decompose()
    for group in soup.find_all("漢字音G"):
        for text_node in group.find_all(string=""):
            text_node.replace_with("")
@cache
def __graphics_directory_exists(media_dir):
    """Return True if ``media_dir`` contains a ``graphics`` directory.

    ``media_dir`` may be ``None`` (no media directory configured); that is
    reported as False instead of raising ``TypeError`` from
    ``os.path.join``. Results are cached per ``media_dir`` value.
    """
    if media_dir is None:
        return False
    path = os.path.join(media_dir, "graphics")
    return Path(path).is_dir()
def __convert_links(soup, entry):
    """Rewrite entry-ID hrefs of ``a`` elements as ``entry://`` links.

    Hrefs already starting with ``entry:`` or ``http(s):`` are kept.

    Raises:
        Exception: if an href matches none of the accepted formats.
    """
    for anchor in soup.find_all("a"):
        href = anchor.attrs["href"]
        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_entry_id = entry.id_string_to_entry_id(href)
            gid = entry.ID_TO_ENTRY[ref_entry_id].get_global_identifier()
            anchor.attrs["href"] = f"entry://{gid}"
            continue
        if re.match(r"^entry:", href) or re.match(r"^https?:[\w\W]*", href):
            continue  # already in a final form
        raise Exception(f"Invalid href format: {href}")

View file

@ -0,0 +1,141 @@
# pylint: disable=too-few-public-methods
import re
class JitenonGlossary():
    """HTML clean-up steps shared by the jitenon glossary makers.

    Subclasses set ``_id_pattern`` (a regex whose first group captures the
    numeric entry ID inside a link) and ``_expression_header`` (the text of
    the table header cell holding the headword) before using the helpers.
    """

    def __init__(self):
        self._expression_header = None
        self._id_pattern = None

    def _replace_punctuation(self, soup):
        """Run every text node of ``soup`` through the punctuation table."""
        # As written, both keys map to the empty string, so the characters
        # are removed from the text.
        punctuation = {
            "/": "",
            ",": "",
        }
        for text_node in soup.find_all(string=True):
            replaced = text_node.text
            for old, new in punctuation.items():
                replaced = replaced.replace(old, new)
            text_node.replace_with(replaced)

    def _add_internal_links(self, soup, entry):
        """Rewrite links to other dictionary entries as ``entry://`` links.

        Raises:
            Exception: if an href neither matches ``_id_pattern`` nor starts
                with ``http:``, ``https:`` or ``?``.
        """
        for anchor in soup.find_all("a"):
            href = anchor.attrs["href"]
            id_match = re.search(self._id_pattern, href)
            if id_match is None:
                if re.match(r"^(?:https?:|\?)[\w\W]*", href):
                    continue  # external or query link: left untouched
                raise Exception(f"Invalid href format: {href}")
            ref_entry = entry.ID_TO_ENTRY[int(id_match.group(1))]
            gid = ref_entry.get_global_identifier()
            anchor.attrs["href"] = f"entry://{gid}"

    def _decompose_table_rows(self, soup, entry):
        """Remove or relocate table rows according to their header text.

        The headword row is dropped, the 読み方 row is dropped when the
        reading is shown in the headword line, and the 意味 cell is moved to
        the top of ``body`` as a ``div``. The table itself is removed once
        no rows remain.
        """
        for row in soup.find_all("tr"):
            if row.find("th") is None:
                continue
            header = row.th.text
            if header == self._expression_header:
                row.decompose()
            elif header == "読み方":
                if self._do_display_yomikata_in_headword(entry):
                    row.decompose()
            elif header == "意味":
                definition = row.td
                definition.name = "div"
                definition.attrs["class"] = "意味"
                soup.body.insert(0, definition)
                row.decompose()
        if soup.find("tr") is None:
            soup.table.decompose()

    def _insert_headword_line(self, soup, entry):
        """Insert a 見出し ``div`` at the top of ``body``.

        The line holds the reading (when it qualifies for display) followed
        by the expression.
        """
        line = soup.new_tag("div")
        line.attrs["class"] = "見出し"
        if self._do_display_yomikata_in_headword(entry):
            reading_span = soup.new_tag("span")
            reading_span.attrs["class"] = "読み方"
            reading_span.string = entry.yomikata
            line.append(reading_span)
        expression_span = soup.new_tag("span")
        expression_span.attrs["class"] = self._expression_header
        expression_span.string = f"{entry.expression}"
        line.append(expression_span)
        soup.body.insert(0, line)

    def _do_display_yomikata_in_headword(self, entry):
        """Return True if the reading is short kana suitable for the headword.

        The reading must consist only of characters in the range ぁ-ヿ or
        the comma 、 and be at most 10 characters long.
        """
        is_kana = re.match(r"^[ぁ-ヿ、]+$", entry.yomikata) is not None
        return is_kana and len(entry.yomikata) <= 10
class JitenonKokugoGlossary(JitenonGlossary):
    """Glossary maker for pages linked under ``kokugo.jitenon.jp``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified ``body`` HTML of the cleaned-up entry page."""
        soup = entry.get_page_soup()
        self._remove_antonym_list_item(soup)
        self._replace_number_icons(soup, media_dir)
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return soup.body.prettify()

    def _remove_antonym_list_item(self, soup):
        """Delete list items whose text is exactly 対義語辞典."""
        for item in soup.find_all("li"):
            if item.text == "対義語辞典":
                item.decompose()

    def _replace_number_icons(self, soup, media_dir):
        """Replace each ``img`` with a ``span`` holding text from its ``alt``.

        ``media_dir`` is accepted but not used here.
        """
        for icon in soup.find_all("img"):
            alt = icon.attrs["alt"]
            # Raises AttributeError if the pattern is absent from the alt text.
            label = re.search(r"[-]+", alt).group(0)
            icon.name = "span"
            icon.string = label
            for attr_name in ("src", "alt"):
                del icon.attrs[attr_name]

    def _do_display_yomikata_in_headword(self, entry):
        """Return True if the reading is at most 10 characters long."""
        return len(entry.yomikata) <= 10
class JitenonYojiGlossary(JitenonGlossary):
    """Glossary maker for pages linked under ``yoji.jitenon.jp``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "四字熟語"
        self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified ``body`` HTML of the cleaned-up entry page.

        ``media_dir`` is accepted but not used here.
        """
        soup = entry.get_page_soup()
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return soup.body.prettify()
class JitenonKotowazaGlossary(JitenonGlossary):
    """Glossary maker for pages linked under ``kotowaza.jitenon.jp``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"

    def make_glossary(self, entry, media_dir):
        """Return the prettified ``body`` HTML of the cleaned-up entry page.

        ``media_dir`` is accepted but not used here.
        """
        soup = entry.get_page_soup()
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return soup.body.prettify()

View file

@ -0,0 +1,67 @@
import re
from bot.soup import delete_soup_nodes
from bot.data import load_mdict_name_conversion
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Convert an entry's page markup into the glossary HTML string.

    The soup from ``entry.get_page_soup()`` is modified in place by the
    helper steps below, its element names are converted using the MDict
    name-conversion table for ``entry.target``, and the first ``span``
    element is serialized as the result. ``media_dir`` is accepted but not
    used here.
    """
    soup = entry.get_page_soup()
    __fill_alts(soup, entry)
    __delete_unused_nodes(soup)
    __convert_links(soup, entry)
    __convert_priority_markers(soup)
    convert_names(soup, load_mdict_name_conversion(entry.target))
    return soup.span.decode()
def __fill_alts(soup, entry):
    """Turn 親見出仮名 / 親見出表記 elements into links to the parent entry.

    Each element becomes an ``a`` whose text is its former ``alt``
    attribute; the ``alt`` attribute itself is removed. The parent entry is
    only looked up when at least one such element exists.
    """
    names = ["親見出仮名", "親見出表記"]
    parent_refs = soup.find_all(names)
    if not parent_refs:
        return
    gid = entry.get_parent().get_global_identifier()
    for ref in parent_refs:
        ref.name = "a"
        ref.string = ref.attrs.pop("alt")
        ref.attrs["href"] = f"entry://{gid}"
def __delete_unused_nodes(soup):
    """Delete the element types that are not wanted in the glossary."""
    unused_names = ("連濁",)
    for unused_name in unused_names:
        delete_soup_nodes(soup, unused_name)
def __convert_links(soup, entry):
    """Normalize the href of every ``a`` element.

    * hrefs starting with ``$``: the link is unwrapped, keeping its content;
    * entry IDs: rewritten as ``entry://`` links;
    * ``.aac`` file names: rewritten as ``sound://audio/`` links;
    * ``entry:`` and ``http(s):`` hrefs: kept as they are.

    Raises:
        Exception: if an href matches none of the accepted formats.
    """
    for anchor in soup.find_all("a"):
        href = anchor.attrs["href"]
        if href.startswith("$"):
            anchor.unwrap()
            continue
        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_entry_id = entry.id_string_to_entry_id(href)
            gid = entry.ID_TO_ENTRY[ref_entry_id].get_global_identifier()
            anchor.attrs["href"] = f"entry://{gid}"
            continue
        if re.match(r"^[0-9]+[ab]?\.aac$", href):
            anchor.attrs["href"] = f"sound://audio/{href}"
            continue
        if re.match(r"^entry:", href) or re.match(r"^https?:[\w\W]*", href):
            continue  # already in a final form
        raise Exception(f"Invalid href format: {href}")
def __convert_priority_markers(soup):
    """Replace marker images with ``span`` elements.

    Images whose ``alt`` is ``"*"`` are converted first, then images whose
    ``alt`` is ``""``. As written, the resulting spans get an empty string
    as their content.
    """
    for alt_value in ("*", ""):
        for marker in soup.find_all("img", attrs={"alt": alt_value}):
            marker.name = "span"
            marker.string = ""

View file

@ -0,0 +1,23 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.daijirin2 import make_glossary
class Daijirin2Terminator(Terminator):
    """Terminator that builds glossaries with the imported ``make_glossary``."""

    def _glossary(self, entry):
        """Return the glossary for ``entry``, building it at most once per ID."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return ``[subentries, list title]`` pairs for the link lists."""
        children_links = [entry.children, "子項目"]
        phrase_links = [entry.phrases, "句項目"]
        return [children_links, phrase_links]

    def _subentry_lists(self, entry):
        """Return the subentry lists that get term records of their own."""
        return [entry.children, entry.phrases]

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.mdict.terms.jitenon import JitenonKokugoTerminator
from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
def new_terminator(target):
    """Create the terminator registered for ``target``.

    Raises:
        KeyError: if ``target`` has no registered terminator class.
    """
    terminator_class = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
    }[target]
    return terminator_class(target)

View file

@ -0,0 +1,42 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
    """Terminator base that delegates glossary creation to a maker object.

    Subclasses assign ``_glossary_maker``. No subentries or link lists are
    reported.
    """

    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _glossary(self, entry):
        """Return the glossary for ``entry``, building it at most once per ID."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = self._glossary_maker.make_glossary(
                entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return an empty list: no link lists are produced."""
        return []

    def _subentry_lists(self, entry):
        """Return an empty list: no subentries are processed."""
        return []
class JitenonKokugoTerminator(JitenonTerminator):
    """Terminator whose glossaries come from ``JitenonKokugoGlossary``."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonKokugoGlossary()
        self._glossary_maker = maker
class JitenonYojiTerminator(JitenonTerminator):
    """Terminator whose glossaries come from ``JitenonYojiGlossary``."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonYojiGlossary()
        self._glossary_maker = maker
class JitenonKotowazaTerminator(JitenonTerminator):
    """Terminator whose glossaries come from ``JitenonKotowazaGlossary``."""

    def __init__(self, target):
        super().__init__(target)
        maker = JitenonKotowazaGlossary()
        self._glossary_maker = maker

24
bot/mdict/terms/smk8.py Normal file
View file

@ -0,0 +1,24 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.smk8 import make_glossary
class Smk8Terminator(Terminator):
    """Terminator that builds glossaries with the imported ``make_glossary``."""

    def _glossary(self, entry):
        """Return the glossary for ``entry``, building it at most once per ID."""
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        """Return ``[subentries, list title]`` pairs for the link lists."""
        children_links = [entry.children, "子項目"]
        phrase_links = [entry.phrases, "句項目"]
        return [children_links, phrase_links]

    def _subentry_lists(self, entry):
        """Return the subentry lists that get term records of their own.

        Kanji subentries get records but no link list.
        """
        return [entry.children, entry.phrases, entry.kanjis]

View file

@ -0,0 +1,73 @@
from abc import abstractmethod, ABC
class Terminator(ABC):
    """Base class that turns dictionary entries into term records.

    A term record is a two-item list: ``[key, content]``. Subclasses provide
    the glossary HTML and say which subentries to link to and recurse into.
    """

    def __init__(self, target):
        self._target = target
        self._glossary_cache = {}
        self._media_dir = None

    def set_media_dir(self, media_dir):
        """Remember the media directory passed on to glossary creation."""
        self._media_dir = media_dir

    def make_terms(self, entry):
        """Return the term records for ``entry`` and its subentries.

        The first record maps the entry's global identifier to its full
        glossary. Each non-blank headword key then gets a record whose
        content is ``@@@LINK=<gid>``. Records of the subentries returned by
        ``_subentry_lists`` follow, produced recursively.
        """
        gid = entry.get_global_identifier()
        terms = [[gid, self.__full_glossary(entry)]]
        keys = set()
        for reading, expressions in entry.get_headwords().items():
            if len(expressions) == 0:
                keys.add(reading)
            for expression in expressions:
                if not expression.strip():
                    # Blank expression: index the entry under its reading.
                    keys.add(reading)
                    continue
                keys.add(expression)
                if reading.strip():
                    combined = (
                        reading if reading == expression
                        else f"{reading}{expression}"
                    )
                    keys.add(combined)
        link = f"@@@LINK={gid}"
        terms.extend([key, link] for key in keys if key.strip() != "")
        for subentries in self._subentry_lists(entry):
            for subentry in subentries:
                terms.extend(self.make_terms(subentry))
        return terms

    def __full_glossary(self, entry):
        """Return the stylesheet link, the glossary and any subentry link lists.

        The parts are joined with newlines. Link lists with no subentries
        are left out.
        """
        parts = [
            f"<link rel='stylesheet' href='{self._target.value}.css' type='text/css'>",
            self._glossary(entry),
        ]
        for subentries, list_title in self._link_glossary_parameters(entry):
            if len(subentries) == 0:
                continue
            items = []
            for subentry in subentries:
                exp = subentry.get_first_expression()
                gid = subentry.get_global_identifier()
                items.append(f"<li><a href='entry://{gid}'>{exp}</a></li>")
            parts.append(
                f"<div data-child-links='{list_title}'><span>{list_title}</span><ul>{''.join(items)}</ul></div>"
            )
        return "\n".join(parts)

    @abstractmethod
    def _glossary(self, entry):
        """Return the glossary HTML for ``entry`` (implemented by subclasses)."""

    @abstractmethod
    def _link_glossary_parameters(self, entry):
        """Return ``[subentries, list title]`` pairs (implemented by subclasses)."""

    @abstractmethod
    def _subentry_lists(self, entry):
        """Return the subentry lists to recurse into (implemented by subclasses)."""

View file

@ -30,7 +30,7 @@ def __apply_name_conversion_procedures(soup, procedures):
"has_previous_sibling": __has_previous_sibling,
"replace": __replace,
"wrap": __wrap,
"add_ruby_text": __add_ruby_text,
"insert_span": __insert_span,
}
for procedure in procedures:
function = functions[procedure["procedure_name"]]
@ -92,10 +92,9 @@ def __wrap(soup, l_wrap, r_wrap):
soup.string = f"{l_wrap}{soup.text}{r_wrap}"
def __add_ruby_text(soup, mark, style):
if style.strip() != "":
markup = f"<rt><span style='{style}'>{mark}</span></rt>"
else:
markup = f"<rt>{mark}</rt>"
rt_soup = BeautifulSoup(markup, "xml")
soup.append(rt_soup.rt)
def __insert_span(soup, attr_name, attr_val):
span_markup = f"<span {attr_name}='{attr_val}'></span>"
span_soup = BeautifulSoup(span_markup, "xml")
for content in reversed(soup.contents):
span_soup.span.insert(0, content.extract())
soup.append(span_soup.span)

View file

@ -1,15 +1,18 @@
# pylint: disable=too-few-public-methods
import json
import os
import shutil
from pathlib import Path
from datetime import datetime
from abc import ABC, abstractmethod
from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator
class Exporter:
class Exporter(ABC):
def __init__(self, target):
self._target = target
self._terminator = new_terminator(target)
@ -26,6 +29,14 @@ class Exporter:
terms = self.__get_terms(entries)
self.__make_dictionary(terms, index, tags)
@abstractmethod
def _get_revision(self, entries):
pass
@abstractmethod
def _get_attribution(self, entries):
pass
def _get_build_dir(self):
if self._build_dir is not None:
return self._build_dir
@ -41,7 +52,7 @@ class Exporter:
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None:
print("Copying image files to build directory...")
print("Copying media files to build directory...")
shutil.copytree(image_dir, build_img_dir)
else:
os.makedirs(build_img_dir)
@ -93,7 +104,7 @@ class Exporter:
def __write_archive(self, filename):
archive_format = "zip"
out_dir = os.path.join(user_documents_dir(), "jitenbot")
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir():
os.makedirs(out_dir)
out_file = f"{filename}.{archive_format}"
@ -110,10 +121,7 @@ class Exporter:
shutil.rmtree(build_dir)
class JitenonExporter(Exporter):
def __init__(self, target):
super().__init__(target)
class _JitenonExporter(Exporter):
def _get_revision(self, entries):
modified_date = None
for entry in entries:
@ -130,25 +138,19 @@ class JitenonExporter(Exporter):
return attribution
class JitenonKokugoExporter(JitenonExporter):
def __init__(self, target):
super().__init__(target)
class JitenonKokugoExporter(_JitenonExporter):
pass
class JitenonYojiExporter(JitenonExporter):
def __init__(self, target):
super().__init__(target)
class JitenonYojiExporter(_JitenonExporter):
pass
class JitenonKotowazaExporter(JitenonExporter):
def __init__(self, target):
super().__init__(target)
class JitenonKotowazaExporter(_JitenonExporter):
pass
class Smk8Exporter(Exporter):
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"
@ -158,9 +160,6 @@ class Smk8Exporter(Exporter):
class Daijirin2Exporter(Exporter):
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"

View file

@ -7,7 +7,7 @@ from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
def new_exporter(target):
def new_yomi_exporter(target):
exporter_map = {
Targets.JITENON_KOKUGO: JitenonKokugoExporter,
Targets.JITENON_YOJI: JitenonYojiExporter,

View file

@ -6,9 +6,9 @@ from pathlib import Path
import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_daijirin2_yomichan_name_conversion
from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names
from bot.name_conversion import convert_names
def make_glossary(entry, image_dir):
@ -26,7 +26,7 @@ def make_glossary(entry, image_dir):
__convert_daigoginum(soup, image_dir)
__convert_jundaigoginum(soup, image_dir)
name_conversion = load_daijirin2_yomichan_name_conversion()
name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)

View file

@ -58,9 +58,9 @@ class JitenonGlossary():
if self._do_display_yomikata_in_headword(entry):
tr.decompose()
elif tr.th.text == "意味":
imi = tr.td
imi.name = "div"
soup.body.insert(0, imi)
definition = tr.td
definition.name = "div"
soup.body.insert(0, definition)
tr.decompose()
if soup.find("tr") is None:
soup.table.decompose()

View file

@ -4,9 +4,9 @@ from bs4 import BeautifulSoup
import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_smk8_yomichan_name_conversion
from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names
from bot.name_conversion import convert_names
def make_glossary(entry, image_dir):
@ -20,7 +20,7 @@ def make_glossary(entry, image_dir):
__convert_gaiji(soup, image_dir)
__convert_rectangles(soup, image_dir)
name_conversion = load_smk8_yomichan_name_conversion()
name_conversion = load_yomichan_name_conversion(entry.target)
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)

View file

@ -9,6 +9,7 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = None
def _definition_tags(self, entry):
return None
@ -51,7 +52,7 @@ class JitenonYojiTerminator(JitenonTerminator):
return ""
def _term_tags(self, entry):
tags = entry.kankenkyuu.split("/")
tags = entry.kanken_level.split("/")
return " ".join(tags)

View file

@ -1,7 +1,8 @@
from abc import abstractmethod, ABC
from bot.data import load_yomichan_inflection_categories
class Terminator:
class Terminator(ABC):
def __init__(self, target):
self._target = target
self._glossary_cache = {}
@ -62,3 +63,31 @@ class Terminator:
}
glossary.append(gloss)
return glossary
@abstractmethod
def _definition_tags(self, entry):
pass
@abstractmethod
def _inflection_rules(self, entry, expression):
pass
@abstractmethod
def _glossary(self, entry):
pass
@abstractmethod
def _sequence(self, entry):
pass
@abstractmethod
def _term_tags(self, entry):
pass
@abstractmethod
def _link_glossary_parameters(self, entry):
pass
@abstractmethod
def _subentry_lists(self, entry):
pass

View file

@ -0,0 +1,12 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"ruby": {},
"rt": {},
"p": {},
"漢字音G": {"name": "ul"},
"漢字音": {"name": "li"}
}

View file

@ -0,0 +1,414 @@
body {
margin: 1em 44px 1em 1em;
line-height: 1.5em;
font-family: serif;
font-size: 1.2em;
color: black;
}
body.ABC {
margin: 0.5em 0.5em 2em 0.5em;
}
a {
text-decoration: none;
}
img.gaiji {
height: 1em;
}
img.cut {
max-height: 100px;
max-width: 600px;
}
p {
margin: 0.5em 0
}
span[data-name="i"] {
font-style: italic;
}
span[data-name="h1"] {
font-family: sans-serif;
font-size: 1em;
font-weight: bold;
}
span[data-name="image"] {
display: block;
}
span[data-name="ref"] a {
text-decoration: none;
}
span[data-name="sl"] {
text-decoration: accent;
}
span[data-name="sm"] {
font-size: 0.7em;
}
span[data-name="small"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="sub"] {
font-size: 0.7em;
vertical-align: -0.35em;
}
span[data-name="ty2"] span[data-name="sub"] {
vertical-align: 0em;
}
span[data-name="ty2"] span[data-name="sup"] {
vertical-align: 0.5em;
}
span[data-name="文語形"] {
display: block;
}
span[data-name="用例"] {
display: block;
}
span[data-name="補説G"] {
display: block;
}
span[data-name="語義Gnum"] + span[data-name="補説G"] {
display: inline;
}
span[data-name="アクセントG"] + span[data-name="補説G"] {
display: inline;
}
span[data-name="補説G"] + span[data-name="語釈"] {
display: block;
}
span[data-name="アクセントG"] {
font-size: 0.7em;
vertical-align: super;
margin-left: 0.25em;
margin-right: 0.25em;
}
span[data-name="カット"] {
display: block;
}
span[data-name="カットG"] {
display: block;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 1em;
}
span[data-name="キャプション"] {
display: block;
}
span[data-name="ルビG"] {
font-family: sans-serif;
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
}
.warichu span[data-name="ルビG"] {
font-family: serif;
font-size: 0.5em;
font-weight: normal;
vertical-align: 0em;
}
span[data-name="中語義"] {
display: block;
}
span[data-name="付記"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="副義"] {
display: block;
margin-left: 1em;
}
span[data-name="単位名"] {
font-size: 0.5em;
}
span[data-name="原籍"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="句仮名"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="句項目"] {
margin-top: 0.5em;
margin-left: 1em;
display: block;
}
span[data-name="和字"] {
font-family: sans-serif;
}
span[data-name="品詞行"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="品詞用法"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="大語義"] {
display: block;
}
span[data-name="大語義num"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
color: white;
background-color: black;
}
span[data-name="子項目"] {
display: block;
margin-top: 0.5em;
margin-left: 1em;
}
span[data-name="慣用G"] {
display: block;
margin-top: 0.5em;
}
span[data-name="欧字"] {
font-family: sans-serif;
}
span[data-name="歴史仮名"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="派生G"] {
display: block;
margin-top: 0.5em;
}
span[data-name="準大語義"] {
display: block;
}
span[data-name="準大語義num"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
border: solid 1px black;
}
span[data-name="漢字音logo"] {
margin: 0.025em;
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
border: solid 0.5px black;
border-radius: 1em;
}
span[data-name="漢字音G"] {
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
}
span[data-name="生没年"] {
margin-left: 0.25em;
margin-right: 0.25em;
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="生没年"]:first-child {
margin-left: 0;
}
span[data-name="用法"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="異字同訓"] {
display: block;
margin-top: 0.5em;
}
span[data-name="異字同訓仮名"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="異字同訓漢字"] {
font-family: serif;
font-weight: normal;
}
span[data-name="異字同訓表記"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="異字同訓解説"] {
display: block;
}
span[data-name="異字同訓語義G"] {
display: block;
}
span[data-name="細義"] {
display: block;
}
span[data-name="表外字マーク"] {
font-size: 0.5em;
vertical-align: 0.5em;
}
span[data-name="見出仮名"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="見出相当部"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="見出部"] {
display: block;
}
span[data-name="解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="語義G"] {
display: block;
}
span[data-name="語義区切"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="返り点"] {
font-size: 0.5em;
font-weight: normal;
vertical-align: 1em;
}
span[data-name="返り点"].熟語記号 {
vertical-align: 0em;
}
span[data-name="項目"] {
display: block;
}
span[data-name="logo"] {
margin: 0.025em 0.25em;
padding: 0.1em;
font-size: 0.8em;
border: solid 1px black;
border-radius: 0.2em;
}
.gothic {
font-family: sans-serif;
font-weight: bold;
}
.warichu {
font-size: 1em;
}
.refnum {
font-size: 0.7em;
vertical-align: 0.35em;
}
#index {
display: none;
}
span[data-name="歴史仮名"]:before,
span[data-name="ルビG"]:before,
span[data-name="品詞行"]:before,
span[data-name="原籍"]:before,
span[data-name="品詞用法"]:before,
span[data-name="付記"]:before {
content: "(";
}
span[data-name="歴史仮名"]:after,
span[data-name="ルビG"]:after,
span[data-name="品詞行"]:after,
span[data-name="原籍"]:after,
span[data-name="品詞用法"]:after,
span[data-name="付記"]:after {
content: ")";
}
div[data-child-links] {
padding-top: 1em;
}
div[data-child-links] ul {
margin: 0;
padding-left: 2em;
}
div[data-child-links] span {
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
color: white;
border-width: 0.05em;
border-style: none;
border-color: black;
word-break: keep-all;
-webkit-border-radius: 0.2em;
}
div[data-child-links="子項目"] span {
background-color: rgb(153, 42, 103);
}
div[data-child-links="句項目"] span {
background-color: rgb(176, 127, 57);
}

View file

@ -0,0 +1,56 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
td ul {
margin: -0.1em 0em -0.1em -1em;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}
.num_icon {
font-family: sans-serif;
padding-left: 0.25em;
margin-right: 0.5em;
font-size: 0.8em;
word-break: keep-all;
color: white;
background-color: gray;
border-style: none;
-webkit-border-radius: 0.1em;
}

View file

@ -0,0 +1,40 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}

View file

@ -0,0 +1,40 @@
body {
font-family: serif;
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-size: 1.2em;
color: black;
}
table, th, td {
border: 1px solid;
border-collapse: collapse;
padding: 0.5em;
}
th {
font-family: sans-serif;
color: black;
background-color: lightgray;
font-weight: normal;
white-space: nowrap;
}
a {
text-decoration: none;
}
.見出し {
}
.読み方 {
font-family: sans-serif;
font-weight: bold;
}
.意味 {
margin-left: 1.0em;
margin-bottom: 0.5em;
}

449
data/mdict/css/smk8.css Normal file
View file

@ -0,0 +1,449 @@
body {
margin: 1em 44px 1em 1.5em;
line-height: 1.5em;
font-family: serif;
font-size: 1.2em;
color: black;
}
span[data-name="項目"] {
display: block;
}
span[data-name="見出部"] {
display: block;
}
span[data-name="見出部"].pri {
margin-left: -0.4em;
}
span[data-name="見出仮名"] {
font-family: sans-serif;
font-weight: bold;
}
rt[data-name="表音表記"] {
font-size: 0.65em;
}
rt[data-name="表外音訓マーク"] {
font-size: 0.65em;
}
rt[data-name="表外字マーク"] {
font-size: 0.65em;
}
span[data-name="解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="大語義"] {
display: block;
}
span[data-name="語義"] {
display: block;
}
span[data-name="副義"] {
display: block;
}
span[data-name="用例G"] {
display: block;
}
span[data-name="注記"] span[data-name="用例G"] {
display: inline;
}
span[data-name="用例"] {
display: block;
}
span[data-name="注記"] span[data-name="用例"] {
display: inline;
}
span[data-name="見出語省略"] {
margin-left: 0.125em;
margin-right: 0.125em;
}
span[data-name="教育漢字"] {
color: green;
}
span[data-name="ルビ"] {
font-size: 0.7em;
vertical-align: 0.5em;
}
span[data-name="ルビ区切"] {
font-size: 0.7em;
vertical-align: 0.65em;
}
span[data-name="名詞形G"] {
display: block;
}
span[data-name="可能形G"] {
display: block;
}
span[data-name="参照G"] {
display: block;
}
span[data-name="参照"] {
color: blue;
}
span[data-name="子項目"],
span[data-name="句項目"] {
display: block;
margin-bottom: 0.5em;
}
span[data-name="子項目F"],
span[data-name="句項目F"] {
display: block;
margin-bottom: 0.5em;
margin-top: 0.5em;
}
span[data-name="子見出部"] {
display: block;
}
span[data-name="子解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="句見出部"] {
display: block;
}
span[data-name="句解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="運用解説"] {
display: block;
}
span[data-name="表記解説"] {
display: block;
}
span[data-name="文法解説"] {
display: block;
}
span[data-name="かぞえ方解説"] {
display: block;
}
span[data-name="派生"] {
display: block;
margin-left: 1.25em;
}
span[data-name="派生SubGF"] {
display: block;
text-indent: -1.25em;
}
span[data-name="派生SubG"] {
display: block;
}
span[data-name="派生SubGF"] span[data-name="用例G"] {
text-indent: 0;
}
span[data-name="派生見出"] {
font-weight: bold;
}
span[data-name="派生見出"].normal {
font-weight: normal
}
span[data-name="造語成分項目"] {
display: block;
margin-top: 1em;
}
span[data-name="造語成分見出"] {
font-size:1.4em;
}
span[data-name="EM"] {
font-weight: bold;
}
span[data-name="アクセント"] {
font-size: 0.7em;
vertical-align: super;
}
span[data-name="アクセント組M"] {
vertical-align: 0.1em;
}
span[data-name="反意語M"],
span[data-name="同意語M"] {
vertical-align: 0.15em;
}
span[data-name="B"] {
font-weight: bold;
}
span[data-name="IT"] {
font-family: "Times New Roman";
font-style: italic;
}
span[data-name="EXCLAMATION"] {
font-family: "Times New Roman";
font-style: italic;
font-size: 1.2em;
}
span[data-name="歴史仮名"] {
font-family: serif;
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.35em;
-webkit-user-select: nocopy;
}
span[data-name="出現形"] {
font-weight: bold;
}
span[data-name="品詞用法"] {
font-size: 0.7em;
}
span[data-name="品詞用法"] span[data-name="品詞G"] {
font-size: 1.2em;
}
span[data-name="基本構文型"] {
font-size: 0.8em;
}
span[data-name="基本構文em"] {
font-weight: bold;
}
span[data-name="ウ濁音参照"] {
font-family: sans-serif;
font-weight: bold;
}
span[data-name="rect"] {
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
border-width: 0.05em;
border-style: solid;
border-color: black;
word-break: keep-all;
-webkit-border-radius: 0.1em;
}
span[data-name="rect"].fill {
color: white;
border-style: none;
background-color: gray;
}
span[data-name="rect"].red {
color: red;
border-color: red;
}
span[data-name="rect"].redfill {
color: white;
border-style: none;
background-color: red;
}
span[data-name="red"] {
color: red;
}
span[data-name="大語義番号"],
span[data-name="語義番号"],
span[data-name="副義番号"] {
margin-right: 0.25em;
font-family: sans-serif;
}
span[data-name="ref"] span[data-name="大語義番号"],
span[data-name="ref"] span[data-name="語義番号"],
span[data-name="ref"] span[data-name="副義番号"] {
font-size: 0.8em;
margin-right: 0;
}
span[data-name="表外字マーク"] {
vertical-align: 0.5em;
}
span[data-name="表外音訓マーク"] {
font-size: 0.5em;
vertical-align: 0.5em;
}
span[data-name="言換M"] {
font-size: 0.5em;
}
span[data-name="字音語参照項目"] {
display: block;
}
span[data-name="本文項目M"] {
font-size: 0.7em;
}
span[data-name="運用解説M"],
span[data-name="表記解説M"],
span[data-name="文法解説M"],
span[data-name="かぞえ方解説M"],
span[data-name="派生M"] {
margin-right: 0.25em;
font-family: sans-serif;
}
span[data-name="派生ロゴ"] {
margin-left: 0.1em;
margin-right: 0.1em;
}
span[data-name="文字"] {
margin: 0 0.2em;
}
span[data-name="二分"] {
font-size: 0.5em;
}
span[data-name="四分"] {
font-size: 0.25em;
}
span[data-name="ref"] {
margin-left: 0.1em;
margin-right: 0.1em;
}
span[data-name="ref-small"] {
font-size: 0.7em;
}
span[data-name="sup"] {
font-size: 0.6em;
}
span[data-name="外字"] img {
height: 1em;
}
img.audio {
height: 1em;
margin: 0 0.25em;
}
img.外字 {
height: 1em;
}
img.外字欧 {
height: 1em;
}
span[data-name="レ点M"] {
font-size: 0.6em;
vertical-align: -0.7em;
}
a {
text-decoration: none;
}
span[data-name="audio"] a {
padding-bottom: 0;
border-bottom: none;
}
span[data-name="アクセント"] a,
span[data-name="古語M"] a,
span[data-name="雅語M"] a,
span[data-name="派生M"] a,
span[data-name="原籍M"] a,
span[data-name="品詞M"] a {
color: black;
border-bottom-style: none;
}
span[data-name="歴史仮名"]:before,
span[data-name="ルビ"]:before {
content: "(";
}
span[data-name="歴史仮名"]:after,
span[data-name="ルビ"]:after {
content: ")";
}
div[data-child-links] {
padding-top: 1em;
}
div[data-child-links] ul {
margin: 0;
padding-left: 2em;
}
div[data-child-links] span {
padding: 0.1em;
font-family: sans-serif;
font-size: 0.8em;
color: white;
border-width: 0.05em;
border-style: none;
border-color: black;
word-break: keep-all;
-webkit-border-radius: 0.2em;
}
div[data-child-links="子項目"] span {
background-color: rgb(153, 42, 103);
}
div[data-child-links="句項目"] span {
background-color: rgb(176, 127, 57);
}
span.pri > span.外字 {
font-size: 0.65em;
vertical-align: super;
}

View file

@ -0,0 +1,7 @@
大辞林 第四版
<br><br>
https://www.monokakido.jp/ja/dictionaries/daijirin2/index.html
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
国語辞典オンライン
<br><br>
https://kokugo.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
故事・ことわざ・慣用句オンライン
<br><br>
https://kotowaza.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
四字熟語辞典オンライン
<br><br>
https://yoji.jitenon.jp/
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,7 @@
新明解国語辞典 第八版
<br><br>
https://www.monokakido.jp/ja/dictionaries/smk8/index.html
<br><br>
{{revision}}
<br><br>
{{attribution}}

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

View file

@ -0,0 +1 @@
大辞林 第四版

View file

@ -0,0 +1 @@
国語辞典オンライン

View file

@ -0,0 +1 @@
故事・ことわざ・慣用句オンライン

View file

@ -0,0 +1 @@
四字熟語辞典オンライン

View file

@ -0,0 +1 @@
新明解国語辞典 第八版

View file

@ -0,0 +1,25 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"表外字": {
"name": "ruby"
},
"表外字マーク": {
"name": "rt"
},
"表外音訓": {
"name": "ruby"
},
"表外音訓マーク": {
"name": "rt"
},
"表音式": {
"name": "ruby"
},
"表音表記": {
"name": "rt"
}
}

View file

@ -121,25 +121,31 @@
"style": "font-weight: bold;"
},
"表外字": {
"name": "ruby",
"name": "ruby"
},
"表外字マーク": {
"name": "rt",
"procedures": [
{
"procedure_name": "add_ruby_text",
"procedure_name": "insert_span",
"parameters": {
"mark": "︿",
"style": "font-size: 2em;"
"attr_name": "style",
"attr_val": "font-size: 2em;"
}
}
]
},
"表外音訓": {
"name": "ruby",
"name": "ruby"
},
"表外音訓マーク": {
"name": "rt",
"procedures": [
{
"procedure_name": "add_ruby_text",
"procedure_name": "insert_span",
"parameters": {
"mark": "︽",
"style": "font-size: 2em;"
"attr_name": "style",
"attr_val": "font-size: 2em;"
}
}
]
@ -148,23 +154,7 @@
"name": "ruby"
},
"表音表記": {
"name": "rt",
"procedures": [
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
},
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
}
]
"name": "rt"
},
"派生見出": {
"name": "span",

View file

@ -17,11 +17,22 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import os
import sys
import argparse
import subprocess
from bot.targets import Targets
from bot.crawlers.factory import new_crawler
def filename(f):
    """Validate a command-line value as a path to a readable regular file.

    Used as the ``type=`` callable of an argparse argument.

    Args:
        f: Path string supplied on the command line.

    Returns:
        The path ``f`` unchanged, once both checks pass.

    Raises:
        argparse.ArgumentTypeError: If ``f`` is not an existing regular
            file, or if the file is not readable by the current process.
    """
    if not os.path.isfile(f):
        raise argparse.ArgumentTypeError(f"`{f}` is not a valid filename")
    if not os.access(f, os.R_OK):
        raise argparse.ArgumentTypeError(f"Cannot access file `{f}`")
    return f
def directory(d):
if not os.path.isdir(d):
raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
@ -35,34 +46,71 @@ def parse_args(target_names):
parser = argparse.ArgumentParser(
prog="jitenbot",
description="Convert Japanese dictionary files to new formats.",
epilog="See README.md for details regarding media directory structures",
)
parser.add_argument(
"target",
choices=target_names,
help="name of dictionary to convert"
help="name of dictionary to convert",
)
parser.add_argument(
"-p", "--page-dir",
help="path to directory containing XML page files",
type=directory
type=directory,
)
parser.add_argument(
"-i", "--image-dir",
help="path to directory containing image folders (gaiji, graphics, etc.)",
type=directory
"-m", "--media-dir",
help="path to directory containing media folders (gaiji, graphics, audio, etc.)",
type=directory,
)
parser.add_argument(
"-i", "--mdict-icon",
help="path to icon file to be used with MDict",
type=filename,
)
parser.add_argument(
"--no-yomichan-export",
help="skip export of dictionary data to Yomichan format",
action='store_true',
)
parser.add_argument(
"--no-mdict-export",
help="skip export of dictionary data to MDict format",
action='store_true',
)
args = parser.parse_args()
return args
def test_mdict():
    """Verify that the external `mdict` pack tool can be launched.

    Runs ``mdict --version`` with its standard output discarded. When the
    executable cannot be found, installation hints are written to standard
    error and the program exits with a non-zero status so that shells and
    wrapper scripts can detect the failure.

    Raises:
        SystemExit: With status 1 if the `mdict` executable is not found.
        subprocess.CalledProcessError: If ``mdict --version`` itself
            returns a non-zero exit status (because of ``check=True``).
    """
    try:
        subprocess.run(
            ["mdict", "--version"],
            check=True,
            stdout=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        # Diagnostics belong on stderr so they never mix with piped output.
        print("Could not find `mdict` pack tool.", file=sys.stderr)
        print("Ensure that mdict-utils is installed and", file=sys.stderr)
        print("included in the environment PATH.\n", file=sys.stderr)
        print("Mdict export functionality may also be", file=sys.stderr)
        print("disabled with the --no-mdict-export flag.", file=sys.stderr)
        # A bare sys.exit() would report success (status 0) for a failure.
        sys.exit(1)
def main():
    """Convert the dictionary selected on the command line.

    Parses the arguments, optionally checks that the `mdict` tool is
    available, builds the crawler for the chosen target, loads its pages,
    and then writes each export format that was not disabled by a
    ``--no-*-export`` flag.
    """
    target_names = [x.value for x in Targets]
    args = parse_args(target_names)
    if not args.no_mdict_export:
        # Check for the pack tool up front, before any pages are collected.
        test_mdict()
    selected_target = Targets(args.target)
    crawler = new_crawler(selected_target)
    crawler.collect_pages(args.page_dir)
    crawler.read_pages()
    # The media directory comes from --media-dir; the parser no longer
    # defines an `image_dir` attribute, so only `args.media_dir` is used.
    if not args.no_yomichan_export:
        crawler.make_yomichan_dictionary(args.media_dir)
    if not args.no_mdict_export:
        crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon)
if __name__ == "__main__":

View file

@ -6,6 +6,7 @@ css-parser==1.0.8
html5lib==1.1
idna==3.4
lxml==4.9.2
mdict-utils==1.3.12
Pillow==9.5.0
platformdirs==3.5.0
requests==2.29.0
@ -13,5 +14,7 @@ six==1.16.0
soupsieve==2.4.1
SudachiDict-full==20230110
SudachiPy==0.6.7
tqdm==4.65.0
urllib3==1.26.15
webencodings==0.5.1
xxhash==3.2.0

13
run_all.sh Normal file
View file

@ -0,0 +1,13 @@
# Run jitenbot once for each supported dictionary target.

# These targets are invoked with the target name only.
python jitenbot.py jitenon-kokugo
python jitenbot.py jitenon-yoji
python jitenbot.py jitenon-kotowaza
# These targets are given a media directory, a page directory and an
# MDict icon file, all read from the local monokakido/ folder.
python jitenbot.py smk8 \
--media-dir monokakido/SMK8/media \
--page-dir monokakido/SMK8/pages \
--mdict-icon monokakido/SMK8/SMK8-76@3x.png
python jitenbot.py daijirin2 \
--media-dir monokakido/DAIJIRIN2/media \
--page-dir monokakido/DAIJIRIN2/pages \
--mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png