Add support for sankoku8

This commit is contained in:
stephenmk 2023-07-18 00:43:38 -05:00
parent b0a9ab5cae
commit e85d0a1625
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
34 changed files with 6273 additions and 44 deletions

View file

@ -1,10 +1,13 @@
### Todo
- [x] Add factory classes to reduce the amount of class import statements
- [ ] Add dynamic import functionality to factory classes to reduce boilerplate
- [x] Support exporting to MDict (.MDX) dictionary format
- [x] Validate JSON schema of Yomichan terms during export
- [ ] Add support for monokakido search keys from index files
- [ ] Delete unneeded media from temp build directory before final export
- [ ] Add test suite
- [ ] Add documentation (docstrings, etc.)
- [ ] Validate JSON schema of Yomichan terms during export
- [ ] Add build scripts for producing program binaries
- [ ] Validate scraped webpages after downloading
- [ ] Log non-fatal failures to a log file instead of raising exceptions
@ -13,7 +16,7 @@
- [ ] [Yoji-Jukugo.com](https://yoji-jukugo.com/)
- [ ] [実用日本語表現辞典](https://www.weblio.jp/cat/dictionary/jtnhj)
- [ ] Support more Monokakido dictionaries
- [ ] 三省堂国語辞典 第8版 (SANKOKU8)
- [x] 三省堂国語辞典 第8版 (SANKOKU8)
- [ ] 精選版 日本国語大辞典 (NDS)
- [ ] 大辞泉 第2版 (DAIJISEN2)
- [ ] 明鏡国語辞典 第3版 (MK3)

View file

@ -39,9 +39,9 @@ class Crawler(ABC):
self._entries.append(entry)
print()
def make_yomichan_dictionary(self, media_dir):
def make_yomichan_dictionary(self, media_dir, validate):
exporter = new_yomi_exporter(self._target)
exporter.export(self._entries, media_dir)
exporter.export(self._entries, media_dir, validate)
def make_mdict_dictionary(self, media_dir, icon_file):
exporter = new_mdict_exporter(self._target)
@ -152,3 +152,7 @@ class Smk8Crawler(_MonokakidoCrawler):
class Daijirin2Crawler(_MonokakidoCrawler):
pass
class Sankoku8Crawler(_MonokakidoCrawler):
pass

View file

@ -5,6 +5,7 @@ from bot.crawlers.crawlers import JitenonYojiCrawler
from bot.crawlers.crawlers import JitenonKotowazaCrawler
from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
from bot.crawlers.crawlers import Sankoku8Crawler
def new_crawler(target):
@ -14,5 +15,6 @@ def new_crawler(target):
Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
Targets.SMK8: Smk8Crawler,
Targets.DAIJIRIN2: Daijirin2Crawler,
Targets.SANKOKU8: Sankoku8Crawler,
}
return crawler_map[target](target)

View file

@ -37,14 +37,16 @@ def load_config():
@cache
def load_yomichan_inflection_categories():
file_name = os.path.join("yomichan", "inflection_categories.json")
file_name = os.path.join(
"yomichan", "inflection_categories.json")
data = __load_json(file_name)
return data
@cache
def load_yomichan_metadata():
file_name = os.path.join("yomichan", "index.json")
file_name = os.path.join(
"yomichan", "index.json")
data = __load_json(file_name)
return data
@ -53,31 +55,21 @@ def load_yomichan_metadata():
def load_variant_kanji():
def loader(data, row):
data[row[0]] = row[1]
file_name = os.path.join("entries", "variant_kanji.csv")
file_name = os.path.join(
"entries", "variant_kanji.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@cache
def load_smk8_phrase_readings():
def load_phrase_readings(target):
def loader(data, row):
entry_id = (int(row[0]), int(row[1]))
reading = row[2]
data[entry_id] = reading
file_name = os.path.join("entries", "smk8", "phrase_readings.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@cache
def load_daijirin2_phrase_readings():
def loader(data, row):
entry_id = (int(row[0]), int(row[1]))
reading = row[2]
data[entry_id] = reading
file_name = os.path.join("entries", "daijirin2", "phrase_readings.csv")
file_name = os.path.join(
"entries", target.value, "phrase_readings.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@ -92,7 +84,8 @@ def load_daijirin2_kana_abbreviations():
if abbr.strip() != "":
abbreviations.append(abbr)
data[entry_id] = abbreviations
file_name = os.path.join("entries", "daijirin2", "kana_abbreviations.csv")
file_name = os.path.join(
"entries", "daijirin2", "kana_abbreviations.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@ -100,14 +93,24 @@ def load_daijirin2_kana_abbreviations():
@cache
def load_yomichan_name_conversion(target):
file_name = os.path.join("yomichan", "name_conversion", f"{target.value}.json")
file_name = os.path.join(
"yomichan", "name_conversion", f"{target.value}.json")
data = __load_json(file_name)
return data
@cache
def load_yomichan_term_schema():
file_name = os.path.join(
"yomichan", "dictionary-term-bank-v3-schema.json")
schema = __load_json(file_name)
return schema
@cache
def load_mdict_name_conversion(target):
file_name = os.path.join("mdict", "name_conversion", f"{target.value}.json")
file_name = os.path.join(
"mdict", "name_conversion", f"{target.value}.json")
data = __load_json(file_name)
return data
@ -131,7 +134,8 @@ def __load_adobe_glyphs():
data[code].append(character)
else:
data[code] = [character]
file_name = os.path.join("entries", "adobe", "Adobe-Japan1_sequences.txt")
file_name = os.path.join(
"entries", "adobe", "Adobe-Japan1_sequences.txt")
data = {}
__load_csv(file_name, loader, data, delim=';')
return data
@ -139,7 +143,8 @@ def __load_adobe_glyphs():
@cache
def __load_override_adobe_glyphs():
file_name = os.path.join("entries", "adobe", "override_glyphs.json")
file_name = os.path.join(
"entries", "adobe", "override_glyphs.json")
json_data = __load_json(file_name)
data = {}
for key, val in json_data.items():

View file

@ -2,7 +2,7 @@ from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_daijirin2_phrase_readings
from bot.data import load_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page
@ -221,7 +221,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
return expressions
def _find_readings(self):
phrase_readings = load_daijirin2_phrase_readings()
phrase_readings = load_phrase_readings(self.target)
text = phrase_readings[self.entry_id]
alternatives = Expressions.expand_daijirin_alternatives(text)
readings = []

View file

@ -5,6 +5,7 @@ from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.entries.sankoku8 import Sankoku8Entry
def new_entry(target, page_id):
@ -14,5 +15,6 @@ def new_entry(target, page_id):
Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
Targets.SMK8: Smk8Entry,
Targets.DAIJIRIN2: Daijirin2Entry,
Targets.SANKOKU8: Sankoku8Entry,
}
return entry_map[target](target, page_id)

260
bot/entries/sankoku8.py Normal file
View file

@ -0,0 +1,260 @@
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page
class _BaseSankoku8Entry(Entry):
    """Shared behaviour for 三省堂国語辞典 第8版 (sankoku8) entries,
    child entries (子項目), and phrase entries (句項目).

    Entry IDs are (page_id, subentry_index) tuples; subentry index 0
    denotes the top-level entry of a page.
    """

    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []   # Sankoku8ChildEntry objects split out of the page
        self.phrases = []    # Sankoku8PhraseEntry objects split out of the page
        self._hyouki_name = "表記"
        # element names of the headword line; set by concrete subclasses
        self._midashi_name = None
        self._midashi_kana_name = None

    def get_global_identifier(self):
        """Return a stable cross-format ID such as ``@sankoku8-001234-000A``."""
        parent_part = format(self.entry_id[0], '06')
        # 4-digit uppercase hex; replaces the fragile
        # hex().lstrip('0x').zfill(4).upper() chain (str.lstrip strips a
        # character *set*, not a prefix)
        child_part = format(self.entry_id[1], '04X')
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def _get_headwords(self):
        """Map each kana reading to the written expressions it heads.

        Pairing depends on how many readings/expressions the headword
        line contains; a reading also heads itself whenever the 見出
        element carries no 表記 (orthography) child.
        """
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            # kana-only entry: every reading heads itself
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            # one spelling shared by several readings
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            # readings and spellings pair up positionally
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def get_part_of_speech_tags(self):
        """Collect unique 品詞G tag texts; cached after the first call."""
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def __decompose_subentries(self, page):
        """Split 子項目/句項目 subentries out into their own entry objects
        and remove them from this page's markup (each is renamed 項目 so
        it parses as a standalone entry)."""
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Sankoku8ChildEntry, ["子項目"], self.children],
            [Sankoku8PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        """Parse ``"123"`` or ``"123-000A"`` into a (page, subentry) tuple."""
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)
class Sankoku8Entry(_BaseSankoku8Entry):
    """Top-level sankoku8 entry (subentry index 0 of its page)."""

    def __init__(self, target, page_id):
        super().__init__(target, (page_id, 0))
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        # raw pages need glyph substitution and stripping before the
        # base class splits out the subentries
        super().set_page(preprocess_page(page))
class Sankoku8ChildEntry(_BaseSankoku8Entry):
    # Child (子項目) subentry: base behaviour with child-specific headword
    # element names. Note: despite its name, ``page_id`` receives the full
    # (page, subentry) tuple here — see __decompose_subentries, which passes
    # the parsed subentry_id straight through.
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"
class Sankoku8PhraseEntry(_BaseSankoku8Entry):
    """Phrase (句項目) subentry: readings come from a pre-extracted CSV
    rather than from the page markup."""

    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        headwords = {}
        # expressions and readings pair up positionally
        for expression, reading in zip(expressions, readings):
            headwords.setdefault(reading, []).append(expression)
        return headwords

    def _find_expressions(self, soup):
        return parse_hyouki_soup(soup.find("句表記"), [""])

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        return parse_hyouki_pattern(reading_patterns[self.entry_id])
def parse_hyouki_soup(soup, base_exps):
    """Recursively expand a 表記 (orthography) element into every written
    form it encodes, returning the list of expanded strings.

    Each markup construct multiplies the current expression set:
    alternation groups produce one branch per alternative, and 省略
    (omission) produces branches both with and without the optional part.
    """
    # NOTE(review): these literals are empty strings in this copy of the
    # file; the original presumably listed bracket/layout characters to be
    # stripped from text nodes — confirm against upstream.
    omitted_characters = [
        "", "", "", "", "", "", "", "", ""
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            # paraphrase group: one branch per 言換 alternative
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            # supplementary spelling: branch on the target spelling and on
            # the supplementary content
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            # omittable segment: keep each expression as-is and also with
            # the segment appended
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            # any other element: recurse through it transparently
            new_exps = parse_hyouki_soup(child, exps)
        else:
            # plain text node: strip omitted characters, then append the
            # text to every expression accumulated so far
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps
def parse_hyouki_pattern(pattern):
    """Expand a phrase-reading pattern string into all of its readings.

    The pattern's shorthand punctuation is rewritten into the same XML
    elements used by entry pages (省略, 補足表記, 言換G, …), after which
    parse_hyouki_soup performs the actual expansion.
    """
    # NOTE(review): every key below is an empty string in this copy of the
    # file — at runtime duplicate "" keys collapse into one, so these
    # replacements cannot all take effect as written. The originals were
    # presumably distinct bracket characters (〈〉, ⦅⦆, etc.); confirm
    # against upstream before relying on this function.
    replacements = {
        "": "<省略>",
        "": "</省略>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G><表記内容>",
        "": "</表記内容></表記内容G></補足表記>",
        "": "<言換G>〈<言換>",
        "": "</言換><言換>",
        "": "</言換>〉</言換G>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G>⦅<表記内容>",
        "": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps

View file

@ -0,0 +1,28 @@
import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
def preprocess_page(page):
    """Prepare a raw sankoku8 XML page: substitute gaiji glyph codes with
    real characters, then strip everything outside the primary 項目
    element."""
    soup = BeautifulSoup(page, features="xml")
    __replace_glyph_codes(soup)
    return __strip_page(soup)
def __replace_glyph_codes(soup):
    # Substitute placeholder characters inside <glyph> elements with the
    # character for the element's Adobe-Japan1 glyph code.
    for el in soup.find_all("glyph"):
        # the style attribute has the form "glyph:<code>;"
        m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
        code = int(m.group(1))
        # NOTE(review): the search string below is empty in this copy of the
        # file; it presumably matched a placeholder character such as the
        # geta mark (〓) — confirm against upstream.
        for geta in el.find_all(string=""):
            glyph = get_adobe_glyph(code)
            geta.replace_with(glyph)
def __strip_page(soup):
    """Return the markup of the page's primary 項目 element; its absence
    means the page is malformed."""
    koumoku = soup.find(["項目"])
    if koumoku is None:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
    return koumoku.decode()

View file

@ -2,7 +2,7 @@ from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_smk8_phrase_readings
from bot.data import load_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page
@ -163,7 +163,7 @@ class Smk8ChildEntry(_BaseSmk8Entry):
class Smk8PhraseEntry(_BaseSmk8Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.__phrase_readings = load_smk8_phrase_readings()
self.__phrase_readings = load_phrase_readings(self.target)
def get_part_of_speech_tags(self):
# phrases do not contain these tags

View file

@ -218,3 +218,8 @@ class Smk8Exporter(_MonokakidoExporter):
class Daijirin2Exporter(_MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"
class Sankoku8Exporter(_MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2021"

View file

@ -5,6 +5,7 @@ from bot.mdict.exporters.export import JitenonYojiExporter
from bot.mdict.exporters.export import JitenonKotowazaExporter
from bot.mdict.exporters.export import Smk8Exporter
from bot.mdict.exporters.export import Daijirin2Exporter
from bot.mdict.exporters.export import Sankoku8Exporter
def new_mdict_exporter(target):
@ -14,5 +15,6 @@ def new_mdict_exporter(target):
Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
Targets.SMK8: Smk8Exporter,
Targets.DAIJIRIN2: Daijirin2Exporter,
Targets.SANKOKU8: Sankoku8Exporter,
}
return exporter_map[target](target)

View file

@ -0,0 +1,137 @@
import re
from bs4 import BeautifulSoup
from bot.data import load_mdict_name_conversion
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Convert a sankoku8 entry's XML page into Mdict glossary markup.

    The helper calls below are order-sensitive: link-stripping must run
    before __convert_links (which assumes every remaining <a> is a valid
    reference), and name conversion runs last on the transformed tree.
    media_dir is currently unused here — presumably kept for interface
    parity with the other glossary builders; confirm before removing.
    """
    soup = entry.get_page_soup()
    __reposition_marks(soup)
    __remove_appendix_links(soup)
    __convert_images(soup)
    __remove_links_without_href(soup)
    __convert_links(soup, entry)
    __add_parent_link(soup, entry)
    __add_homophone_links(soup, entry)
    name_conversion = load_mdict_name_conversion(entry.target)
    convert_names(soup, name_conversion)
    glossary = soup.span.decode()
    return glossary
def __reposition_marks(soup):
    """The 表外字マーク／表外音訓マーク symbols are converted to rubies later,
    so each must be moved after its companion text to render correctly."""
    pairs = (("表外字", "表外字マーク"), ("表外音訓", "表外音訓マーク"))
    for parent_name, mark_name in pairs:
        for parent in soup.find_all(parent_name):
            parent.append(parent.find(mark_name))
def __remove_appendix_links(soup):
    """Appendix data would be useful to include, but jitenbot does not
    currently fetch or process the appendix files. Mdict could link them
    (Yomichan could not), so for now the links are demoted to spans with
    their attributes stashed in data-* form."""
    for anchor in soup.find_all("a"):
        if not anchor.has_attr("href"):
            continue
        href = anchor.attrs["href"]
        if not href.startswith("appendix"):
            continue
        anchor.attrs["data-name"] = "a"
        anchor.attrs["data-href"] = href
        anchor.name = "span"
        del anchor.attrs["href"]
def __convert_images(soup):
    # Replace known inline SVG icons with equivalent text, stashing the
    # original tag name and src in data-* attributes.
    # NOTE(review): the replacement texts for the first entries are empty
    # strings in this copy of the file; originally they were presumably
    # marker characters (importance stars, accent marks) — confirm against
    # upstream.
    conversions = [
        ["svg-logo/重要語.svg", ""],
        ["svg-logo/最重要語.svg", ""],
        ["svg-logo/一般常識語.svg", "☆☆"],
        ["svg-logo/追い込み.svg", ""],
        ["svg-special/区切り線.svg", "|"],
        ["svg-accent/平板.svg", ""],
        ["svg-accent/アクセント.svg", ""],
        ["svg-logo/アク.svg", "アク"],
        ["svg-logo/丁寧.svg", "丁寧"],
        ["svg-logo/可能.svg", "可能"],
        ["svg-logo/尊敬.svg", "尊敬"],
        ["svg-logo/接尾.svg", "接尾"],
        ["svg-logo/接頭.svg", "接頭"],
        ["svg-logo/表記.svg", "表記"],
        ["svg-logo/謙譲.svg", "謙譲"],
        ["svg-logo/区別.svg", "区別"],
        ["svg-logo/由来.svg", "由来"],
    ]
    for conversion in conversions:
        filename, text = conversion
        for elm in soup.find_all("img", attrs={"src": filename}):
            elm.attrs["data-name"] = elm.name
            elm.attrs["data-src"] = elm.attrs["src"]
            elm.name = "span"
            elm.string = text
            del elm.attrs["src"]
def __remove_links_without_href(soup):
    """Demote href-less anchor tags to spans, recording the original tag
    name in data-name."""
    for anchor in soup.find_all("a"):
        if not anchor.has_attr("href"):
            anchor.attrs["data-name"] = anchor.name
            anchor.name = "span"
def __convert_links(soup, entry):
    # Rewrite intra-dictionary references to Mdict entry:// URIs.
    # Assumes __remove_links_without_href already ran, so every <a> here
    # has an href.
    for elm in soup.find_all("a"):
        # drop any trailing space-separated fragment of the href
        href = elm.attrs["href"].split(" ")[0]
        if re.match(r"^#?[0-9]+(?:-[0-9A-F]{4})?$", href):
            href = href.removeprefix("#")
            ref_entry_id = entry.id_string_to_entry_id(href)
            if ref_entry_id in entry.ID_TO_ENTRY:
                ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
            else:
                # fall back to the subentry's top-level parent page
                ref_entry = entry.ID_TO_ENTRY[(ref_entry_id[0], 0)]
            gid = ref_entry.get_global_identifier()
            elm.attrs["href"] = f"entry://{gid}"
        elif re.match(r"^entry:", href):
            # already an Mdict entry URI
            pass
        elif re.match(r"^https?:[\w\W]*", href):
            # external web link: leave unchanged
            pass
        else:
            raise Exception(f"Invalid href format: {href}")
def __add_parent_link(soup, entry):
    """Turn a subentry's 親見出相当部 element into a link back to its
    parent entry."""
    elm = soup.find("親見出相当部")
    if elm is None:
        return
    gid = entry.get_parent().get_global_identifier()
    elm.attrs["href"] = f"entry://{gid}"
    elm.attrs["data-name"] = elm.name
    elm.name = "a"
def __add_homophone_links(soup, entry):
    # Homophone arrow icons link to the adjacent page IDs (previous/next
    # top-level entries).
    # NOTE(review): the link label strings are empty in this copy of the
    # file; originally they were presumably arrow characters — confirm
    # against upstream.
    forward_link = ["", entry.entry_id[0] + 1]
    backward_link = ["", entry.entry_id[0] - 1]
    homophone_info_list = [
        ["svg-logo/homophone1.svg", [forward_link]],
        ["svg-logo/homophone2.svg", [forward_link, backward_link]],
        ["svg-logo/homophone3.svg", [backward_link]],
    ]
    for homophone_info in homophone_info_list:
        filename, link_info = homophone_info
        for elm in soup.find_all("img", attrs={"src": filename}):
            for info in link_info:
                text, link_id = info
                link_entry = entry.ID_TO_ENTRY[(link_id, 0)]
                gid = link_entry.get_global_identifier()
                link = BeautifulSoup("<a/>", "xml").a
                link.string = text
                link.attrs["href"] = f"entry://{gid}"
                elm.append(link)
            # replace the icon element with the links it now contains
            elm.unwrap()

View file

@ -5,6 +5,7 @@ from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
from bot.mdict.terms.sankoku8 import Sankoku8Terminator
def new_terminator(target):
@ -14,5 +15,6 @@ def new_terminator(target):
Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
Targets.SMK8: Smk8Terminator,
Targets.DAIJIRIN2: Daijirin2Terminator,
Targets.SANKOKU8: Sankoku8Terminator,
}
return terminator_map[target](target)

View file

@ -0,0 +1,23 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.sankoku8 import make_glossary
class Sankoku8Terminator(Terminator):
    """Mdict term generator for sankoku8 entries."""

    def _glossary(self, entry):
        # glossaries are cached per entry ID since an entry may be
        # requested more than once
        cache = self._glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry, self._media_dir)
        return cache[key]

    def _link_glossary_parameters(self, entry):
        return [
            [entry.children, "子項目"],
            [entry.phrases, "句項目"],
        ]

    def _subentry_lists(self, entry):
        return [
            entry.children,
            entry.phrases,
        ]

View file

@ -7,3 +7,4 @@ class Targets(Enum):
JITENON_KOTOWAZA = "jitenon-kotowaza"
SMK8 = "smk8"
DAIJIRIN2 = "daijirin2"
SANKOKU8 = "sankoku8"

View file

@ -3,13 +3,16 @@
import json
import os
import shutil
import copy
from pathlib import Path
from datetime import datetime
from abc import ABC, abstractmethod
from platformdirs import user_documents_dir, user_cache_dir
import fastjsonschema
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator
from bot.data import load_yomichan_term_schema
class Exporter(ABC):
@ -19,7 +22,7 @@ class Exporter(ABC):
self._build_dir = None
self._terms_per_file = 2000
def export(self, entries, image_dir):
def export(self, entries, image_dir, validate):
self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata()
index = meta[self._target.value]["index"]
@ -27,6 +30,8 @@ class Exporter(ABC):
index["attribution"] = self._get_attribution(entries)
tags = meta[self._target.value]["tags"]
terms = self.__get_terms(entries)
if validate:
self.__validate_terms(terms)
self.__make_dictionary(terms, index, tags)
@abstractmethod
@ -49,6 +54,14 @@ class Exporter(ABC):
self._build_dir = build_directory
return self._build_dir
def __get_invalid_term_dir(self):
cache_dir = user_cache_dir("jitenbot")
log_dir = os.path.join(cache_dir, "invalid_yomichan_terms")
if Path(log_dir).is_dir():
shutil.rmtree(log_dir)
os.makedirs(log_dir)
return log_dir
def __init_build_image_dir(self, image_dir):
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value)
@ -71,8 +84,29 @@ class Exporter(ABC):
print()
return terms
def __validate_terms(self, terms):
print("Making a copy of term data for validation...")
terms_copy = copy.deepcopy(terms) # because validator will alter data!
term_count = len(terms_copy)
log_dir = self.__get_invalid_term_dir()
schema = load_yomichan_term_schema()
validator = fastjsonschema.compile(schema)
failure_count = 0
for idx, term in enumerate(terms_copy):
update = f"Validating term {idx+1}/{term_count}"
print(update, end='\r', flush=True)
try:
validator([term])
except fastjsonschema.JsonSchemaException:
failure_count += 1
term_file = os.path.join(log_dir, f"{idx}.json")
with open(term_file, "w", encoding='utf8') as f:
json.dump([term], f, indent=4, ensure_ascii=False)
print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
if failure_count > 0:
print(f"Invalid terms saved to `{log_dir}` for debugging")
def __make_dictionary(self, terms, index, tags):
print(f"Exporting {len(terms)} Yomichan terms...")
self.__write_term_banks(terms)
self.__write_index(index)
self.__write_tag_bank(tags)
@ -80,14 +114,18 @@ class Exporter(ABC):
self.__rm_build_dir()
def __write_term_banks(self, terms):
print(f"Exporting {len(terms)} JSON terms")
build_dir = self._get_build_dir()
max_i = int(len(terms) / self._terms_per_file) + 1
for i in range(max_i):
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
start = self._terms_per_file * i
end = self._terms_per_file * (i + 1)
update = f"Writing terms to term banks {start} - {end}"
print(update, end='\r', flush=True)
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
print()
def __write_index(self, index):
build_dir = self._get_build_dir()
@ -104,6 +142,7 @@ class Exporter(ABC):
json.dump(tags, f, indent=4, ensure_ascii=False)
def __write_archive(self, filename):
print("Archiving data to ZIP file...")
archive_format = "zip"
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir():
@ -151,19 +190,22 @@ class JitenonKotowazaExporter(_JitenonExporter):
pass
class Smk8Exporter(Exporter):
class _MonokakidoExporter(Exporter):
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"
class Smk8Exporter(_MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020"
class Daijirin2Exporter(Exporter):
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"
class Daijirin2Exporter(_MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"
class Sankoku8Exporter(_MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2021"

View file

@ -5,6 +5,7 @@ from bot.yomichan.exporters.export import JitenonYojiExporter
from bot.yomichan.exporters.export import JitenonKotowazaExporter
from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
from bot.yomichan.exporters.export import Sankoku8Exporter
def new_yomi_exporter(target):
@ -14,5 +15,6 @@ def new_yomi_exporter(target):
Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
Targets.SMK8: Smk8Exporter,
Targets.DAIJIRIN2: Daijirin2Exporter,
Targets.SANKOKU8: Sankoku8Exporter,
}
return exporter_map[target](target)

View file

@ -26,6 +26,27 @@ def make_monochrome_fill_rectangle(path, text):
f.write(svg)
@cache
def make_accent(path):
    """Write the accent-mark SVG to *path* (cached, so each path is
    written at most once per run)."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(__svg_accent())
@cache
def make_heiban(path):
    """Write the 平板 (flat pitch) SVG to *path* (cached, so each path is
    written at most once per run)."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(__svg_heiban())
@cache
def make_red_char(path, char):
    """Write an SVG rendering *char* in red to *path* (cached, so each
    path/char pair is written at most once per run)."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(__svg_red_character(char))
def __calculate_svg_ratio(path):
with open(path, "r", encoding="utf-8") as f:
xml = f.read()
@ -82,3 +103,30 @@ def __svg_masked_rectangle(text):
fill='black' mask='url(#a)'/>
</svg>"""
return svg.strip()
def __svg_heiban():
    """Return SVG markup for the 平板 (flat) pitch mark: one horizontal
    red bar."""
    # plain string literal: the original used an f-string with no
    # placeholders (flake8 F541)
    svg = """
<svg viewBox='0 0 210 300' xmlns='http://www.w3.org/2000/svg' version='1.1'>
    <rect width='210' height='30' fill='red'/>
</svg>"""
    return svg.strip()
def __svg_accent():
    """Return SVG markup for the pitch-accent mark: a horizontal red bar
    with a downward tick at its right edge."""
    # plain string literal: the original used an f-string with no
    # placeholders (flake8 F541)
    svg = """
<svg viewBox='0 0 150 300' xmlns='http://www.w3.org/2000/svg' version='1.1'>
    <rect width='150' height='30' fill='red'/>
    <rect width='30' height='150' x='120' fill='red'/>
</svg>"""
    return svg.strip()
def __svg_red_character(char):
    """Return SVG markup rendering *char* as a single large red glyph,
    centered in the viewbox."""
    markup = f"""
<svg viewBox='0 0 300 300' xmlns='http://www.w3.org/2000/svg' version='1.1'>
    <text text-anchor='middle' x='50%' y='50%' dy='.37em'
        font-family='sans-serif' font-size='300px'
        fill='red'>{char}</text>
</svg>"""
    return markup.strip()

View file

@ -0,0 +1,344 @@
import re
import os
from bs4 import BeautifulSoup
import bot.yomichan.glossary.icons as Icons
from bot.data import load_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.name_conversion import convert_names
def make_glossary(entry, media_dir):
    """Convert a sankoku8 entry's XML page into a Yomichan glossary list.

    The helper calls are order-sensitive: __remove_links_without_href must
    precede __remove_appendix_links and __convert_links (both index
    elm.attrs["href"] directly), and icon/image conversion happens before
    name conversion produces the final structured content. SVG assets are
    generated into media_dir as needed.
    """
    soup = entry.get_page_soup()
    __remove_glyph_styles(soup)
    __reposition_marks(soup)
    __remove_links_without_href(soup)
    __remove_appendix_links(soup)
    __convert_links(soup, entry)
    __add_parent_link(soup, entry)
    __add_homophone_links(soup, entry)
    __convert_images_to_text(soup)
    __text_parens_to_images(soup, media_dir)
    __replace_icons(soup, media_dir)
    __replace_accent_symbols(soup, media_dir)
    __convert_gaiji(soup, media_dir)
    __convert_graphics(soup, media_dir)
    __convert_number_icons(soup, media_dir)
    name_conversion = load_yomichan_name_conversion(entry.target)
    convert_names(soup, name_conversion)
    gloss = make_gloss(soup.span)
    glossary = [gloss]
    return glossary
def __remove_glyph_styles(soup):
    """Move style attributes on <glyph> elements into data-style, since
    the css_parser library would otherwise emit annoying warnings about
    these glyph character styles later."""
    for glyph in soup.find_all("glyph"):
        if not glyph.has_attr("style"):
            continue
        glyph["data-style"] = glyph.attrs["style"]
        del glyph.attrs["style"]
def __reposition_marks(soup):
    """These マーク symbols are converted to rubies later, so each is moved
    after its companion text to make it appear correctly."""
    for parent_name, mark_name in (
        ("表外字", "表外字マーク"),
        ("表外音訓", "表外音訓マーク"),
    ):
        for parent in soup.find_all(parent_name):
            parent.append(parent.find(mark_name))
def __remove_links_without_href(soup):
    """Demote anchor tags lacking an href to spans, keeping the original
    tag name in data-name."""
    for anchor in soup.find_all("a"):
        if anchor.has_attr("href"):
            continue
        anchor.attrs["data-name"] = anchor.name
        anchor.name = "span"
def __remove_appendix_links(soup):
    # Yomichan cannot display appendix data, so the links are unwrapped.
    # Assumes __remove_links_without_href already ran: every remaining
    # <a> has an href.
    for anchor in soup.find_all("a"):
        if anchor.attrs["href"].startswith("appendix"):
            anchor.unwrap()
def __convert_links(soup, entry):
    # Rewrite intra-dictionary references to Yomichan search queries for
    # the referenced entry's first headword expression.
    for elm in soup.find_all("a"):
        # drop any trailing space-separated fragment of the href
        href = elm.attrs["href"].split(" ")[0]
        href = href.removeprefix("#")
        if not re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            raise Exception(f"Invalid href format: {href}")
        ref_entry_id = entry.id_string_to_entry_id(href)
        if ref_entry_id in entry.ID_TO_ENTRY:
            ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
        else:
            # fall back to the subentry's top-level parent page
            ref_entry = entry.ID_TO_ENTRY[(ref_entry_id[0], 0)]
        expression = ref_entry.get_first_expression()
        elm.attrs["href"] = f"?query={expression}&wildcards=off"
def __add_parent_link(soup, entry):
    """Turn a subentry's 親見出相当部 element into a search link for its
    parent entry's first expression."""
    elm = soup.find("親見出相当部")
    if elm is None:
        return
    expression = entry.get_parent().get_first_expression()
    elm.attrs["href"] = f"?query={expression}&wildcards=off"
    elm.name = "a"
def __add_homophone_links(soup, entry):
    # Homophone arrow icons link (as Yomichan search queries) to the
    # adjacent page IDs.
    # NOTE(review): the link label strings are empty in this copy of the
    # file; originally they were presumably arrow characters — confirm
    # against upstream.
    forward_link = ["", entry.entry_id[0] + 1]
    backward_link = ["", entry.entry_id[0] - 1]
    homophone_info_list = [
        ["svg-logo/homophone1.svg", [forward_link]],
        ["svg-logo/homophone2.svg", [forward_link, backward_link]],
        ["svg-logo/homophone3.svg", [backward_link]],
    ]
    for homophone_info in homophone_info_list:
        filename, link_info = homophone_info
        for elm in soup.find_all("img", attrs={"src": filename}):
            for info in link_info:
                text, link_id = info
                link_entry = entry.ID_TO_ENTRY[(link_id, 0)]
                expression = link_entry.get_first_expression()
                link = BeautifulSoup("<a/>", "xml").a
                link.string = text
                link.attrs["href"] = f"?query={expression}&wildcards=off"
                elm.append(link)
            # replace the icon element with the links it now contains
            elm.unwrap()
def __convert_images_to_text(soup):
    # Replace simple SVG icons with inline text (empty text unwraps the
    # element entirely).
    # NOTE(review): some replacement texts are empty strings in this copy
    # of the file; originally they were presumably marker characters —
    # confirm against upstream. As written, those icons are unwrapped.
    conversions = [
        ["svg-logo/重要語.svg", "", "vertical-align: super; font-size: 0.6em"],
        ["svg-logo/最重要語.svg", "", "vertical-align: super; font-size: 0.6em"],
        ["svg-logo/一般常識語.svg", "☆☆", "vertical-align: super; font-size: 0.6em"],
        ["svg-logo/追い込み.svg", "", ""],
        ["svg-special/区切り線.svg", "|", ""],
    ]
    for conversion in conversions:
        filename, text, style = conversion
        for elm in soup.find_all("img", attrs={"src": filename}):
            if text == "":
                # nothing to show: drop the element but keep its children
                elm.unwrap()
                continue
            if style != "":
                elm.attrs["style"] = style
            elm.attrs["data-name"] = elm.name
            elm.attrs["data-src"] = elm.attrs["src"]
            elm.name = "span"
            elm.string = text
            del elm.attrs["src"]
def __text_parens_to_images(soup, media_dir):
    """Render red parenthesis characters as generated SVG images so they
    display at the proper size in Yomichan structured content."""
    for elm in soup.find_all("red"):
        char = elm.text
        # NOTE(review): the membership list below lost its characters in
        # this copy of the file; originally it presumably listed the two
        # parenthesis characters being converted — confirm upstream.
        if char not in ["", ""]:
            continue
        filename = f"red_{char}.svg"
        path = os.path.join(media_dir, filename)
        Icons.make_red_char(path, char)
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0,
            "width": ratio,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "auto",
            # fixed: point at the SVG generated above (this value was
            # corrupted to a "(unknown)" placeholder)
            "path": f"{os.path.basename(media_dir)}/{filename}",
        }
        elm.attrs["data-name"] = elm.name
        elm.name = "span"
        elm.string = ""
        elm.append(img)
        elm.attrs["style"] = "vertical-align: text-bottom;"
def __replace_icons(soup, media_dir):
    """Replace label-icon <img> elements with generated rectangle SVGs.

    Each known icon source is drawn into *media_dir* via
    __make_rectangle and then referenced as a structured-content image.
    The drawing class selects the Yomichan "appearance": "monochrome"
    icons are masked with the current text color, "auto" icons keep
    their own colors.
    """
    cls_to_appearance = {
        "default": "monochrome",
        "fill": "monochrome",
        "red": "auto",
        "redfill": "auto",
        "none": "monochrome",
    }
    # Entries are [source path, label text, drawing class].
    # NOTE(review): several label strings appear empty in this copy; for
    # class "none" __make_rectangle draws nothing so the text is unused,
    # but confirm the labels against the original source.
    icon_info_list = [
        ["svg-logo/アク.svg", "アク", "default"],
        ["svg-logo/丁寧.svg", "丁寧", "default"],
        ["svg-logo/可能.svg", "可能", "default"],
        ["svg-logo/尊敬.svg", "尊敬", "default"],
        ["svg-logo/接尾.svg", "接尾", "default"],
        ["svg-logo/接頭.svg", "接頭", "default"],
        ["svg-logo/表記.svg", "表記", "default"],
        ["svg-logo/謙譲.svg", "謙譲", "default"],
        ["svg-logo/区別.svg", "区別", "redfill"],
        ["svg-logo/由来.svg", "由来", "redfill"],
        ["svg-logo/人.svg", "", "none"],
        ["svg-logo/他.svg", "", "none"],
        ["svg-logo/動.svg", "", "none"],
        ["svg-logo/名.svg", "", "none"],
        ["svg-logo/句.svg", "", "none"],
        ["svg-logo/派.svg", "", "none"],
        ["svg-logo/自.svg", "", "none"],
        ["svg-logo/連.svg", "", "none"],
        ["svg-logo/造.svg", "", "none"],
        ["svg-logo/造2.svg", "", "none"],
        ["svg-logo/造3.svg", "", "none"],
        ["svg-logo/百科.svg", "", "none"],
    ]
    for icon_info in icon_info_list:
        src, text, cls = icon_info
        for elm in soup.find_all("img", attrs={"src": src}):
            # Build the on-disk path media_dir/<each src segment>.
            path = media_dir
            for part in src.split("/"):
                path = os.path.join(path, part)
            __make_rectangle(path, text, cls)
            ratio = Icons.calculate_ratio(path)
            img = BeautifulSoup("<img/>", "xml").img
            img.attrs = {
                "height": 1.0,
                "width": ratio,
                "sizeUnits": "em",
                "collapsible": False,
                "collapsed": False,
                "background": False,
                "appearance": cls_to_appearance[cls],
                "title": elm.attrs["alt"] if elm.has_attr("alt") else "",
                "path": f"{os.path.basename(media_dir)}/{src}",
            }
            elm.name = "span"
            elm.clear()
            elm.append(img)
            elm.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em;"
def __replace_accent_symbols(soup, media_dir):
    """Replace pitch-accent marker <img> elements with generated SVGs.

    Each marker type has a dedicated writer on Icons; the written file
    is referenced as a structured-content image inside the renamed span.
    """
    accent_info_list = [
        ("svg-accent/平板.svg", Icons.make_heiban),
        ("svg-accent/アクセント.svg", Icons.make_accent),
    ]
    for src, write_svg in accent_info_list:
        # The on-disk path depends only on the source name.
        svg_path = os.path.join(media_dir, *src.split("/"))
        for elm in soup.find_all("img", attrs={"src": src}):
            write_svg(svg_path)
            accent_img = BeautifulSoup("<img/>", "xml").img
            accent_img.attrs = {
                "height": 1.0,
                "width": Icons.calculate_ratio(svg_path),
                "sizeUnits": "em",
                "collapsible": False,
                "collapsed": False,
                "background": False,
                "appearance": "auto",
                "path": f"{os.path.basename(media_dir)}/{src}",
            }
            elm.name = "span"
            elm.clear()
            elm.append(accent_img)
            elm.attrs["style"] = "vertical-align: text-bottom;"
def __convert_gaiji(soup, media_dir):
    """Convert gaiji (external character) <img> tags into inline
    structured-content images served from the media directory.

    Images under "graphics" are illustrations and are handled by
    __convert_graphics instead.
    """
    dir_name = os.path.basename(media_dir)
    for elm in soup.find_all("img"):
        if not elm.has_attr("src"):
            continue
        src = elm.attrs["src"]
        if src.startswith("graphics"):
            continue
        # Resolve the on-disk path, ignoring empty path segments.
        parts = [part for part in src.split("/") if part.strip() != ""]
        path = os.path.join(media_dir, *parts)
        hover_text = elm.attrs["alt"] if elm.has_attr("alt") else ""
        gaiji_img = BeautifulSoup("<img/>", "xml").img
        gaiji_img.attrs = {
            "height": 1.0,
            "width": Icons.calculate_ratio(path),
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": hover_text,
            "path": f"{dir_name}/{src}",
        }
        elm.name = "span"
        elm.clear()
        elm.append(gaiji_img)
        elm.attrs["style"] = "vertical-align: text-bottom;"
def __convert_graphics(soup, media_dir):
    """Rewrite illustration <img> tags (sources under "graphics") as
    collapsible structured-content images."""
    dir_name = os.path.basename(media_dir)
    for elm in soup.find_all("img"):
        if not elm.has_attr("src"):
            continue
        src = elm.attrs["src"]
        if not src.startswith("graphics"):
            continue
        hover_text = elm.attrs["alt"] if elm.has_attr("alt") else ""
        elm.attrs = {
            "collapsible": True,
            "collapsed": True,
            "title": hover_text,
            "path": f"{dir_name}/{src}",
            "src": src,
        }
def __convert_number_icons(soup, media_dir):
    """Replace 大語義番号 (major sense number) elements with generated
    rectangle SVG icons.

    Numbers nested inside a link are drawn blue-filled (appearance
    "auto") to look clickable; all other numbers use the default
    monochrome fill.
    """
    for elm in soup.find_all("大語義番号"):
        if elm.find_parent("a") is None:
            filename = f"{elm.text}-fill.svg"
            appearance = "monochrome"
            path = os.path.join(media_dir, filename)
            __make_rectangle(path, elm.text, "fill")
        else:
            filename = f"{elm.text}-bluefill.svg"
            appearance = "auto"
            path = os.path.join(media_dir, filename)
            __make_rectangle(path, elm.text, "bluefill")
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0,
            "width": ratio,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": appearance,
            "title": elm.text,
            # Fix: reference the SVG file generated above; the previous
            # path did not point at the written file.
            "path": f"{os.path.basename(media_dir)}/{filename}",
        }
        elm.name = "span"
        elm.clear()
        elm.append(img)
        elm.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em;"
def __make_rectangle(path, text, cls):
    """Write a rectangular label SVG to *path* according to *cls*.

    "none" writes nothing (the file is expected to exist already);
    "fill" uses the monochrome filled style; other classes select a
    color triple forwarded to Icons.make_rectangle, falling back to the
    default black-outline palette for unknown classes.
    """
    if cls == "none":
        return
    if cls == "fill":
        Icons.make_monochrome_fill_rectangle(path, text)
        return
    palettes = {
        "red": ("red", "white", "red"),
        "redfill": ("red", "red", "white"),
        "bluefill": ("blue", "blue", "white"),
    }
    colors = palettes.get(cls, ("black", "transparent", "black"))
    Icons.make_rectangle(path, text, *colors)

View file

@ -5,6 +5,7 @@ from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
from bot.yomichan.terms.sankoku8 import Sankoku8Terminator
def new_terminator(target):
@ -14,5 +15,6 @@ def new_terminator(target):
Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
Targets.SMK8: Smk8Terminator,
Targets.DAIJIRIN2: Daijirin2Terminator,
Targets.SANKOKU8: Sankoku8Terminator,
}
return terminator_map[target](target)

View file

@ -0,0 +1,47 @@
from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.sankoku8 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Sankoku8Terminator(Terminator):
    """Yomichan term generator for 三省堂国語辞典 第八版 (sankoku8).

    Implements the target-specific hooks that the base Terminator calls
    when converting parsed dictionary entries into Yomichan term rows.
    """

    def __init__(self, target):
        super().__init__(target)

    def _definition_tags(self, entry):
        # Sankoku8 terms carry no definition tags.
        return ""

    def _inflection_rules(self, entry, expression):
        # Phrase entries have no part-of-speech data, so fall back to
        # Sudachi-derived rules; likewise when an entry lists no POS tags.
        if isinstance(entry, PhraseEntry):
            return sudachi_rules(expression)
        pos_tags = entry.get_part_of_speech_tags()
        if len(pos_tags) == 0:
            return sudachi_rules(expression)
        else:
            return tags_to_rules(expression, pos_tags, self._inflection_categories)

    def _glossary(self, entry):
        # Cache per entry ID: the same entry may be emitted once per
        # expression/reading pair, and glossary generation is expensive.
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = make_glossary(entry, self._image_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _sequence(self, entry):
        # entry_id is a (page id, subentry index) pair; fold it into a
        # single stable integer for Yomichan's sequence field.
        return entry.entry_id[0] * 100000 + entry.entry_id[1]

    def _term_tags(self, entry):
        # No term-level tags for this dictionary.
        return ""

    def _link_glossary_parameters(self, entry):
        # NOTE(review): the link labels here appear to be empty strings,
        # while the yomichan export config defines 子/句 tags for
        # subentries — confirm the intended label text.
        return [
            [entry.children, ""],
            [entry.phrases, ""]
        ]

    def _subentry_lists(self, entry):
        # Subentries to be exported as their own terms: child entries
        # and phrase entries.
        return [
            entry.children,
            entry.phrases,
        ]

File diff suppressed because it is too large Load diff

611
data/mdict/css/sankoku8.css Normal file
View file

@ -0,0 +1,611 @@
@font-face {
font-family: jpgothic;
src: local("Noto Sans CJK JP"), local("IPAexGothic"), local("Source Han Sans JP");
}
@font-face {
font-family: jpmincho;
src: local("Noto Serif CJK JP"), local("IPAexMincho"), local("IPAmjMincho"), local("Source Han Serif JP"), local("HanaMinA"), local("HanaMinB");
}
@font-face {
font-family: jpkyokasho;
src: local("A-OTF Kyoukasho ICA Pro R"), local("DFKyoKaSho-W4");
}
body {
margin: 0em 1em;
line-height: 1.5em;
font-family: jpmincho, serif;
font-size: 1.2em;
}
span[data-name="entry-index"] > a {
display: none;
}
span[data-name="項目"] {
display: block;
/*max-width: 39em;*/
}
span[data-name="見出部"] {
display: block;
}
span[data-name="見出仮名"] {
font-family: jpgothic, sans-serif;
font-weight: bold;
}
span[data-name="見出仮名"].アンチック {
font-family: jpmincho, serif;
}
span[data-name="表記G"] {
margin-left: 0.25em;
}
span[data-name="専門G"] {
margin-right: 0.25em;
}
span[data-name="常用漢字"] {
font-family: jpmincho, serif;
}
span[data-name="教育漢字"] {
font-family: jpkyokasho, jpmincho, serif;
color: green;
}
span[data-name="解説部"],
span[data-name="子解説部"],
span[data-name="句解説部"] {
display: block;
margin-left: 1em;
}
span[data-name="大語義"] {
display: block;
}
span[data-name="大語義"] + span[data-name="大語義"] {
margin-top: 0.5em;
}
span[data-name="大語義番号"] {
margin-right: 0.25em;
padding: 0.1em;
font-family: jpgothic, sans-serif;
font-size: 0.8em;
font-weight: bold;
color: white;
background-color: gray;
border-radius: 0.2em;
}
a span[data-name="大語義番号"] {
background-color: blue;
text-decoration-color: blue;
}
span[data-name="語義番号"] {
margin-right: 0.25em;
}
span[data-name="参照語義番号"] {
margin-left: 0.1em;
}
span[data-name="参照語義番号"]>span[data-name="語義番号"] {
margin-right: 0.1em;
}
span[data-name="参照語義番号"]:first-child {
margin-left: 0em;
}
span[data-name="語義"] {
display: block;
}
span[data-name="副義"] {
display: block;
margin-left: 1.0em;
}
span[data-name="注記語義"] {
margin-left: 0.5em;
}
span[data-name="語釈"] {
}
span[data-name="用例G"] {
display: block;
/*margin-left: 1.25em;*/
}
span[data-name="百科"] span[data-name="用例G"] {
display: inline;
}
span[data-name="注記"] span[data-name="用例G"] {
display: inline;
}
span[data-name="用例"] {
}
span[data-name="見出相当部"] {
margin-left: 0.125em;
margin-right: 0.125em;
}
span[data-name="ルビG"] {
font-size: 0.7em;
font-weight: normal;
vertical-align: 0.5em;
-webkit-user-select: none;
}
span[data-name="名詞形G"],
span[data-name="動詞形G"],
span[data-name="自動詞形G"],
span[data-name="他動詞形G"],
span[data-name="可能形G"],
span[data-name="人G"],
span[data-name="名詞人形G"] {
display: block;
}
span[data-name="語義"] span[data-name="名詞形G"],
span[data-name="語義"] span[data-name="動詞形G"],
span[data-name="語義"] span[data-name="自動詞形G"],
span[data-name="語義"] span[data-name="他動詞形G"],
span[data-name="語義"] span[data-name="可能形G"],
span[data-name="語義"] span[data-name="人G"] {
display: inline;
}
span[data-name="副義"] span[data-name="名詞形G"],
span[data-name="副義"] span[data-name="動詞形G"],
span[data-name="副義"] span[data-name="自動詞形G"],
span[data-name="副義"] span[data-name="他動詞形G"],
span[data-name="副義"] span[data-name="可能形G"],
span[data-name="副義"] span[data-name="人G"] {
display: inline;
}
span[data-name="注記"] span[data-name="名詞形G"],
span[data-name="注記"] span[data-name="動詞形G"],
span[data-name="注記"] span[data-name="自動詞形G"],
span[data-name="注記"] span[data-name="他動詞形G"],
span[data-name="注記"] span[data-name="可能形G"],
span[data-name="注記"] span[data-name="人G"] {
display: inline;
}
span[data-name="共通"] span[data-name="名詞形G"],
span[data-name="共通"] span[data-name="動詞形G"],
span[data-name="共通"] span[data-name="自動詞形G"],
span[data-name="共通"] span[data-name="他動詞形G"],
span[data-name="共通"] span[data-name="可能形G"],
span[data-name="共通"] span[data-name="人G"] {
display: inline;
}
span[data-name="名詞形G"] span[data-name="用例G"],
span[data-name="動詞形G"] span[data-name="用例G"],
span[data-name="自動詞形G"] span[data-name="用例G"],
span[data-name="他動詞形G"] span[data-name="用例G"],
span[data-name="可能形G"] span[data-name="用例G"],
span[data-name="人G"] span[data-name="用例G"] {
display: inline;
}
span[data-name="参照G"] {
display: inline;
}
span[data-name="参照矢印"] {
margin-right: 0.25em;
}
span[data-name="参照"] {
}
span[data-name="子項目"] {
display: block;
margin-top: 0.5em;
}
span[data-name="子見出部"] {
display: block;
/* text-indent: -1em; */
}
span[data-name="子見出仮名"] {
font-family: jpgothic, sans-serif;
font-weight: bold;
}
span[data-name="親見出省略"] {
font-family: jpgothic, sans-serif;
}
span[data-name="句項目"] {
display: block;
margin-top: 0.5em;
}
span[data-name="句見出部"] {
display: block;
/* text-indent: -1em; */
}
span[data-name="句表記"] {
font-family: jpgothic, sans-serif;
font-weight: bold;
}
span[data-name="対義語G"] {
}
span[data-name="派生語G"] {
display: block;
}
span[data-name="謙譲形G"] {
display: block;
}
span[data-name="共通"] {
display: block;
}
span[data-name="共通ロゴ"] {
color: red;
margin-right: 0.25em;
}
span[data-name="rank1"] a,
span[data-name="rank2"] a,
span[data-name="rank3"] a,
span[data-name="表外字マーク"] a,
span[data-name="表外音訓マーク"] a,
span[data-name="省略形"] a,
span[data-name="熟字訓"] a,
span[data-name="原籍"] a,
span[data-name="品詞"] a,
span[data-name="専門"] a,
span[data-name="使用域"] a,
span[data-name="rect"] a {
color: black;
border-top-style: none;
}
span[data-name="共通ロゴ"] a {
color: red;
border-top-style: none;
}
span[data-name="rect"].red a {
color: red;
}
a {
text-decoration: none;
padding-top: 0.04em;
/* border-top: solid 1px blue; */
}
a.appendix {
color: black;
text-decoration: none;
border-top-style: none;
}
a.black {
color: black;
}
span[data-name="カット"] {
display: block;
width: 75%;
margin-top: 1em;
margin-left: auto;
margin-right: auto;
}
span[data-name="カット"] img {
max-height: 200px;
max-width: 600px;
}
span[data-name="イタリック"] {
font-style: italic;
}
span[data-name="ボールド"] {
font-weight: bold;
}
span[data-name="色版"] {
color: red;
}
/* 独自定義 */
span[data-name="rect"] {
margin-left: 0.25em;
margin-right: 0.25em;
padding: 0.1em;
font-size: 0.8em;
border-width: 0.04em;
border-style: solid;
border-color: black;
word-break: keep-all;
border-radius: 0.1em;
}
span[data-name="rect"].fill {
color: white;
border-style: none;
background-color: gray;
}
span[data-name="rect"].red {
color: red;
border-color: red;
}
span[data-name="red"] {
color: red;
}
span[data-name="glyph"] {
font-family: jpmincho, serif;
}
span[data-name="gaiji"] {
width: 1em;
}
span[data-name="frac"] {
width: 2em;
}
img.logo {
display: gaiji;
margin-right: 0.25em;
height: 1em;
text-combine-horizontal: all;
}
.logo-red {
height: 1em;
color: red;
}
span[data-name="平板"] .logo-red {
margin-left: 0.1em;
}
img.区切り線 {
display: gaiji;
height: 1em;
padding: 0 0.3em 0 0.05em;
color: gray;
text-combine-horizontal: all;
}
/* , , and ☆☆ symbols */
span[data-src^="svg-logo"].rank {
font-size: 0.65em;
vertical-align: super;
}
img.gaiji {
display: gaiji;
height: 1em;
text-combine-horizontal: all;
}
img.svg {
zoom: 250%;
}
span[data-name="表外字マーク"] {
font-size: 0.5em;
vertical-align: 1em;
-webkit-user-select: none;
}
span[data-name="表外音訓マーク"] {
font-size: 0.5em;
vertical-align: 1em;
-webkit-user-select: none;
}
span[data-name="表外字ロゴ"],
span[data-name="表外音訓ロゴ"] {
margin: 0em 0.5em;
font-size: 0.5em;
}
span[data-name="アクセント"] {
}
span[data-name="アクセント表記"] {
font-family: jpgothic, sans-serif;
font-weight: bold;
font-size: 0.85em;
}
span[data-name="横"] {
text-combine-horizontal: all 1;
}
span[data-name="縦中横"] {
text-combine-horizontal: all;
}
span[data-name="分子"],
span[data-name="分母"] {
text-combine-horizontal: all;
}
span[data-name="英"],
span[data-name="回転"] {
writing-mode: horizontal-tb;
}
span[data-name="i"] {
font-family: "Times New Roman";
font-style: italic;
}
span[data-name="横"] span[data-name="sub"] {
font-size: 0.7em;
vertical-align: 0.35em;
}
span[data-name="kanbun"] {
font-size: 0.5em;
vertical-align: -1em;
}
span[data-name="歴史仮名"] {
font-size: 0.7em;
vertical-align: 0.5em;
-webkit-user-select: none;
}
span[data-name="品詞G"] {
/* margin-left: 0.25em; */
}
span[data-name="ロゴ"] {
margin-right: 0.25em;
}
span[data-name="割書"] {
/*display: warichu;*/
/*font-size: 0.5em;*/
}
span[data-name="尊敬形G"],
span[data-name="謙譲形G"],
span[data-name="丁寧形G"] {
display: block;
}
span[data-name="百科"],
span[data-name="由来"],
span[data-name="区別"],
span[data-name="アクセント注記"] {
display: block;
}
span[data-name="表記情報"] {
display: block;
}
span[data-name="別見出"] {
font-family: jpmincho, serif;
font-weight: bold;
}
span[data-name="読み"] {
font-size: 0.7em;
}
span[data-name="歴史仮名"]:before,
span[data-name="ルビ"]:before {
content: "(";
}
span[data-name="歴史仮名"]:after,
span[data-name="ルビ"]:after {
content: ")";
}
div[data-child-links] {
padding-left: 1em;
}
div[data-child-links] ul {
margin: 0;
padding-left: 2em;
}
div[data-child-links] span {
padding: 0.1em;
font-family: jpgothic, sans-serif;
font-size: 0.8em;
color: white;
border-width: 0.05em;
border-style: none;
border-color: black;
border-radius: 0.2em;
word-break: keep-all;
}
div[data-child-links="子項目"] span {
background-color: rgb(153, 42, 103);
}
div[data-child-links="句項目"] span {
background-color: rgb(176, 127, 57);
}
/* Replacements for vertical SVG icons */
span[data-src="svg-logo/区別.svg"],
span[data-src="svg-logo/由来.svg"] {
font-family: jpgothic, sans-serif;
margin-left: 0.25em;
margin-right: 0.25em;
padding: 0em 0.1em 0.1em 0.1em;
font-size: 0.8em;
word-break: keep-all;
border-radius: 0.2em;
border-style: none;
color: white;
background-color: red;
}
span[data-src="svg-logo/アク.svg"],
span[data-src="svg-logo/丁寧.svg"],
span[data-src="svg-logo/可能.svg"],
span[data-src="svg-logo/尊敬.svg"],
span[data-src="svg-logo/表記.svg"],
span[data-src="svg-logo/謙譲.svg"],
span[data-src="svg-logo/接尾.svg"],
span[data-src="svg-logo/接頭.svg"] {
margin-left: 0.25em;
margin-right: 0.25em;
padding: 0em 0.1em 0.1em 0.1em;
font-size: 0.8em;
word-break: keep-all;
border-width: 0.1em;
border-style: solid;
border-radius: 0.2em;
}
span[data-src="svg-logo/アク.svg"],
span[data-src="svg-logo/丁寧.svg"],
span[data-src="svg-logo/可能.svg"],
span[data-src="svg-logo/尊敬.svg"],
span[data-src="svg-logo/表記.svg"],
span[data-src="svg-logo/謙譲.svg"] {
font-family: jpgothic, sans-serif;
}
span[data-src="svg-logo/接尾.svg"],
span[data-src="svg-logo/接頭.svg"] {
font-family: jpmincho, serif;
}

View file

@ -0,0 +1,7 @@
三省堂国語辞典 第八版
<br><br>
https://www.monokakido.jp/ja/dictionaries/sankoku8/index.html
<br><br>
{{revision}}
<br><br>
{{attribution}}

View file

@ -0,0 +1,22 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"small": {},
"sup": {},
"sub": {},
"表外字": {
"name": "ruby"
},
"表外字マーク": {
"name": "rt"
},
"表外音訓": {
"name": "ruby"
},
"表外音訓マーク": {
"name": "rt"
}
}

View file

@ -0,0 +1 @@
三省堂国語辞典 第八版

View file

@ -0,0 +1,474 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"structuredContent": {
"oneOf": [
{
"type": "string",
"description": "Represents a text node."
},
{
"type": "array",
"items": {
"$ref": "#/definitions/structuredContent",
"description": "An array of child content."
}
},
{
"type": "object",
"oneOf": [
{
"type": "object",
"description": "Empty tags.",
"required": [
"tag"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"const": "br"
},
"data": {
"$ref": "#/definitions/structuredContentData"
}
}
},
{
"type": "object",
"description": "Generic container tags.",
"required": [
"tag"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"enum": ["ruby", "rt", "rp", "table", "thead", "tbody", "tfoot", "tr"]
},
"content": {
"$ref": "#/definitions/structuredContent"
},
"data": {
"$ref": "#/definitions/structuredContentData"
},
"lang": {
"type": "string",
"description": "Defines the language of an element in the format defined by RFC 5646."
}
}
},
{
"type": "object",
"description": "Table tags.",
"required": [
"tag"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"enum": ["td", "th"]
},
"content": {
"$ref": "#/definitions/structuredContent"
},
"data": {
"$ref": "#/definitions/structuredContentData"
},
"colSpan": {
"type": "integer",
"minimum": 1
},
"rowSpan": {
"type": "integer",
"minimum": 1
},
"style": {
"$ref": "#/definitions/structuredContentStyle"
},
"lang": {
"type": "string",
"description": "Defines the language of an element in the format defined by RFC 5646."
}
}
},
{
"type": "object",
"description": "Container tags supporting configurable styles.",
"required": [
"tag"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"enum": ["span", "div", "ol", "ul", "li"]
},
"content": {
"$ref": "#/definitions/structuredContent"
},
"data": {
"$ref": "#/definitions/structuredContentData"
},
"style": {
"$ref": "#/definitions/structuredContentStyle"
},
"lang": {
"type": "string",
"description": "Defines the language of an element in the format defined by RFC 5646."
}
}
},
{
"type": "object",
"description": "Image tag.",
"required": [
"tag",
"path"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"const": "img"
},
"data": {
"$ref": "#/definitions/structuredContentData"
},
"path": {
"type": "string",
"description": "Path to the image file in the archive."
},
"width": {
"type": "number",
"description": "Preferred width of the image.",
"minimum": 0
},
"height": {
"type": "number",
"description": "Preferred width of the image.",
"minimum": 0
},
"title": {
"type": "string",
"description": "Hover text for the image."
},
"pixelated": {
"type": "boolean",
"description": "Whether or not the image should appear pixelated at sizes larger than the image's native resolution.",
"default": false
},
"imageRendering": {
"type": "string",
"description": "Controls how the image is rendered. The value of this field supersedes the pixelated field.",
"enum": ["auto", "pixelated", "crisp-edges"],
"default": "auto"
},
"appearance": {
"type": "string",
"description": "Controls the appearance of the image. The \"monochrome\" value will mask the opaque parts of the image using the current text color.",
"enum": ["auto", "monochrome"],
"default": "auto"
},
"background": {
"type": "boolean",
"description": "Whether or not a background color is displayed behind the image.",
"default": true
},
"collapsed": {
"type": "boolean",
"description": "Whether or not the image is collapsed by default.",
"default": false
},
"collapsible": {
"type": "boolean",
"description": "Whether or not the image can be collapsed.",
"default": false
},
"verticalAlign": {
"type": "string",
"description": "The vertical alignment of the image.",
"enum": ["baseline", "sub", "super", "text-top", "text-bottom", "middle", "top", "bottom"]
},
"sizeUnits": {
"type": "string",
"description": "The units for the width and height.",
"enum": ["px", "em"]
}
}
},
{
"type": "object",
"description": "Link tag.",
"required": [
"tag",
"href"
],
"additionalProperties": false,
"properties": {
"tag": {
"type": "string",
"const": "a"
},
"content": {
"$ref": "#/definitions/structuredContent"
},
"href": {
"type": "string",
"description": "The URL for the link. URLs starting with a ? are treated as internal links to other dictionary content.",
"pattern": "^(?:https?:|\\?)[\\w\\W]*"
},
"lang": {
"type": "string",
"description": "Defines the language of an element in the format defined by RFC 5646."
}
}
}
]
}
]
},
"structuredContentData": {
"type": "object",
"description": "Generic data attributes that should be added to the element.",
"additionalProperties": {
"type": "string"
}
},
"structuredContentStyle": {
"type": "object",
"additionalProperties": false,
"properties": {
"fontStyle": {
"type": "string",
"enum": ["normal", "italic"],
"default": "normal"
},
"fontWeight": {
"type": "string",
"enum": ["normal", "bold"],
"default": "normal"
},
"fontSize": {
"type": "string",
"default": "medium"
},
"textDecorationLine": {
"oneOf": [
{
"type": "string",
"enum": ["none", "underline", "overline", "line-through"],
"default": "none"
},
{
"type": "array",
"items": {
"type": "string",
"enum": ["underline", "overline", "line-through"],
"default": "none"
}
}
]
},
"verticalAlign": {
"type": "string",
"enum": ["baseline", "sub", "super", "text-top", "text-bottom", "middle", "top", "bottom"],
"default": "baseline"
},
"textAlign": {
"type": "string",
"enum": ["start", "end", "left", "right", "center", "justify", "justify-all", "match-parent"],
"default": "start"
},
"marginTop": {
"type": "number",
"default": 0
},
"marginLeft": {
"type": "number",
"default": 0
},
"marginRight": {
"type": "number",
"default": 0
},
"marginBottom": {
"type": "number",
"default": 0
},
"listStyleType": {
"type": "string",
"default": "disc"
}
}
}
},
"type": "array",
"description": "Data file containing term information.",
"items": {
"type": "array",
"description": "Information about a single term.",
"minItems": 8,
"additionalItems": false,
"items": [
{
"type": "string",
"description": "The text for the term."
},
{
"type": "string",
"description": "Reading of the term, or an empty string if the reading is the same as the term."
},
{
"type": ["string", "null"],
"description": "String of space-separated tags for the definition. An empty string is treated as no tags."
},
{
"type": "string",
"description": "String of space-separated rule identifiers for the definition which is used to validate delinflection. Valid rule identifiers are: v1: ichidan verb; v5: godan verb; vs: suru verb; vk: kuru verb; adj-i: i-adjective. An empty string corresponds to words which aren't inflected, such as nouns."
},
{
"type": "number",
"description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results."
},
{
"type": "array",
"description": "Array of definitions for the term.",
"items": {
"oneOf": [
{
"type": "string",
"description": "Single definition for the term."
},
{
"type": "object",
"description": "Single detailed definition for the term.",
"required": [
"type"
],
"properties": {
"type": {
"type": "string",
"description": "The type of the data for this definition.",
"enum": ["text", "image", "structured-content"]
}
},
"oneOf": [
{
"required": [
"type",
"text"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "text"
},
"text": {
"type": "string",
"description": "Single definition for the term."
}
}
},
{
"required": [
"type",
"content"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "structured-content"
},
"content": {
"$ref": "#/definitions/structuredContent",
"description": "Single definition for the term using a structured content object."
}
}
},
{
"required": [
"type",
"path"
],
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"const": "image"
},
"path": {
"type": "string",
"description": "Path to the image file in the archive."
},
"width": {
"type": "integer",
"description": "Preferred width of the image.",
"minimum": 1
},
"height": {
"type": "integer",
"description": "Preferred width of the image.",
"minimum": 1
},
"title": {
"type": "string",
"description": "Hover text for the image."
},
"description": {
"type": "string",
"description": "Description of the image."
},
"pixelated": {
"type": "boolean",
"description": "Whether or not the image should appear pixelated at sizes larger than the image's native resolution.",
"default": false
},
"imageRendering": {
"type": "string",
"description": "Controls how the image is rendered. The value of this field supersedes the pixelated field.",
"enum": ["auto", "pixelated", "crisp-edges"],
"default": "auto"
},
"appearance": {
"type": "string",
"description": "Controls the appearance of the image. The \"monochrome\" value will mask the opaque parts of the image using the current text color.",
"enum": ["auto", "monochrome"],
"default": "auto"
},
"background": {
"type": "boolean",
"description": "Whether or not a background color is displayed behind the image.",
"default": true
},
"collapsed": {
"type": "boolean",
"description": "Whether or not the image is collapsed by default.",
"default": false
},
"collapsible": {
"type": "boolean",
"description": "Whether or not the image can be collapsed.",
"default": true
}
}
}
]
}
]
}
},
{
"type": "integer",
"description": "Sequence number for the term. Terms with the same sequence number can be shown together when the \"resultOutputMode\" option is set to \"merge\"."
},
{
"type": "string",
"description": "String of space-separated tags for the term. An empty string is treated as no tags."
}
]
}
}

View file

@ -56,5 +56,16 @@
["子", "name", 0, "子項目", 0],
["句", "expression", 0, "句項目", 0]
]
},
"sankoku8": {
"index": {
"title": "三省堂国語辞典 第八版",
"sequenced": true,
"format": 3
},
"tags": [
["子", "name", 0, "子項目", 0],
["句", "expression", 0, "句項目", 0]
]
}
}

View file

@ -25,5 +25,13 @@
"keiyoushi": ["形"],
"kahen": ["カ変"],
"sudachi": ["助動", "接尾", "枕詞", "連体", "連語"]
},
"sankoku8": {
"sahen": ["サ", "サ型"],
"godan": ["上二", "下二", "下二型", "四","四型", "五", "五型", "特殊型", "マス","マス型"],
"ichidan": ["上一", "下一", "下一型"],
"keiyoushi": ["形", "形型"],
"kahen": ["カ"],
"sudachi": []
}
}

View file

@ -0,0 +1,495 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"ruby": {},
"rt": {},
"small": {
"name": "span",
"style": "vertical-align: super; font-size: 0.65em; font-weight: normal; margin-right: 0.25em;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"sup": {
"name": "span",
"style": "vertical-align: super; font-size: 0.65em;"
},
"sub": {
"name": "span",
"style": "vertical-align: sub; font-size: 0.65em;"
},
"表外字": {
"name": "ruby"
},
"表外字マーク": {
"name": "rt"
},
"表外音訓": {
"name": "ruby"
},
"表外音訓マーク": {
"name": "rt"
},
"語構成": {
"name": "span",
"style": "margin-right: 0.5em;"
},
"分書": {
"name": "span",
"style": "margin-right: 0.5em;"
},
"見出仮名": {
"name": "span",
"style": "font-weight: bold;"
},
"解説部": {
"name": "div"
},
"子解説部": {
"name": "div"
},
"句解説部": {
"name": "div"
},
"大語義": {
"name": "div"
},
"語義": {
"name": "div"
},
"副義": {
"name": "div",
"style": "margin-left: 1.0em;"
},
"注記語義": {
"name": "span",
"style": "margin-left: 0.5em;"
},
"用例G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "百科",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "名詞形G",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "動詞形G",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "自動詞形G",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "他動詞形G",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "可能形G",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "人G",
"key": "name",
"value": "span"
}
}
]
},
"見出相当部": {
"name": "span",
"style": "margin-left: 0.125em; margin-right: 0.125em;"
},
"ルビG": {
"name": "span",
"style": "vertical-align: super; font-size: 0.65em; font-weight: normal;"
},
"名詞人形G": {
"name": "div"
},
"名詞形G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"動詞形G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"自動詞形G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"他動詞形G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"可能形G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"人G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "語義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "副義",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "注記",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "共通",
"key": "name",
"value": "span"
}
}
]
},
"参照矢印": {
"name": "span",
"style": "margin-right: 0.25em;"
},
"子見出仮名": {
"name": "span",
"style": "font-weight: bold;"
},
"句表記": {
"name": "span",
"style": "font-weight: bold;"
},
"派生語G": {
"name": "div"
},
"謙譲形G": {
"name": "div"
},
"共通": {
"name": "div"
},
"イタリック": {
"name": "span",
"style": "font-style: italic;"
},
"ボールド": {
"name": "span",
"style": "font-weight: bold;"
},
"アクセント表記": {
"name": "span",
"style": "font-weight: bold; font-size: 0.85em;"
},
"i": {
"name": "span",
"style": "font-style: italic;"
},
"sub": {
"name": "span",
"style": "font-size: 0.7em; vertical-align: sub;",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "横",
"key": "style",
"value": "font-size: 0.7em; vertical-align: super;"
}
}
]
},
"kanbun": {
"name": "span",
"style": "font-size: 0.5em; vertical-align: sub;"
},
"ロゴ": {
"name": "span",
"style": "margin-right: 0.25em"
},
"尊敬形G": {
"name": "div"
},
"謙譲形G": {
"name": "div"
},
"丁寧形G": {
"name": "div"
},
"百科": {
"name": "div"
},
"由来": {
"name": "div"
},
"区別": {
"name": "div"
},
"アクセント注記": {
"name": "div"
},
"表記情報": {
"name": "div"
},
"別見出": {
"name": "span",
"style": "font-weight: bold;"
},
"読み": {
"name": "span",
"style": "font-size: 0.7em;"
},
"歴史仮名": {
"name": "span",
"style": "vertical-align: super; font-size: 0.65em; font-weight: normal;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"ルビ": {
"name": "span",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
}
}

View file

@ -68,14 +68,19 @@ def parse_args(target_names):
help="path to icon file to be used with MDict",
type=filename,
)
parser.add_argument(
"--no-mdict-export",
help="skip export of dictionary data to MDict format",
action='store_true',
)
parser.add_argument(
"--no-yomichan-export",
help="skip export of dictionary data to Yomichan format",
action='store_true',
)
parser.add_argument(
"--no-mdict-export",
help="skip export of dictionary data to MDict format",
"--validate-yomichan-terms",
help="validate JSON structure of exported Yomichan dictionary terms",
action='store_true',
)
args = parser.parse_args()
@ -108,9 +113,11 @@ def main():
crawler.collect_pages(args.page_dir)
crawler.read_pages()
if not args.no_yomichan_export:
crawler.make_yomichan_dictionary(args.media_dir)
crawler.make_yomichan_dictionary(
args.media_dir, args.validate_yomichan_terms)
if not args.no_mdict_export:
crawler.make_mdict_dictionary(args.media_dir, args.mdict_icon)
crawler.make_mdict_dictionary(
args.media_dir, args.mdict_icon)
if __name__ == "__main__":

View file

@ -1,15 +1,20 @@
attrs==23.1.0
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2022.12.7
charset-normalizer==3.1.0
css-parser==1.0.8
fastjsonschema==2.17.1
html5lib==1.1
idna==3.4
jsonschema-specifications==2023.6.1
lxml==4.9.2
mdict-utils==1.3.12
Pillow==9.5.0
platformdirs==3.5.0
referencing==0.29.1
requests==2.29.0
rpds-py==0.8.10
six==1.16.0
soupsieve==2.4.1
SudachiDict-full==20230110

9
run_all.sh Normal file → Executable file
View file

@ -1,3 +1,7 @@
#!/bin/sh
python -m unittest discover -s tests
python jitenbot.py jitenon-kokugo
python jitenbot.py jitenon-yoji
python jitenbot.py jitenon-kotowaza
@ -11,3 +15,8 @@ python jitenbot.py daijirin2 \
--media-dir monokakido/DAIJIRIN2/media \
--page-dir monokakido/DAIJIRIN2/pages \
--mdict-icon monokakido/DAIJIRIN2/DAIJIRIN2-76@3x.png
python jitenbot.py sankoku8 \
--media-dir monokakido/SANKOKU8/media \
--page-dir monokakido/SANKOKU8/pages \
--mdict-icon monokakido/SANKOKU8/SANKOKU8-76@3x.png

View file

@ -0,0 +1,47 @@
import unittest
from bot.entries.sankoku8 import parse_hyouki_pattern
from bs4 import BeautifulSoup
class TestSankokuPhrases(unittest.TestCase):
    """Tests for expanding Sankoku 8 headword (表記) patterns.

    Patterns may contain optional segments ``(...)``, alternative
    segments ``〈a/b〉``, and kana/kanji variant segments ``{a・b}``;
    ``parse_hyouki_pattern`` must return every resulting surface form.
    """

    def _check_expansions(self, pattern, expected_forms):
        # Same contract as asserting the count and membership of each
        # expected form individually: the result must contain exactly
        # the expected forms, no more and no fewer.
        expansions = parse_hyouki_pattern(pattern)
        self.assertEqual(len(expansions), len(expected_forms))
        for form in expected_forms:
            self.assertIn(form, expansions)

    def test_sankoku_phrases1(self):
        # A single optional segment doubles the expansion count.
        self._check_expansions(
            '耳にたこ(ができる)',
            ["耳にたこ", "耳にたこができる"],
        )

    def test_sankoku_phrases2(self):
        # Two independent alternative segments: 2 x 2 combinations.
        self._check_expansions(
            '一斑を〈見て/もって〉全豹を〈卜す/推す〉',
            [
                "一斑を見て全豹を卜す",
                "一斑を見て全豹を推す",
                "一斑をもって全豹を卜す",
                "一斑をもって全豹を推す",
            ],
        )

    def test_sankoku_phrases3(self):
        # Kana/kanji variant segment yields one form per variant.
        self._check_expansions(
            '{かじ・舵}を切る',
            ["かじを切る", "舵を切る"],
        )

    def test_sankoku_phrases4(self):
        # Optional segment containing a nested variant, combined with
        # a two-way alternative: (1 + 2) x 2 = 6 forms.
        self._check_expansions(
            '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉',
            [
                "重箱の隅をつつく",
                "重箱の隅をようじでつつく",
                "重箱の隅を楊枝でつつく",
                "重箱の隅をほじくる",
                "重箱の隅をようじでほじくる",
                "重箱の隅を楊枝でほじくる",
            ],
        )

    def test_sankoku_phrases5(self):
        # Variant segment nested inside one branch of an alternative:
        # 2 variants + 1 plain branch = 3 forms.
        self._check_expansions(
            '群盲象を〈{な・撫}でる/評する〉',
            ["群盲象をなでる", "群盲象を撫でる", "群盲象を評する"],
        )