diff --git a/bot/crawlers.py b/bot/crawlers.py index e520f76..7ba495c 100644 --- a/bot/crawlers.py +++ b/bot/crawlers.py @@ -4,11 +4,13 @@ from bs4 import BeautifulSoup import bot.scraper as Scraper +from bot.entries.jitenon import JitenonKokugoEntry from bot.entries.jitenon import JitenonKotowazaEntry from bot.entries.jitenon import JitenonYojiEntry from bot.entries.smk8 import Smk8Entry from bot.entries.daijirin2 import Daijirin2Entry +from bot.yomichan.export import JitenonKokugoExporter from bot.yomichan.export import JitenonKotowazaExporter from bot.yomichan.export import JitenonYojiExporter from bot.yomichan.export import Smk8Exporter @@ -48,6 +50,41 @@ class _Crawler(): return page_id +class JitenonKokugoCrawler(_Crawler): + def __init__(self, args): + super().__init__(args) + self._entry_class = JitenonKokugoEntry + self._yomi_exporter = JitenonKokugoExporter(args.target) + self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php" + self._page_id_pattern = r"word/p([0-9]+)$" + + def collect_pages(self): + jitenon = Scraper.Jitenon() + gojuon_doc, _ = jitenon.scrape(self._gojuon_url) + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") + for gojuon_a in gojuon_soup.select(".kana_area a", href=True): + gojuon_href = gojuon_a['href'] + max_kana_page = 1 + current_kana_page = 1 + while current_kana_page <= max_kana_page: + kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}") + current_kana_page += 1 + kana_soup = BeautifulSoup(kana_doc, features="html.parser") + page_total = kana_soup.find(class_="page_total").text + m = re.search(r"全([0-9]+)件", page_total) + if m: + max_kana_page = int(m.group(1)) + for kana_a in kana_soup.select(".word_box a", href=True): + page_link = kana_a['href'] + page_id = self._parse_page_id(page_link) + if page_id is None: + continue + _, page_path = jitenon.scrape(page_link) + self._page_map[page_id] = page_path + pages_len = len(self._page_map) + print(f"Finished scraping {pages_len} pages") + + class _JitenonCrawler(_Crawler): def __init__(self, args): super().__init__(args) diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index afff5b7..e1e17b4 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -7,8 +7,14 @@ import bot.expressions as Expressions class _JitenonEntry(Entry): + ID_TO_ENTRY = {} + def __init__(self, entry_id): super().__init__(entry_id) + if entry_id not in self.ID_TO_ENTRY: + self.ID_TO_ENTRY[entry_id] = self + else: + raise Exception(f"Duplicate entry ID: {entry_id}") self.modified_date = date(1970, 1, 1) self.attribution = "" for column in self._COLUMNS.values(): @@ -44,9 +50,9 @@ class _JitenonEntry(Entry): def _set_headwords(self): headwords = {} - for yomikata in self.__yomikatas(): + for yomikata in self._yomikatas(): headwords[yomikata] = [self.expression] - ikei_headwords = self.__ikei_headwords() + ikei_headwords = self._ikei_headwords() for reading, expressions in ikei_headwords.items(): if reading not in headwords: headwords[reading] = [] @@ -73,7 +79,7 @@ class _JitenonEntry(Entry): else: attr_value.append(colval) - def __yomikatas(self): + def _yomikatas(self): yomikata = self.yomikata m = re.search(r"^[ぁ-ヿ、]+$", yomikata) if m: @@ -94,7 +100,7 @@ class _JitenonEntry(Entry): print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") return [""] - def __ikei_headwords(self): + def _ikei_headwords(self): ikei_headwords = {} for val in self.ikei: m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val) @@ -174,3 +180,39 @@ class JitenonKotowazaEntry(_JitenonEntry): for expressions in self._headwords.values(): Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_fullwidth(expressions) + + +class JitenonKokugoEntry(_JitenonEntry): + _COLUMNS = { + "言葉": ["expression", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "例文": ["reibun", ""], + "別表記": ["betsuhyouki", ""], + "対義語": ["taigigo", ""], + "活用": ["katsuyou", ""], + "用例": ["yourei", ""], + "類語": ["ruigo", ""], + } + + def __init__(self, entry_id): + super().__init__(entry_id) + + def _set_headwords(self): + headwords = {} + for reading in self.yomikata.split("・"): + if reading not in headwords: + headwords[reading] = [] + for expression in self.expression.split("・"): + headwords[reading].append(expression) + if self.betsuhyouki.strip() != "": + for expression in self.betsuhyouki.split("・"): + headwords[reading].append(expression) + self._headwords = headwords + + def _set_variant_headwords(self): + for expressions in self._headwords.values(): + Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_fullwidth(expressions) + Expressions.remove_iteration_mark(expressions) + Expressions.add_iteration_mark(expressions) diff --git a/bot/yomichan/export.py b/bot/yomichan/export.py index 15b61c5..a2acf81 100644 --- a/bot/yomichan/export.py +++ b/bot/yomichan/export.py @@ -7,6 +7,7 @@ from platformdirs import user_documents_dir, user_cache_dir from bot.data import load_yomichan_metadata +from bot.yomichan.terms.jitenon import JitenonKokugoTerminator from bot.yomichan.terms.jitenon import JitenonYojiTerminator from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator from bot.yomichan.terms.smk8 import Smk8Terminator @@ -20,8 +21,7 @@ class Exporter: self._terms_per_file = 2000 def export(self, entries, image_dir): - if image_dir is not None: - self.__init_build_image_dir(image_dir) + self.__init_build_image_dir(image_dir) meta = load_yomichan_metadata() index = meta[self._name]["index"] index["revision"] = self._get_revision(entries) @@ -42,10 +42,13 @@ class Exporter: return self._build_dir def __init_build_image_dir(self, image_dir): - print("Copying image files to build directory...") build_dir = self._get_build_dir() build_img_dir = os.path.join(build_dir, self._name) - shutil.copytree(image_dir, build_img_dir) + if image_dir is not None: + print("Copying image files to build directory...") + shutil.copytree(image_dir, build_img_dir) + else: + os.makedirs(build_img_dir) self._terminator.set_image_dir(build_img_dir) def __get_terms(self, entries): @@ -131,6 +134,12 @@ class JitenonExporter(Exporter): return attribution +class JitenonKokugoExporter(JitenonExporter): + def __init__(self, name): + super().__init__(name) + self._terminator = JitenonKokugoTerminator(name) + + class JitenonYojiExporter(JitenonExporter): def __init__(self, name): super().__init__(name) diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py index ebee87c..6e3a192 100644 --- a/bot/yomichan/glossary/jitenon.py +++ b/bot/yomichan/glossary/jitenon.py @@ -1,93 +1,176 @@ import re +import os +from bs4 import BeautifulSoup +import bot.icons as Icons from bot.yomichan.glossary.gloss import make_gloss -def make_glossary(entry): - soup = entry.get_page_soup() - __replace_punctuation(soup) - __add_internal_links(soup) - __convert_paragraphs(soup) - __style_table_headers(soup) - __unwrap_table_body(soup) - __decompose_table_rows(soup, entry) - __insert_headword_line(soup, entry) - gloss = make_gloss(soup.body) - glossary = [gloss] - return glossary +class JitenonGlossary(): + def __init__(self): + self._id_pattern = None + self._expression_header = None + def _replace_punctuation(self, soup): + punctuation = { + "/": "/", + ",": "、", + } + for el in soup.find_all(string=True): + text = el.text + for old, new in punctuation.items(): + text = text.replace(old, new) + el.replace_with(text) -def __replace_punctuation(soup): - punctuation = { - "/": "/", - ",": "、", - } - for el in soup.find_all(string=True): - text = el.text - for old, new in punctuation.items(): - text = text.replace(old, new) - el.replace_with(text) + def _add_internal_links(self, soup, entry): + for el in soup.find_all("a"): + href = el.attrs["href"] + m = re.search(self._id_pattern, href) + if m is not None: + ref_entry_id = int(m.group(1)) + ref_entry = entry.ID_TO_ENTRY[ref_entry_id] + expression = ref_entry.get_first_expression() + el.attrs["href"] = f"?query={expression}&wildcards=off" + elif re.match(r"^(?:https?:|\?)[\w\W]*", href): + pass + else: + raise Exception(f"Invalid href format: {href}") + def _convert_paragraphs(self, soup): + for p in soup.find_all("p"): + p.name = "div" -def __add_internal_links(soup): - patterns = [ - r"^(.+)([ぁ-ヿ、\s]+)$", - r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" - ] - for a in soup.find_all("a"): - for pattern in patterns: - m = re.search(pattern, a.text) - if m: - a['href'] = f"?query={m.group(1)}&wildcards=off" - break + def _style_table_headers(self, soup): + for th in soup.find_all("th"): + th['style'] = "vertical-align: middle; text-align: center;" + def _unwrap_table_body(self, soup): + if soup.find("tbody") is not None: + soup.tbody.unwrap() -def __convert_paragraphs(soup): - for p in soup.find_all("p"): - p.name = "span" - - -def __style_table_headers(soup): - for th in soup.find_all("th"): - th['style'] = "vertical-align: middle; text-align: center;" - - -def __unwrap_table_body(soup): - if soup.find("tbody") is not None: - soup.tbody.unwrap() - - -def __decompose_table_rows(soup, entry): - for tr in soup.find_all("tr"): - if tr.find("th") is None: - continue - elif tr.th.text in ["四字熟語", "言葉"]: - tr.decompose() - elif tr.th.text == "読み方": - if __do_display_yomikata_in_headword(entry): + def _decompose_table_rows(self, soup, entry): + for tr in soup.find_all("tr"): + if tr.find("th") is None: + continue + elif tr.th.text == self._expression_header: tr.decompose() - elif tr.th.text == "意味": - imi = tr.td - imi.name = "div" - soup.body.insert(0, imi) - tr.decompose() - if soup.find("tr") is None: - soup.table.decompose() + elif tr.th.text == "読み方": + if self._do_display_yomikata_in_headword(entry): + tr.decompose() + elif tr.th.text == "意味": + imi = tr.td + imi.name = "div" + soup.body.insert(0, imi) + tr.decompose() + if soup.find("tr") is None: + soup.table.decompose() + + def _insert_headword_line(self, soup, entry): + headword_line = soup.new_tag("span") + if self._do_display_yomikata_in_headword(entry): + headword_line.string = f"{entry.yomikata}【{entry.expression}】" + else: + headword_line.string = f"【{entry.expression}】" + soup.body.insert(0, headword_line) + + def _do_display_yomikata_in_headword(self, entry): + if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): + return False + elif len(entry.yomikata) > 10: + return False + else: + return True -def __insert_headword_line(soup, entry): - headword_line = soup.new_tag("span") - if __do_display_yomikata_in_headword(entry): - headword_line.string = f"{entry.yomikata}【{entry.expression}】" - else: - headword_line.string = f"【{entry.expression}】" - soup.body.insert(0, headword_line) +class JitenonKokugoGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$" + + def make_glossary(self, entry, image_dir): + soup = entry.get_page_soup() + self._remove_antonym_list_item(soup) + self._replace_number_icons(soup, image_dir) + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._convert_paragraphs(soup) + self._style_table_headers(soup) + self._unwrap_table_body(soup) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + gloss = make_gloss(soup.body) + glossary = [gloss] + return glossary + + def _remove_antonym_list_item(self, soup): + for el in soup.find_all("li"): + if el.text == "対義語辞典": + el.decompose() + + def _replace_number_icons(self, soup, image_dir): + for el in soup.find_all("img"): + alt = el.attrs["alt"] + text = re.search(r"[0-9]+", alt).group(0) + filename = f"{text}-fill.svg" + path = os.path.join(image_dir, filename) + Icons.make_monochrome_fill_rectangle(path, text) + ratio = Icons.calculate_ratio(path) + img = BeautifulSoup("", "xml").img + img.attrs = { + "height": 1.0 if ratio > 1.0 else ratio, + "width": ratio if ratio > 1.0 else 1.0, + "sizeUnits": "em", + "collapsible": False, + "collapsed": False, + "background": False, + "appearance": "monochrome", + "title": alt, + "path": f"{os.path.basename(image_dir)}/{filename}", + } + el.name = "span" + el.append(img) + el.attrs["style"] = "margin-right: 0.25em;" + + def _do_display_yomikata_in_headword(self, entry): + return len(entry.yomikata) <= 10 -def __do_display_yomikata_in_headword(entry): - if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): - return False - elif len(entry.yomikata) > 10: - return False - else: - return True +class JitenonYojiGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "四字熟語" + self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$" + + def make_glossary(self, entry, image_dir): + soup = entry.get_page_soup() + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._convert_paragraphs(soup) + self._style_table_headers(soup) + self._unwrap_table_body(soup) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + gloss = make_gloss(soup.body) + glossary = [gloss] + return glossary + + +class JitenonKotowazaGlossary(JitenonGlossary): + def __init__(self): + super().__init__() + self._expression_header = "言葉" + self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$" + + def make_glossary(self, entry, image_dir): + soup = entry.get_page_soup() + self._replace_punctuation(soup) + self._add_internal_links(soup, entry) + self._convert_paragraphs(soup) + self._style_table_headers(soup) + self._unwrap_table_body(soup) + self._decompose_table_rows(soup, entry) + self._insert_headword_line(soup, entry) + gloss = make_gloss(soup.body) + glossary = [gloss] + return glossary diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py index 75a3a5f..45f4d5b 100644 --- a/bot/yomichan/terms/jitenon.py +++ b/bot/yomichan/terms/jitenon.py @@ -1,6 +1,9 @@ from bot.yomichan.grammar import sudachi_rules from bot.yomichan.terms.terminator import Terminator -from bot.yomichan.glossary.jitenon import make_glossary + +from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary +from bot.yomichan.glossary.jitenon import JitenonYojiGlossary +from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary class JitenonTerminator(Terminator): @@ -13,7 +16,7 @@ class JitenonTerminator(Terminator): def _glossary(self, entry): if entry.entry_id in self._glossary_cache: return self._glossary_cache[entry.entry_id] - glossary = make_glossary(entry) + glossary = self._glossary_maker.make_glossary(entry, self._image_dir) self._glossary_cache[entry.entry_id] = glossary return glossary @@ -27,9 +30,22 @@ class JitenonTerminator(Terminator): return [] +class JitenonKokugoTerminator(JitenonTerminator): + def __init__(self, name): + super().__init__(name) + self._glossary_maker = JitenonKokugoGlossary() + + def _inflection_rules(self, entry, expression): + return sudachi_rules(expression) + + def _term_tags(self, entry): + return "" + + class JitenonYojiTerminator(JitenonTerminator): def __init__(self, name): super().__init__(name) + self._glossary_maker = JitenonYojiGlossary() def _inflection_rules(self, entry, expression): return "" @@ -42,6 +58,7 @@ class JitenonYojiTerminator(JitenonTerminator): class JitenonKotowazaTerminator(JitenonTerminator): def __init__(self, name): super().__init__(name) + self._glossary_maker = JitenonKotowazaGlossary() def _inflection_rules(self, entry, expression): return sudachi_rules(expression) diff --git a/data/yomichan_metadata.json b/data/yomichan_metadata.json index c892015..3fa8cb4 100644 --- a/data/yomichan_metadata.json +++ b/data/yomichan_metadata.json @@ -1,4 +1,13 @@ { + "jitenon-kokugo": { + "index": { + "title": "国語辞典オンライン", + "sequenced": true, + "format": 3, + "url": "https://kokugo.jitenon.jp/" + }, + "tags": [] + }, "jitenon-yoji": { "index": { "title": "四字熟語辞典オンライン", diff --git a/jitenbot.py b/jitenbot.py index acd73a4..be42f5b 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -18,6 +18,7 @@ along with this program. If not, see . import os import argparse +from bot.crawlers import JitenonKokugoCrawler from bot.crawlers import JitenonYojiCrawler from bot.crawlers import JitenonKotowazaCrawler from bot.crawlers import Smk8Crawler @@ -59,6 +60,7 @@ def parse_args(targets): def main(): crawlers = { + "jitenon-kokugo": JitenonKokugoCrawler, "jitenon-yoji": JitenonYojiCrawler, "jitenon-kotowaza": JitenonKotowazaCrawler, "smk8": Smk8Crawler,