diff --git a/bot/crawlers.py b/bot/crawlers.py index 37af503..31d3b76 100644 --- a/bot/crawlers.py +++ b/bot/crawlers.py @@ -3,10 +3,10 @@ from bs4 import BeautifulSoup import bot.scraper as Scraper -from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry +from bot.entries.jitenon import JitenonKotowazaEntry from bot.yomichan.export import JitenonKotowazaExporter -from bot.entries.jitenon_yoji import JitenonYojiEntry +from bot.entries.jitenon import JitenonYojiEntry from bot.yomichan.export import JitenonYojiExporter @@ -15,14 +15,14 @@ class Crawler(): self._crawl_map = {} self.__entries = [] - def make_entries(self): + def read_entries(self): entries_len = len(self._crawl_map) items = self._crawl_map.items() for idx, (entry_id, entry_path) in enumerate(items): update = f"Reading entry {idx+1}/{entries_len}" print(update, end='\r', flush=True) entry = self._entry_class(entry_id) - entry.add_document(entry_path) + entry.set_markup(entry_path) self.__entries.append(entry) print() diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index 65dc647..36f397b 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -2,35 +2,52 @@ import re from datetime import datetime, date from bs4 import BeautifulSoup -import bot.yomichan.html_gloss as YomichanGloss import bot.util as Util class JitenonEntry: - def __init__(self, sequence): - self.sequence = sequence - self.yomichan_glossary = [""] + def __init__(self, entry_id): + self.entry_id = entry_id + self.markup = "" self.modified_date = date(1970, 1, 1) self.attribution = "" - for column in self.columns.values(): + for column in self.COLUMNS.values(): setattr(self, column[0], column[1]) + self._headwords = None - def add_document(self, path): + def set_markup(self, path): with open(path, "r") as f: html = f.read() - yoji_soup = BeautifulSoup(html, features="html5lib") + soup = BeautifulSoup(html, features="html5lib") self.__set_modified_date(html) - self.attribution = yoji_soup.find(class_="copyright").text - table = yoji_soup.find(class_="kanjirighttb") + self.attribution = soup.find(class_="copyright").text + table = soup.find(class_="kanjirighttb") rows = table.find("tbody").find_all("tr") colname = "" for row in rows: colname = row.th.text if row.th is not None else colname - colval = self.__clean(row.td.text) + colval = self.__clean_text(row.td.text) self.__set_column(colname, colval) - self.__prepare_yomichan_soup(table) - gloss = YomichanGloss.make_gloss(table) - self.yomichan_glossary = [gloss] + self.markup = table.decode() + + def get_headwords(self): + if self._headwords is not None: + return self._headwords + self._set_headwords() + return self._headwords + + def _set_headwords(self): + headwords = {} + for yomikata in self.__yomikatas(): + headwords[yomikata] = [self.expression] + ikei_headwords = self.__ikei_headwords() + for reading, expressions in ikei_headwords.items(): + if reading not in headwords: + headwords[reading] = [] + for expression in expressions: + if expression not in headwords[reading]: + headwords[reading].append(expression) + self._headwords = headwords def __set_modified_date(self, html): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html) @@ -39,15 +56,8 @@ class JitenonEntry: date = datetime.strptime(m.group(1), '%Y-%m-%d').date() self.modified_date = date - def __clean(self, text): - text = text.replace("\n", "") - text = text.replace(",", "、") - text = text.replace(" ", "") - text = text.strip() - return text - def __set_column(self, colname, colval): - attr_name = self.columns[colname][0] + attr_name = self.COLUMNS[colname][0] attr_value = getattr(self, attr_name) if isinstance(attr_value, str): setattr(self, attr_name, colval) @@ -57,35 +67,6 @@ class JitenonEntry: else: attr_value.append(colval) - def __prepare_yomichan_soup(self, soup): - patterns = [ - r"^(.+)([ぁ-ヿ、\s]+)$", - r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" - ] - for a in soup.find_all("a"): - for pattern in patterns: - m = re.search(pattern, a.text) - if m: - a['href'] = f"?query={m.group(1)}&wildcards=off" - break - for p in soup.find_all("p"): - p.name = "span" - for th in soup.find_all("th"): - th['style'] = "vertical-align: middle; text-align: center;" - - def _headwords(self): - words = [] - for yomikata in self.__yomikatas(): - headword = [self.expression, yomikata] - if headword in words: - words.remove(headword) - words.append(headword) - for headword in self.__ikei_headwords(): - if headword in words: - words.remove(headword) - words.append(headword) - return words - def __yomikatas(self): yomikata = self.yomikata m = re.search(r"^[ぁ-ヿ、]+$", yomikata) @@ -108,22 +89,73 @@ class JitenonEntry: return [""] def __ikei_headwords(self): - ikei_headwords = [] + ikei_headwords = {} for val in self.ikei: m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val) - if m: - headword = [m.group(1), m.group(2)] - ikei_headwords.append(headword) - else: + if not m: print(f"Invalid 異形 format: {val}\n{self}\n") + continue + expression = m.group(1) + reading = m.group(2) + if reading not in ikei_headwords: + ikei_headwords[reading] = [] + if expression not in ikei_headwords[reading]: + ikei_headwords[reading].append(expression) return ikei_headwords + @staticmethod + def __clean_text(text): + text = text.replace("\n", "") + text = text.replace(",", "、") + text = text.replace(" ", "") + text = text.strip() + return text + def __str__(self): - colvals = [str(self.sequence)] - for attr in self.columns.values(): + colvals = [str(self.entry_id)] + for attr in self.COLUMNS.values(): attr_val = getattr(self, attr[0]) if isinstance(attr_val, str): colvals.append(attr_val) elif isinstance(attr_val, list): colvals.append(";".join(attr_val)) return ",".join(colvals) + + +class JitenonYojiEntry(JitenonEntry): + COLUMNS = { + "四字熟語": ["expression", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "出典": ["shutten", ""], + "漢検級": ["kankenkyuu", ""], + "場面用途": ["bamenyouto", ""], + "異形": ["ikei", []], + "類義語": ["ruigigo", []], + } + + def __init__(self, sequence): + super().__init__(sequence) + + +class JitenonKotowazaEntry(JitenonEntry): + COLUMNS = { + "言葉": ["expression", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "出典": ["shutten", ""], + "例文": ["reibun", ""], + "異形": ["ikei", []], + "類句": ["ruiku", []], + } + + def __init__(self, sequence): + super().__init__(sequence) + + def _set_headwords(self): + if self.expression == "金棒引き・鉄棒引き": + self._headwords = { + "かなぼうひき": ["金棒引き", "鉄棒引き"] + } + else: + super()._set_headwords() diff --git a/bot/entries/jitenon_kotowaza.py b/bot/entries/jitenon_kotowaza.py deleted file mode 100644 index 23a4c21..0000000 --- a/bot/entries/jitenon_kotowaza.py +++ /dev/null @@ -1,41 +0,0 @@ -from bot.entries.jitenon import JitenonEntry -import bot.yomichan.grammar as Grammar - - -class JitenonKotowazaEntry(JitenonEntry): - columns = { - "言葉": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "例文": ["reibun", ""], - "異形": ["ikei", []], - "類句": ["ruiku", []], - } - - def __init__(self, sequence): - super().__init__(sequence) - - def yomichan_terms(self): - terms = [] - for idx, headword in enumerate(self._headwords()): - (expression, reading) = headword - definition_tags = None - inflection_rules = Grammar.sudachi_rules(expression) - score = -idx - glossary = self.yomichan_glossary - sequence = self.sequence - term_tags = "" - term = [ - expression, reading, definition_tags, inflection_rules, - score, glossary, sequence, term_tags - ] - terms.append(term) - return terms - - def _headwords(self): - if self.expression == "金棒引き・鉄棒引き": - return [["金棒引き", "かなぼうひき"], - ["鉄棒引き", "かなぼうひき"]] - else: - return super()._headwords() diff --git a/bot/entries/jitenon_yoji.py b/bot/entries/jitenon_yoji.py deleted file mode 100644 index d08d607..0000000 --- a/bot/entries/jitenon_yoji.py +++ /dev/null @@ -1,38 +0,0 @@ -from bot.entries.jitenon import JitenonEntry - - -class JitenonYojiEntry(JitenonEntry): - columns = { - "四字熟語": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "漢検級": ["kankenkyuu", ""], - "場面用途": ["bamenyouto", ""], - "異形": ["ikei", []], - "類義語": ["ruigigo", []], - } - - def __init__(self, sequence): - super().__init__(sequence) - - def yomichan_terms(self): - terms = [] - for idx, headword in enumerate(self._headwords()): - (expression, reading) = headword - definition_tags = None - inflection_rules = "" - score = -idx - glossary = self.yomichan_glossary - sequence = self.sequence - term_tags = self.__term_tags() - term = [ - expression, reading, definition_tags, inflection_rules, - score, glossary, sequence, term_tags - ] - terms.append(term) - return terms - - def __term_tags(self): - tags = self.kankenkyuu.replace(" ", "").split("/") - return " ".join(tags) diff --git a/bot/yomichan/export.py b/bot/yomichan/export.py index ab24eaf..b6ed7b1 100644 --- a/bot/yomichan/export.py +++ b/bot/yomichan/export.py @@ -5,7 +5,10 @@ from pathlib import Path from datetime import datetime from platformdirs import user_documents_dir, user_cache_dir -import bot.data as Data +from bot.data import yomichan_metadata + +from bot.yomichan.terms.jitenon import JitenonYojiTerminator +from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator class Exporter: @@ -14,7 +17,7 @@ class Exporter: self._terms_per_file = 2000 def export(self, entries): - meta = Data.yomichan_metadata() + meta = yomichan_metadata() index = meta[self._name]["index"] index["revision"] = self._get_revision(entries) index["attribution"] = self._get_attribution(entries) @@ -40,7 +43,8 @@ class Exporter: for idx, entry in enumerate(entries): update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}" print(update, end='\r', flush=True) - for term in entry.yomichan_terms(): + new_terms = self._terminator.make_terms(entry) + for term in new_terms: terms.append(term) print() return terms @@ -120,9 +124,11 @@ class JitenonYojiExporter(JitenonExporter): def __init__(self): super().__init__() self._name = "jitenon-yoji" + self._terminator = JitenonYojiTerminator() class JitenonKotowazaExporter(JitenonExporter): def __init__(self): super().__init__() self._name = "jitenon-kotowaza" + self._terminator = JitenonKotowazaTerminator() diff --git a/bot/yomichan/html_gloss.py b/bot/yomichan/glossary/gloss.py similarity index 100% rename from bot/yomichan/html_gloss.py rename to bot/yomichan/glossary/gloss.py diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py new file mode 100644 index 0000000..ecaeac3 --- /dev/null +++ b/bot/yomichan/glossary/jitenon.py @@ -0,0 +1,25 @@ +import re +from bs4 import BeautifulSoup + +from bot.yomichan.glossary.gloss import make_gloss + + +def make_glossary(entry): + soup = BeautifulSoup(entry.markup, "html5lib") + patterns = [ + r"^(.+)([ぁ-ヿ、\s]+)$", + r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" + ] + for a in soup.find_all("a"): + for pattern in patterns: + m = re.search(pattern, a.text) + if m: + a['href'] = f"?query={m.group(1)}&wildcards=off" + break + for p in soup.find_all("p"): + p.name = "span" + for th in soup.find_all("th"): + th['style'] = "vertical-align: middle; text-align: center;" + gloss = make_gloss(soup.table) + glossary = [gloss] + return glossary diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py new file mode 100644 index 0000000..ace79c8 --- /dev/null +++ b/bot/yomichan/terms/jitenon.py @@ -0,0 +1,50 @@ +import bot.yomichan.grammar as Grammar +from bot.yomichan.terms.terminator import Terminator +from bot.yomichan.glossary.jitenon import make_glossary + + +class JitenonTerminator(Terminator): + def __init__(self): + super().__init__() + + def _definition_tags(self, entry): + return None + + def _glossary(self, entry): + if entry.entry_id in self.glossary_cache: + return self.glossary_cache[entry.entry_id] + glossary = make_glossary(entry) + self.glossary_cache[entry.entry_id] = glossary + return glossary + + def _sequence(self, entry): + return entry.entry_id + + def _link_glossary_parameters(self, entry): + return [] + + def _subentry_lists(self, entry): + return [] + + +class JitenonYojiTerminator(JitenonTerminator): + def __init__(self): + super().__init__() + + def _inflection_rules(self, entry, expression): + return "" + + def _term_tags(self, entry): + tags = entry.kankenkyuu.replace(" ", "").split("/") + return " ".join(tags) + + +class JitenonKotowazaTerminator(JitenonTerminator): + def __init__(self): + super().__init__() + + def _inflection_rules(self, entry, expression): + return Grammar.sudachi_rules(expression) + + def _term_tags(self, entry): + return "" diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py new file mode 100644 index 0000000..b3b2fc0 --- /dev/null +++ b/bot/yomichan/terms/terminator.py @@ -0,0 +1,54 @@ +class Terminator: + def __init__(self): + self.glossary_cache = {} + + def make_terms(self, entry): + terms = [] + headwords = entry.get_headwords() + for reading, expressions in headwords.items(): + for expression in expressions: + definition_tags = self._definition_tags(entry) + inflection_rules = self._inflection_rules(entry, expression) + score = -len(terms) + glossary = self._glossary(entry) + sequence = self._sequence(entry) + term_tags = "" + term = [ + expression, reading, definition_tags, inflection_rules, + score, glossary, sequence, term_tags + ] + terms.append(term) + + for x in self._link_glossary_parameters(entry): + (subentries, definition_tags) = x + if len(subentries) == 0: + continue + score = -len(terms) + glossary = self.__links_glossary(subentries) + term = [ + expression, reading, definition_tags, inflection_rules, + score, glossary, sequence, term_tags + ] + terms.append(term) + + for subentries in self._subentry_lists(entry): + for subentry in subentries: + for term in self.make_terms(subentry): + terms.append(term) + return terms + + @staticmethod + def __links_glossary(subentries): + glossary = [] + for subentry in subentries: + exp = subentry.get_first_expression() + gloss = { + "type": "structured-content", + "content": { + "tag": "a", + "href": f"?query={exp}&wildcards=off", + "content": exp, + } + } + glossary.append(gloss) + return glossary diff --git a/jitenbot.py b/jitenbot.py index 304fb64..8496467 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -44,7 +44,7 @@ def main(): crawler_class = crawlers[args.target] crawler = crawler_class() crawler.crawl() - crawler.make_entries() + crawler.read_entries() crawler.make_yomichan_dictionary()