Organize yomichan term creation logic into separate classes

2023-04-22 20:26:54 -05:00 · 2023-04-22 20:26:54 -05:00 · 13f07c9000
parent 7d7e32ba45
commit 13f07c9000
10 changed files with 232 additions and 144 deletions
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@ -3,10 +3,10 @@ from bs4 import BeautifulSoup

 import bot.scraper as Scraper

-from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.entries.jitenon import JitenonKotowazaEntry
 from bot.yomichan.export import JitenonKotowazaExporter

-from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.entries.jitenon import JitenonYojiEntry
 from bot.yomichan.export import JitenonYojiExporter


@ -15,14 +15,14 @@ class Crawler():
        self._crawl_map = {}
        self.__entries = []

-    def make_entries(self):
+    def read_entries(self):
        entries_len = len(self._crawl_map)
        items = self._crawl_map.items()
        for idx, (entry_id, entry_path) in enumerate(items):
            update = f"Reading entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
            entry = self._entry_class(entry_id)
-            entry.add_document(entry_path)
+            entry.set_markup(entry_path)
            self.__entries.append(entry)
        print()

--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@ -2,35 +2,52 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup

-import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util


 class JitenonEntry:
-    def __init__(self, sequence):
-        self.sequence = sequence
-        self.yomichan_glossary = [""]
+    def __init__(self, entry_id):
+        self.entry_id = entry_id
+        self.markup = ""
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""
-        for column in self.columns.values():
+        for column in self.COLUMNS.values():
            setattr(self, column[0], column[1])
+        self._headwords = None

-    def add_document(self, path):
+    def set_markup(self, path):
        with open(path, "r") as f:
            html = f.read()
-        yoji_soup = BeautifulSoup(html, features="html5lib")
+        soup = BeautifulSoup(html, features="html5lib")
        self.__set_modified_date(html)
-        self.attribution = yoji_soup.find(class_="copyright").text
-        table = yoji_soup.find(class_="kanjirighttb")
+        self.attribution = soup.find(class_="copyright").text
+        table = soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            colname = row.th.text if row.th is not None else colname
-            colval = self.__clean(row.td.text)
+            colval = self.__clean_text(row.td.text)
            self.__set_column(colname, colval)
-        self.__prepare_yomichan_soup(table)
-        gloss = YomichanGloss.make_gloss(table)
-        self.yomichan_glossary = [gloss]
+        self.markup = table.decode()
+
+    def get_headwords(self):
+        """Return the headword mapping, computing it on first access.
+
+        The result is kept in ``self._headwords``; ``_set_headwords`` is
+        only called while that attribute is still ``None``.
+        """
+        if self._headwords is None:
+            self._set_headwords()
+        return self._headwords
+
+    def _set_headwords(self):
+        """Populate ``self._headwords`` as ``{reading: [expression, ...]}``.
+
+        Each reading from ``__yomikatas`` is first mapped to the main
+        expression. The alternate forms from ``__ikei_headwords`` are then
+        merged in, skipping expressions already listed under a reading.
+        """
+        merged = {
+            reading: [self.expression] for reading in self.__yomikatas()
+        }
+        for reading, variants in self.__ikei_headwords().items():
+            known = merged.setdefault(reading, [])
+            for variant in variants:
+                if variant not in known:
+                    known.append(variant)
+        self._headwords = merged

    def __set_modified_date(self, html):
        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@ -39,15 +56,8 @@ class JitenonEntry:
        date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
        self.modified_date = date

-    def __clean(self, text):
-        text = text.replace("\n", "")
-        text = text.replace(",", "、")
-        text = text.replace(" ", "")
-        text = text.strip()
-        return text
-
    def __set_column(self, colname, colval):
-        attr_name = self.columns[colname][0]
+        attr_name = self.COLUMNS[colname][0]
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
@ -57,35 +67,6 @@ class JitenonEntry:
            else:
                attr_value.append(colval)

-    def __prepare_yomichan_soup(self, soup):
-        patterns = [
-            r"^(.+)（[ぁ-ヿ、\s]+）$",
-            r"^(.+)（[ぁ-ヿ、\s]+（[ぁ-ヿ、\s]）[ぁ-ヿ、\s]+）$"
-        ]
-        for a in soup.find_all("a"):
-            for pattern in patterns:
-                m = re.search(pattern, a.text)
-                if m:
-                    a['href'] = f"?query={m.group(1)}&wildcards=off"
-                    break
-        for p in soup.find_all("p"):
-            p.name = "span"
-        for th in soup.find_all("th"):
-            th['style'] = "vertical-align: middle; text-align: center;"
-
-    def _headwords(self):
-        words = []
-        for yomikata in self.__yomikatas():
-            headword = [self.expression, yomikata]
-            if headword in words:
-                words.remove(headword)
-            words.append(headword)
-        for headword in self.__ikei_headwords():
-            if headword in words:
-                words.remove(headword)
-            words.append(headword)
-        return words
-
    def __yomikatas(self):
        yomikata = self.yomikata
        m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
@ -108,22 +89,73 @@ class JitenonEntry:
        return [""]

    def __ikei_headwords(self):
-        ikei_headwords = []
+        ikei_headwords = {}
        for val in self.ikei:
            m = re.search(r"^([^（]+)（([ぁ-ヿ、]+)）$", val)
-            if m:
-                headword = [m.group(1), m.group(2)]
-                ikei_headwords.append(headword)
-            else:
+            if not m:
                print(f"Invalid 異形 format: {val}\n{self}\n")
+                continue
+            expression = m.group(1)
+            reading = m.group(2)
+            if reading not in ikei_headwords:
+                ikei_headwords[reading] = []
+            if expression not in ikei_headwords[reading]:
+                ikei_headwords[reading].append(expression)
        return ikei_headwords

+    @staticmethod
+    def __clean_text(text):
+        """Normalize the text of a table cell.
+
+        Newlines and spaces are removed, "," is replaced with "、", and
+        the result is stripped of leading/trailing whitespace.
+        """
+        substitutions = (("\n", ""), (",", "、"), (" ", ""))
+        for old, new in substitutions:
+            text = text.replace(old, new)
+        return text.strip()
+
    def __str__(self):
-        colvals = [str(self.sequence)]
-        for attr in self.columns.values():
+        colvals = [str(self.entry_id)]
+        for attr in self.COLUMNS.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append("；".join(attr_val))
        return ",".join(colvals)
+
+
+class JitenonYojiEntry(JitenonEntry):
+    """Jitenon entry whose table is headed by the 四字熟語 column."""
+
+    # Table header text -> [attribute name, initial attribute value].
+    COLUMNS = {
+        "四字熟語": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "漢検級": ["kankenkyuu", ""],
+        "場面用途": ["bamenyouto", ""],
+        "異形": ["ikei", []],
+        "類義語": ["ruigigo", []],
+    }
+
+    def __init__(self, sequence):
+        # ``sequence`` is forwarded unchanged to the base initializer.
+        super().__init__(sequence)
+
+
+class JitenonKotowazaEntry(JitenonEntry):
+    """Jitenon entry whose table is headed by the 言葉 column."""
+
+    # Table header text -> [attribute name, initial attribute value].
+    COLUMNS = {
+        "言葉": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "例文": ["reibun", ""],
+        "異形": ["ikei", []],
+        "類句": ["ruiku", []],
+    }
+
+    def __init__(self, sequence):
+        # ``sequence`` is forwarded unchanged to the base initializer.
+        super().__init__(sequence)
+
+    def _set_headwords(self):
+        """Set ``self._headwords``, special-casing one expression.
+
+        The headwords for "金棒引き・鉄棒引き" are hard-coded; every other
+        expression uses the inherited derivation.
+        """
+        if self.expression != "金棒引き・鉄棒引き":
+            super()._set_headwords()
+            return
+        self._headwords = {
+            "かなぼうひき": ["金棒引き", "鉄棒引き"]
+        }
--- a/bot/entries/jitenon_kotowaza.py
+++ b/bot/entries/jitenon_kotowaza.py
@ -1,41 +0,0 @@
-from bot.entries.jitenon import JitenonEntry
-import bot.yomichan.grammar as Grammar
-
-
-class JitenonKotowazaEntry(JitenonEntry):
-    columns = {
-        "言葉":   ["expression", ""],
-        "読み方": ["yomikata", ""],
-        "意味":   ["imi", ""],
-        "出典":   ["shutten", ""],
-        "例文":   ["reibun", ""],
-        "異形":   ["ikei", []],
-        "類句":   ["ruiku", []],
-    }
-
-    def __init__(self, sequence):
-        super().__init__(sequence)
-
-    def yomichan_terms(self):
-        terms = []
-        for idx, headword in enumerate(self._headwords()):
-            (expression, reading) = headword
-            definition_tags = None
-            inflection_rules = Grammar.sudachi_rules(expression)
-            score = -idx
-            glossary = self.yomichan_glossary
-            sequence = self.sequence
-            term_tags = ""
-            term = [
-                expression, reading, definition_tags, inflection_rules,
-                score, glossary, sequence, term_tags
-            ]
-            terms.append(term)
-        return terms
-
-    def _headwords(self):
-        if self.expression == "金棒引き・鉄棒引き":
-            return [["金棒引き", "かなぼうひき"],
-                    ["鉄棒引き", "かなぼうひき"]]
-        else:
-            return super()._headwords()
--- a/bot/entries/jitenon_yoji.py
+++ b/bot/entries/jitenon_yoji.py
@ -1,38 +0,0 @@
-from bot.entries.jitenon import JitenonEntry
-
-
-class JitenonYojiEntry(JitenonEntry):
-    columns = {
-        "四字熟語": ["expression", ""],
-        "読み方":   ["yomikata", ""],
-        "意味":     ["imi", ""],
-        "出典":     ["shutten", ""],
-        "漢検級":   ["kankenkyuu", ""],
-        "場面用途": ["bamenyouto", ""],
-        "異形":     ["ikei", []],
-        "類義語":   ["ruigigo", []],
-    }
-
-    def __init__(self, sequence):
-        super().__init__(sequence)
-
-    def yomichan_terms(self):
-        terms = []
-        for idx, headword in enumerate(self._headwords()):
-            (expression, reading) = headword
-            definition_tags = None
-            inflection_rules = ""
-            score = -idx
-            glossary = self.yomichan_glossary
-            sequence = self.sequence
-            term_tags = self.__term_tags()
-            term = [
-                expression, reading, definition_tags, inflection_rules,
-                score, glossary, sequence, term_tags
-            ]
-            terms.append(term)
-        return terms
-
-    def __term_tags(self):
-        tags = self.kankenkyuu.replace(" ", "").split("/")
-        return " ".join(tags)
--- a/bot/yomichan/export.py
+++ b/bot/yomichan/export.py
@ -5,7 +5,10 @@ from pathlib import Path
 from datetime import datetime
 from platformdirs import user_documents_dir, user_cache_dir

-import bot.data as Data
+from bot.data import yomichan_metadata
+
+from bot.yomichan.terms.jitenon import JitenonYojiTerminator
+from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator


 class Exporter:
@ -14,7 +17,7 @@ class Exporter:
        self._terms_per_file = 2000

    def export(self, entries):
-        meta = Data.yomichan_metadata()
+        meta = yomichan_metadata()
        index = meta[self._name]["index"]
        index["revision"] = self._get_revision(entries)
        index["attribution"] = self._get_attribution(entries)
@ -40,7 +43,8 @@ class Exporter:
        for idx, entry in enumerate(entries):
            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
-            for term in entry.yomichan_terms():
+            new_terms = self._terminator.make_terms(entry)
+            for term in new_terms:
                terms.append(term)
        print()
        return terms
@ -120,9 +124,11 @@ class JitenonYojiExporter(JitenonExporter):
    def __init__(self):
        super().__init__()
        self._name = "jitenon-yoji"
+        self._terminator = JitenonYojiTerminator()


 class JitenonKotowazaExporter(JitenonExporter):
    def __init__(self):
        super().__init__()
        self._name = "jitenon-kotowaza"
+        self._terminator = JitenonKotowazaTerminator()
--- a/bot/yomichan/glossary/gloss.py
+++ b/bot/yomichan/glossary/gloss.py
--- a/bot/yomichan/glossary/jitenon.py
+++ b/bot/yomichan/glossary/jitenon.py
@ -0,0 +1,25 @@
+import re
+from bs4 import BeautifulSoup
+
+from bot.yomichan.glossary.gloss import make_gloss
+
+
+def make_glossary(entry):
+    """Build the Yomichan glossary list for a Jitenon entry.
+
+    Parses ``entry.markup``, points anchors whose text matches one of the
+    patterns at a ``?query=`` link, renames ``<p>`` tags to ``<span>``,
+    styles ``<th>`` cells, and returns a one-element list holding the
+    result of ``make_gloss`` on the table.
+    """
+    soup = BeautifulSoup(entry.markup, "html5lib")
+    link_patterns = (
+        r"^(.+)（[ぁ-ヿ、\s]+）$",
+        r"^(.+)（[ぁ-ヿ、\s]+（[ぁ-ヿ、\s]）[ぁ-ヿ、\s]+）$",
+    )
+    for anchor in soup.find_all("a"):
+        label = anchor.text
+        for pattern in link_patterns:
+            match = re.search(pattern, label)
+            if match is None:
+                continue
+            # First matching pattern wins; group 1 is the text before
+            # the parenthesized part.
+            anchor['href'] = f"?query={match.group(1)}&wildcards=off"
+            break
+    for paragraph in soup.find_all("p"):
+        paragraph.name = "span"
+    header_style = "vertical-align: middle; text-align: center;"
+    for header in soup.find_all("th"):
+        header['style'] = header_style
+    return [make_gloss(soup.table)]
--- a/bot/yomichan/terms/jitenon.py
+++ b/bot/yomichan/terms/jitenon.py
@ -0,0 +1,50 @@
+import bot.yomichan.grammar as Grammar
+from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.glossary.jitenon import make_glossary
+
+
+class JitenonTerminator(Terminator):
+    """Term-building hooks shared by the Jitenon terminators."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _definition_tags(self, entry):
+        # No definition tags are produced for these entries.
+        return None
+
+    def _glossary(self, entry):
+        """Return the entry's glossary, memoized by ``entry.entry_id``."""
+        cache = self.glossary_cache
+        key = entry.entry_id
+        if key not in cache:
+            cache[key] = make_glossary(entry)
+        return cache[key]
+
+    def _sequence(self, entry):
+        # The entry id doubles as the term's sequence value.
+        return entry.entry_id
+
+    def _link_glossary_parameters(self, entry):
+        # No link glossaries are produced for these entries.
+        return []
+
+    def _subentry_lists(self, entry):
+        # These entries have no subentries.
+        return []
+
+
+class JitenonYojiTerminator(JitenonTerminator):
+    """Jitenon terminator whose term tags come from ``kankenkyuu``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _inflection_rules(self, entry, expression):
+        # No inflection rules are assigned to these expressions.
+        return ""
+
+    def _term_tags(self, entry):
+        """Return ``entry.kankenkyuu`` as space-separated tags.
+
+        Spaces are removed first, then the "/"-separated parts are
+        joined with single spaces.
+        """
+        spaceless = entry.kankenkyuu.replace(" ", "")
+        return " ".join(spaceless.split("/"))
+
+
+class JitenonKotowazaTerminator(JitenonTerminator):
+    """Jitenon terminator that asks ``Grammar`` for inflection rules."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _inflection_rules(self, entry, expression):
+        # Rules depend only on the expression; ``entry`` is unused here.
+        rules = Grammar.sudachi_rules(expression)
+        return rules
+
+    def _term_tags(self, entry):
+        # No term tags are produced for these entries.
+        return ""
--- a/bot/yomichan/terms/terminator.py
+++ b/bot/yomichan/terms/terminator.py
@ -0,0 +1,54 @@
+class Terminator:
+    """Base class that turns dictionary entries into Yomichan term rows.
+
+    Subclasses supply the hooks used by ``make_terms``:
+    ``_definition_tags``, ``_inflection_rules``, ``_glossary``,
+    ``_sequence``, ``_link_glossary_parameters`` and ``_subentry_lists``.
+    ``_term_tags`` may be overridden as well; it defaults to no tags.
+    """
+
+    def __init__(self):
+        # Not read by this base class; available to subclasses.
+        self.glossary_cache = {}
+
+    def make_terms(self, entry):
+        """Return the list of term rows for ``entry`` and its subentries.
+
+        Each row is ``[expression, reading, definition_tags,
+        inflection_rules, score, glossary, sequence, term_tags]``. One
+        row is built per (reading, expression) pair from
+        ``entry.get_headwords()``, each followed by one link row for
+        every non-empty subentry list from ``_link_glossary_parameters``.
+        ``score`` is the negated count of rows already in the list.
+        Rows for the subentries from ``_subentry_lists`` are produced
+        recursively and appended last.
+        """
+        terms = []
+        headwords = entry.get_headwords()
+        for reading, expressions in headwords.items():
+            for expression in expressions:
+                definition_tags = self._definition_tags(entry)
+                inflection_rules = self._inflection_rules(entry, expression)
+                score = -len(terms)
+                glossary = self._glossary(entry)
+                sequence = self._sequence(entry)
+                # Ask the hook for the tags; a hard-coded "" here would
+                # discard whatever a subclass's ``_term_tags`` returns.
+                term_tags = self._term_tags(entry)
+                terms.append([
+                    expression, reading, definition_tags, inflection_rules,
+                    score, glossary, sequence, term_tags
+                ])
+
+                for x in self._link_glossary_parameters(entry):
+                    # Separate name so the entry-level definition tags
+                    # above are not rebound by the link rows.
+                    (subentries, link_definition_tags) = x
+                    if len(subentries) == 0:
+                        continue
+                    score = -len(terms)
+                    link_glossary = self.__links_glossary(subentries)
+                    terms.append([
+                        expression, reading, link_definition_tags,
+                        inflection_rules, score, link_glossary, sequence,
+                        term_tags
+                    ])
+
+        for subentries in self._subentry_lists(entry):
+            for subentry in subentries:
+                terms.extend(self.make_terms(subentry))
+        return terms
+
+    def _term_tags(self, entry):
+        """Return the term-tags string for ``entry``; "" by default."""
+        return ""
+
+    @staticmethod
+    def __links_glossary(subentries):
+        """Return one structured-content link gloss per subentry.
+
+        Each link's text and ``?query=`` target are the subentry's
+        ``get_first_expression()`` value.
+        """
+        glossary = []
+        for subentry in subentries:
+            exp = subentry.get_first_expression()
+            gloss = {
+                "type": "structured-content",
+                "content": {
+                    "tag": "a",
+                    "href": f"?query={exp}&wildcards=off",
+                    "content": exp,
+                }
+            }
+            glossary.append(gloss)
+        return glossary
--- a/jitenbot.py
+++ b/jitenbot.py
@ -44,7 +44,7 @@ def main():
    crawler_class = crawlers[args.target]
    crawler = crawler_class()
    crawler.crawl()
-    crawler.make_entries()
+    crawler.read_entries()
    crawler.make_yomichan_dictionary()