Redesign Jitenon glossaries for yomichan

stephenmk 2023-04-23 00:17:42 -05:00
parent 934f6534f1
commit 08b180f442
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
7 changed files with 95 additions and 56 deletions


@@ -1,35 +0,0 @@
-# jitenbot
-Jitenbot is a program for scraping Japanese dictionary websites and converting the scraped data into structured dictionary files.
-### Target Websites
-* [四字熟語辞典オンライン](https://yoji.jitenon.jp/)
-* [故事・ことわざ・慣用句オンライン](https://kotowaza.jitenon.jp/)
-### Export Formats
-* [Yomichan](https://github.com/foosoft/yomichan)
-# Usage
-Add your desired HTTP request headers to [config.json](https://github.com/stephenmk/jitenbot/blob/main/config.json)
-and ensure that all [requirements](https://github.com/stephenmk/jitenbot/blob/main/requirements.txt)
-are installed.
-```
-jitenbot [-h] {all,jitenon-yoji,jitenon-kotowaza}
-
-positional arguments:
-  {all,jitenon-yoji,jitenon-kotowaza}
-                        website to crawl
-
-options:
-  -h, --help            show this help message and exit
-```
-Scraped webpages are written to a `webcache` directory. Each page may be as large as 100 KiB,
-and a single dictionary may include thousands of pages. Ensure that adequate disk space is available.
-Jitenbot will pause for at least 10 seconds between each web request. Depending upon the size of
-the target dictionary, it may take hours or days to finish scraping.
-Exported dictionary files will be saved in an `output` directory.
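
The `config.json` mentioned in the README above is plain JSON holding HTTP request headers. A minimal sketch of reading it (the `http-request-headers` key name is an assumption for illustration, not confirmed by this commit):

```
# Hypothetical sketch; the real config.json schema is not shown in this
# commit, so the top-level key name below is an assumption.
import json

with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

headers = config.get("http-request-headers", {})
print(headers.get("User-Agent", "<no User-Agent configured>"))
```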


@@ -36,6 +36,17 @@ class JitenonEntry:
         self._set_headwords()
         return self._headwords
 
+    def get_first_expression(self):
+        headwords = self.get_headwords()
+        expressions = next(iter(headwords.values()))
+        expression = expressions[0]
+        return expression
+
+    def get_first_reading(self):
+        headwords = self.get_headwords()
+        reading = next(iter(headwords.keys()))
+        return reading
+
     def _set_headwords(self):
         headwords = {}
         for yomikata in self.__yomikatas():
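
For context, the two new accessors assume `_headwords` maps each reading to a list of surface expressions. A minimal illustration with made-up data:

```
# Illustrative only: a mapping shaped like the one _set_headwords builds.
headwords = {
    "あおいきといき": ["青息吐息"],
    "あおいろといき": ["青色吐息"],
}

first_reading = next(iter(headwords.keys()))          # "あおいきといき"
first_expression = next(iter(headwords.values()))[0]  # "青息吐息"
```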


@@ -15,7 +15,7 @@ def __get_markup_structure(soup):
     content = []
     for child in soup.children:
         if child.name is None:
-            text = __clean(child.text)
+            text = child.text.strip()
             if text != "":
                 content.append(text)
         else:
@@ -35,12 +35,6 @@ def __get_markup_structure(soup):
     return node
 
-def __clean(text):
-    text = text.replace("/", "")
-    text = text.strip()
-    return text
-
 def __get_attributes(attrs):
     attributes = {}
     if "href" in attrs:


@@ -6,6 +6,31 @@ from bot.yomichan.glossary.gloss import make_gloss
 
 def make_glossary(entry):
     soup = BeautifulSoup(entry.markup, "html5lib")
+    __replace_punctuation(soup)
+    __add_internal_links(soup)
+    __convert_paragraphs(soup)
+    __style_table_headers(soup)
+    __unwrap_table_body(soup)
+    __decompose_table_rows(soup, entry)
+    __insert_headword_line(soup, entry)
+    gloss = make_gloss(soup.body)
+    glossary = [gloss]
+    return glossary
+
+
+def __replace_punctuation(soup):
+    punctuation = {
+        "/": "",
+        ",": "",
+    }
+    for el in soup.find_all(string=True):
+        text = el.text
+        for old, new in punctuation.items():
+            text = text.replace(old, new)
+        el.replace_with(text)
+
+
+def __add_internal_links(soup):
     patterns = [
         r"^(.+)[ぁ-ヿ、\s]+$",
         r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$"
@@ -16,10 +41,42 @@ def make_glossary(entry):
             if m:
                 a['href'] = f"?query={m.group(1)}&wildcards=off"
                 break
+
+
+def __convert_paragraphs(soup):
+    for p in soup.find_all("p"):
+        p.name = "span"
+
+
+def __style_table_headers(soup):
+    for th in soup.find_all("th"):
+        th['style'] = "vertical-align: middle; text-align: center;"
 
-    gloss = make_gloss(soup.body)
-    glossary = [gloss]
-    return glossary
+
+def __unwrap_table_body(soup):
+    if soup.find("tbody") is not None:
+        soup.tbody.unwrap()
+
+
+def __decompose_table_rows(soup, entry):
+    for tr in soup.find_all("tr"):
+        if tr.find("th") is None:
+            continue
+        elif tr.th.text in ["四字熟語", "言葉"]:
+            tr.decompose()
+        elif tr.th.text == "読み方":
+            if re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
+                tr.decompose()
+        elif tr.th.text == "意味":
+            imi = tr.td
+            imi.name = "div"
+            soup.body.insert(0, imi)
+            tr.decompose()
+    if soup.find("tr") is None:
+        soup.table.decompose()
+
+
+def __insert_headword_line(soup, entry):
+    headword_line = soup.new_tag("span")
+    headword_line.string = f"{entry.get_first_reading()}{entry.expression}"
+    soup.body.insert(0, headword_line)
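
Taken together, the new helpers turn a scraped entry page into a Yomichan-ready gloss: punctuation is stripped, cross-reference links are rewritten as `?query=` searches, paragraphs become spans, and the metadata table is reduced to its 意味 (meaning) cell, with a headword line prepended. A condensed sketch of the table step, run on made-up markup:

```
# Condensed, illustrative version of the 意味 extraction (markup is made up).
from bs4 import BeautifulSoup

html = ("<table>"
        "<tr><th>読み方</th><td>あおいきといき</td></tr>"
        "<tr><th>意味</th><td>困り果てて、弱りきること</td></tr>"
        "</table>")
soup = BeautifulSoup(html, "html5lib")

for tr in soup.find_all("tr"):
    if tr.th.text == "意味":
        imi = tr.td
        imi.name = "div"          # the cell becomes a standalone div
        soup.body.insert(0, imi)  # promoted to the top of the gloss
        tr.decompose()            # the spent row is dropped

print(soup.body.div.text)  # 困り果てて、弱りきること
```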


@@ -35,7 +35,7 @@ class JitenonYojiTerminator(JitenonTerminator):
         return ""
 
     def _term_tags(self, entry):
-        tags = entry.kankenkyuu.replace(" ", "").split("/")
+        tags = entry.kankenkyuu.split("/")
         return " ".join(tags)


@@ -12,7 +12,7 @@ class Terminator:
         score = -len(terms)
         glossary = self._glossary(entry)
         sequence = self._sequence(entry)
-        term_tags = ""
+        term_tags = self._term_tags(entry)
         term = [
             expression, reading, definition_tags, inflection_rules,
             score, glossary, sequence, term_tags
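
The eight-element list is Yomichan's term-bank row layout: expression, reading, definition tags, inflection rules, score, glossary, sequence, and term tags. One row might serialize like this (all values are illustrative):

```
# Illustrative term-bank row; the values are made up.
term = [
    "青息吐息",                    # expression
    "あおいきといき",              # reading
    "",                            # definition tags
    "",                            # inflection rules
    -1,                            # score (-len(terms) above)
    ["困り果てて、弱りきること"],  # glossary
    1,                             # sequence
    "4級",                         # term tags
]
```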


@@ -22,19 +22,31 @@ from bot.crawlers import JitenonKotowazaCrawler
 
 crawlers = {
-    'jitenon-yoji': JitenonYojiCrawler,
-    'jitenon-kotowaza': JitenonKotowazaCrawler,
+    "jitenon-yoji": JitenonYojiCrawler,
+    "jitenon-kotowaza": JitenonKotowazaCrawler,
 }
 
+
+def add_target_argument(parser):
+    target_argument_params = {
+        "choices": crawlers.keys(),
+        "help": "Dictionary to convert."
+    }
+    parser.add_argument("target", **target_argument_params)
+
+
+def make_parser():
+    argument_parser_params = {
+        "prog": "jitenbot",
+        "description": "Convert Japanese dictionary files to new formats.",
+    }
+    parser = argparse.ArgumentParser(**argument_parser_params)
+    return parser
+
+
 def parse_args():
-    parser = argparse.ArgumentParser(
-        prog='jitenbot',
-        description='Convert Japanese dictionary files to new formats.')
-    parser.add_argument(
-        'target',
-        choices=crawlers.keys(),
-        help='Dictionary to convert.')
+    parser = make_parser()
+    add_target_argument(parser)
     args = parser.parse_args()
     return args
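
Assuming the definitions above are in scope, the refactored pieces compose like this (hypothetical invocation):

```
# Hypothetical check of the refactored parser construction.
parser = make_parser()
add_target_argument(parser)
args = parser.parse_args(["jitenon-yoji"])
print(args.target)  # jitenon-yoji
```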