Reorganize file structure

parent 16d694d2d2
commit 83a182e682

crawlers.py (59 lines changed)
@@ -2,9 +2,9 @@ import re
 from bs4 import BeautifulSoup
 
 import scraper as Scraper
-import yomichan as Yomichan
-from jitenon_yoji import JitenonYoji
-from jitenon_kotowaza import JitenonKotowaza
+import yomichan.export as YomichanExport
+from entries.jitenon_kotowaza import JitenonKotowaza
+from entries.jitenon_yoji import JitenonYoji
 
 
 def run_all():
@@ -13,7 +13,7 @@ def run_all():
 
 
 def jitenon_yoji():
-    entries = {}
+    seq_to_entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
     gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -24,34 +24,18 @@ def jitenon_yoji():
         for kana_a in kana_soup.select(".word_box a", href=True):
             kana_href = kana_a['href']
             sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if sequence in entries:
+            if sequence in seq_to_entries:
                 continue
             yoji_doc = jitenon.scrape(kana_href)
             entry = JitenonYoji(sequence)
             entry.add_document(yoji_doc)
-            entries[sequence] = entry
-    terms = []
-    attribution = ""
-    modified_date = None
-    for entry in entries.values():
-        if modified_date is None or entry.modified_date > modified_date:
-            modified_date = entry.modified_date
-            attribution = entry.attribution
-        for term in entry.yomichan_terms():
-            terms.append(term)
-    index = {
-        "title": "四字熟語辞典オンライン",
-        "revision": f"jitenon-yoji.{modified_date}",
-        "sequenced": True,
-        "format": 3,
-        "url": "https://yoji.jitenon.jp/",
-        "attribution": attribution,
-    }
-    Yomichan.create_zip(terms, index)
+            seq_to_entries[sequence] = entry
+    entries = seq_to_entries.values()
+    YomichanExport.jitenon_yoji(entries)
 
 
 def jitenon_kotowaza():
-    entries = {}
+    seq_to_entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
     gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -65,29 +49,12 @@ def jitenon_kotowaza():
             if m:
                 sequence = int(m.group(1))
             else:
-                # print(f"Skipping {kana_href}")
                 continue
-            if sequence in entries:
+            if sequence in seq_to_entries:
                 continue
             kotowaza_doc = jitenon.scrape(kana_href)
             entry = JitenonKotowaza(sequence)
             entry.add_document(kotowaza_doc)
-            entries[sequence] = entry
-    terms = []
-    attribution = ""
-    modified_date = None
-    for entry in entries.values():
-        if modified_date is None or entry.modified_date > modified_date:
-            modified_date = entry.modified_date
-            attribution = entry.attribution
-        for term in entry.yomichan_terms():
-            terms.append(term)
-    index = {
-        "title": "故事・ことわざ・慣用句オンライン",
-        "revision": f"jitenon-kotowaza.{modified_date}",
-        "sequenced": True,
-        "format": 3,
-        "url": "https://kotowaza.jitenon.jp/",
-        "attribution": attribution,
-    }
-    Yomichan.create_zip(terms, index)
+            seq_to_entries[sequence] = entry
+    entries = seq_to_entries.values()
+    YomichanExport.jitenon_kotowaza(entries)
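Note: with this change the crawler functions only collect entries; index metadata, tagging, and zip creation now live in yomichan/export.py. A minimal usage sketch under that assumption (running it performs real HTTP requests against jitenon.jp):

import crawlers

crawlers.jitenon_yoji()       # scrape -> entries.jitenon_yoji.JitenonYoji -> yomichan.export.jitenon_yoji()
crawlers.jitenon_kotowaza()   # scrape -> entries.jitenon_kotowaza.JitenonKotowaza -> yomichan.export.jitenon_kotowaza()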
jitenon_yoji.py → entries/jitenon.py
@@ -2,22 +2,11 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-import yomichan as Yomichan
+import yomichan.soup as YomichanSoup
 import util as Util
 
 
-class JitenonYoji:
-    columns = {
-        "四字熟語": ["expression", ""],
-        "読み方": ["yomikata", ""],
-        "意味": ["imi", ""],
-        "出典": ["shutten", ""],
-        "漢検級": ["kankenkyuu", ""],
-        "場面用途": ["bamenyouto", ""],
-        "異形": ["ikei", []],
-        "類義語": ["ruigigo", []],
-    }
-
+class Jitenon:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]
@@ -35,26 +24,10 @@ class JitenonYoji:
         colname = ""
         for row in rows:
             colname = row.th.text if row.th is not None else colname
-            colval = row.td.decode_contents()
+            colval = row.td.text
             self.__set_column(colname, colval)
-        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
-
-    def yomichan_terms(self):
-        terms = []
-        for idx, headword in enumerate(self.__headwords()):
-            (yoji, reading) = headword
-            definition_tags = None
-            inflection_rules = ""
-            score = -idx
-            glossary = self.yomichan_glossary
-            sequence = self.sequence
-            term_tags = ""
-            term = [
-                yoji, reading, definition_tags, inflection_rules,
-                score, glossary, sequence, term_tags
-            ]
-            terms.append(term)
-        return terms
+        gloss = YomichanSoup.make_gloss(table)  # note: modifies table
+        self.yomichan_glossary = [gloss]
 
     def __set_modified_date(self, html):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@@ -76,7 +49,7 @@ class JitenonYoji:
             attr_value.append(colval)
         setattr(self, attr_name, attr_value)
 
-    def __headwords(self):
+    def _headwords(self):
         words = []
         for yomikata in self.__yomikatas():
             headword = [self.expression, yomikata]
@@ -91,33 +64,35 @@ class JitenonYoji:
 
     def __yomikatas(self):
         yomikata = self.yomikata.replace(" ", "")
-        m = re.search(r"^[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
         if m:
             return [yomikata]
-        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
         if m:
             return [m.group(1)]
-        m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+([ぁ-ヿ、])[ぁ-ヿ、]+$", yomikata)
         if m:
             return Util.expand_shouryaku(yomikata)
-        m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)(([ぁ-ヿ/\s、]+))$", yomikata)
         if m:
             yomikatas = [m.group(1)]
             alts = m.group(2).split("/")
             for alt in alts:
                 yomikatas.append(alt.strip())
             return yomikatas
-        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
+        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
+        return ""
 
     def __ikei_headwords(self):
         ikei_headwords = []
         for val in self.ikei:
-            m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val)
+            val = val.replace(" ", "")
+            m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
             if m:
                 headword = [m.group(1), m.group(2)]
                 ikei_headwords.append(headword)
             else:
-                raise Exception(f"Invalid 異形 format: {val}\n{self}")
+                print(f"Invalid 異形 format: {val}\n{self}\n")
         return ikei_headwords
 
     def __str__(self):
entries/jitenon_kotowaza.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from entries.jitenon import Jitenon
+import yomichan.grammar as Grammar
+
+
+class JitenonKotowaza(Jitenon):
+    columns = {
+        "言葉": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "例文": ["reibun", ""],
+        "異形": ["ikei", []],
+        "類句": ["ruiku", []],
+    }
+
+    def __init__(self, sequence):
+        Jitenon.__init__(self, sequence)
+
+    def yomichan_terms(self):
+        terms = []
+        for idx, headword in enumerate(self._headwords()):
+            (expression, reading) = headword
+            definition_tags = None
+            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            score = -idx
+            glossary = self.yomichan_glossary
+            sequence = self.sequence
+            term_tags = ""
+            term = [
+                expression, reading, definition_tags, inflection_rules,
+                score, glossary, sequence, term_tags
+            ]
+            terms.append(term)
+        return terms
entries/jitenon_yoji.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+from entries.jitenon import Jitenon
+
+
+class JitenonYoji(Jitenon):
+    columns = {
+        "四字熟語": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "漢検級": ["kankenkyuu", ""],
+        "場面用途": ["bamenyouto", ""],
+        "異形": ["ikei", []],
+        "類義語": ["ruigigo", []],
+    }
+
+    def __init__(self, sequence):
+        Jitenon.__init__(self, sequence)
+
+    def yomichan_terms(self):
+        terms = []
+        for idx, headword in enumerate(self._headwords()):
+            (expression, reading) = headword
+            definition_tags = None
+            inflection_rules = ""
+            score = -idx
+            glossary = self.yomichan_glossary
+            sequence = self.sequence
+            term_tags = self.__term_tags()
+            term = [
+                expression, reading, definition_tags, inflection_rules,
+                score, glossary, sequence, term_tags
+            ]
+            terms.append(term)
+        return terms
+
+    def __term_tags(self):
+        tags = self.kankenkyuu.replace(" ", "").split("/")
+        return " ".join(tags)
yomichan/export.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+
+
+def jitenon_yoji(entries):
+    terms, modified_date, attribution = __terms(entries)
+    index = {
+        "title": "四字熟語辞典オンライン",
+        "revision": f"jitenon-yoji.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://yoji.jitenon.jp/",
+        "attribution": attribution,
+    }
+    tags = [
+        ["1級", "frequent", 0, "漢字検定(漢検)1級の四字熟語", 0],
+        ["準1級", "frequent", 0, "漢字検定(漢検)準1級の四字熟語", 0],
+        ["2級", "frequent", 0, "漢字検定(漢検)2級の四字熟語", 0],
+        ["準2級", "frequent", 0, "漢字検定(漢検)準2級の四字熟語", 0],
+        ["3級", "frequent", 0, "漢字検定(漢検)3級の四字熟語", 0],
+        ["4級", "frequent", 0, "漢字検定(漢検)4級の四字熟語", 0],
+        ["5級", "frequent", 0, "漢字検定(漢検)5級の四字熟語", 0],
+    ]
+    __create_zip(terms, index, tags)
+
+
+def jitenon_kotowaza(entries):
+    terms, modified_date, attribution = __terms(entries)
+    index = {
+        "title": "故事・ことわざ・慣用句オンライン",
+        "revision": f"jitenon-kotowaza.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://kotowaza.jitenon.jp/",
+        "attribution": attribution,
+    }
+    __create_zip(terms, index)
+
+
+def __terms(entries):
+    terms = []
+    modified_date = None
+    attribution = ""
+    for entry in entries:
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    return terms, modified_date, attribution
+
+
+def __create_zip(terms, index, tags=[]):
+    build_directory = str(uuid.uuid4())
+    os.mkdir(build_directory)
+
+    terms_per_file = 1000
+    max_i = int(len(terms) / terms_per_file) + 1
+    for i in range(max_i):
+        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
+        with open(term_file, "w", encoding='utf8') as f:
+            start = terms_per_file * i
+            end = terms_per_file * (i + 1)
+            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
+
+    index_file = os.path.join(build_directory, "index.json")
+    with open(index_file, 'w', encoding='utf8') as f:
+        json.dump(index, f, indent=4, ensure_ascii=False)
+
+    if len(tags) > 0:
+        tag_file = os.path.join(build_directory, "tag_bank_1.json")
+        with open(tag_file, 'w', encoding='utf8') as f:
+            json.dump(tags, f, indent=4, ensure_ascii=False)
+
+    zip_filename = index["title"]
+    zip_file = f"{zip_filename}.zip"
+    shutil.make_archive(zip_filename, "zip", build_directory)
+    out_dir = "output"
+    out_file = os.path.join(out_dir, zip_file)
+    if not Path(out_dir).is_dir():
+        os.mkdir(out_dir)
+    elif Path(out_file).is_file():
+        os.remove(out_file)
+    shutil.move(zip_file, out_dir)
+    shutil.rmtree(build_directory)
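For reference, __create_zip assembles the dictionary in a throwaway UUID-named directory before zipping it and moving the archive to output/. The resulting layout, as implied by the code above:

# <uuid build dir>/
#     index.json         title, revision, sequenced, format 3, url, attribution
#     term_bank_1.json   terms 1-1000
#     term_bank_2.json   terms 1001-2000, and so on
#     tag_bank_1.json    only when tags are passed (the 漢検級 tag list for the yoji dictionary)
# packed into "<index title>.zip" and moved to output/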
yomichan/grammar.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+from sudachipy import tokenizer
+from sudachipy import dictionary
+
+
+def sudachi_rules(expression, reading):
+    tokenizer_obj = dictionary.Dictionary().create()
+    splitmode = tokenizer.Tokenizer.SplitMode.A
+    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    pos = tokens[len(tokens)-1].part_of_speech()[4]
+    tags = pos.split("-")
+    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    return rules
+
+
+def __sudachi_tags_to_rules(tags, expression, reading):
+    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+    rules = set()
+    for tag in tags:
+        if expression.endswith("い"):
+            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
+                rules.add("adj-i")
+        if expression.endswith("る"):
+            if "一" in tag or tag == "レル":
+                rules.add("v1")
+        if "二" in tag or "四" in tag or "五" in tag:
+            for u_ending in u_endings:
+                if expression.endswith(u_ending):
+                    rules.add("v5")
+                    break
+        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
+            rules.add("vs")
+        if "サ" in tag and expression.endswith("ずる"):
+            rules.add("vz")
+    if expression.endswith("来る") and reading.endswith("くる"):
+        rules = set()
+        rules.add("vk")
+    return " ".join(list(rules))
yomichan.py → yomichan/soup.py
@@ -1,49 +1,9 @@
-import json
-import os
-import shutil
-import uuid
 import re
-from pathlib import Path
 from css_parser import parseStyle
 
 
-def create_zip(terms, index, tags=[]):
-    build_directory = str(uuid.uuid4())
-    os.mkdir(build_directory)
-
-    terms_per_file = 1000
-    max_i = int(len(terms) / terms_per_file) + 1
-    for i in range(max_i):
-        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
-        with open(term_file, "w", encoding='utf8') as f:
-            start = terms_per_file * i
-            end = terms_per_file * (i + 1)
-            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
-
-    index_file = os.path.join(build_directory, "index.json")
-    with open(index_file, 'w', encoding='utf8') as f:
-        json.dump(index, f, indent=4, ensure_ascii=False)
-
-    if len(tags) > 0:
-        tag_file = os.path.join(build_directory, "tag_bank_1.json")
-        with open(tag_file, 'w', encoding='utf8') as f:
-            json.dump(tags, f, indent=4, ensure_ascii=False)
-
-    zip_filename = index["title"]
-    zip_file = f"{zip_filename}.zip"
-    shutil.make_archive(zip_filename, "zip", build_directory)
-    out_dir = "output"
-    out_file = os.path.join(out_dir, zip_file)
-    if not Path(out_dir).is_dir():
-        os.mkdir(out_dir)
-    elif Path(out_file).is_file():
-        os.remove(out_file)
-    shutil.move(zip_file, out_dir)
-    shutil.rmtree(build_directory)
-
-
-def soup_to_gloss(soup):
-    __sanitize_soup(soup)
+def make_gloss(soup):
+    __preprocess_soup(soup)
     structured_content = __get_markup_structure(soup)
     return {
         "type": "structured-content",
@@ -51,10 +11,10 @@ def soup_to_gloss(soup):
     }
 
 
-def __sanitize_soup(soup):
+def __preprocess_soup(soup):
     patterns = [
-        r"^(.+)([ぁ-ヿ]+)$",
-        r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$"
+        r"^(.+)([ぁ-ヿ、\s]+)$",
+        r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
     ]
     for a in soup.find_all("a"):
         for pattern in patterns: