Reorganize file structure

2023-04-10 15:20:33 -05:00 · 2023-04-10 15:20:33 -05:00 · 83a182e682
parent 16d694d2d2
commit 83a182e682
7 changed files with 231 additions and 131 deletions
--- a/crawlers.py
+++ b/crawlers.py
@ -2,9 +2,9 @@ import re
 from bs4 import BeautifulSoup
 import scraper as Scraper
-import yomichan as Yomichan
+import yomichan.export as YomichanExport
-from jitenon_yoji import JitenonYoji
+from entries.jitenon_kotowaza import JitenonKotowaza
-from jitenon_kotowaza import JitenonKotowaza
+from entries.jitenon_yoji import JitenonYoji
 def run_all():
@ -13,7 +13,7 @@ def run_all():
 def jitenon_yoji():
-    entries = {}
+    seq_to_entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -24,34 +24,18 @@ def jitenon_yoji():
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
            sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if sequence in entries:
+            if sequence in seq_to_entries:
                continue
            yoji_doc = jitenon.scrape(kana_href)
            entry = JitenonYoji(sequence)
            entry.add_document(yoji_doc)
-            entries[sequence] = entry
+            seq_to_entries[sequence] = entry
-    terms = []
+    entries = seq_to_entries.values()
-    attribution = ""
+    YomichanExport.jitenon_yoji(entries)
    modified_date = None
    for entry in entries.values():
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        for term in entry.yomichan_terms():
            terms.append(term)
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": f"jitenon-yoji.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
 def jitenon_kotowaza():
-    entries = {}
+    seq_to_entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -65,29 +49,12 @@ def jitenon_kotowaza():
            if m:
                sequence = int(m.group(1))
            else:
                # print(f"Skipping {kana_href}")
                continue
-            if sequence in entries:
+            if sequence in seq_to_entries:
                continue
            kotowaza_doc = jitenon.scrape(kana_href)
            entry = JitenonKotowaza(sequence)
            entry.add_document(kotowaza_doc)
-            entries[sequence] = entry
+            seq_to_entries[sequence] = entry
-    terms = []
+    entries = seq_to_entries.values()
-    attribution = ""
+    YomichanExport.jitenon_kotowaza(entries)
    modified_date = None
    for entry in entries.values():
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        for term in entry.yomichan_terms():
            terms.append(term)
    index = {
        "title": "故事・ことわざ・慣用句オンライン",
        "revision": f"jitenon-kotowaza.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://kotowaza.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
--- a/entries/jitenon.py
+++ b/entries/jitenon.py
@ -2,22 +2,11 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup
-import yomichan as Yomichan
+import yomichan.soup as YomichanSoup
 import util as Util
-class JitenonYoji:
+class Jitenon:
    columns = {
        "四字熟語": ["expression", ""],
        "読み方":   ["yomikata", ""],
        "意味":     ["imi", ""],
        "出典":     ["shutten", ""],
        "漢検級":   ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形":     ["ikei", []],
        "類義語":   ["ruigigo", []],
    }
    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]
@ -35,26 +24,10 @@ class JitenonYoji:
        colname = ""
        for row in rows:
            colname = row.th.text if row.th is not None else colname
-            colval = row.td.decode_contents()
+            colval = row.td.text
            self.__set_column(colname, colval)
-        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
+        gloss = YomichanSoup.make_gloss(table)  # note: modifies table
-
+        self.yomichan_glossary = [gloss]
    def yomichan_terms(self):
        terms = []
        for idx, headword in enumerate(self.__headwords()):
            (yoji, reading) = headword
            definition_tags = None
            inflection_rules = ""
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
            term_tags = ""
            term = [
                yoji, reading, definition_tags, inflection_rules,
                score, glossary, sequence, term_tags
            ]
            terms.append(term)
        return terms
    def __set_modified_date(self, html):
        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@ -76,7 +49,7 @@ class JitenonYoji:
                attr_value.append(colval)
                setattr(self, attr_name, attr_value)
-    def __headwords(self):
+    def _headwords(self):
        words = []
        for yomikata in self.__yomikatas():
            headword = [self.expression, yomikata]
@ -91,33 +64,35 @@ class JitenonYoji:
    def __yomikatas(self):
        yomikata = self.yomikata.replace(" ", "")
-        m = re.search(r"^[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
        if m:
            return [yomikata]
-        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
        if m:
            return [m.group(1)]
-        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+（[ぁ-ヿ、]）[ぁ-ヿ、]+$", yomikata)
        if m:
            return Util.expand_shouryaku(yomikata)
-        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)（([ぁ-ヿ/\s、]+)）$", yomikata)
        if m:
            yomikatas = [m.group(1)]
            alts = m.group(2).split("/")
            for alt in alts:
                yomikatas.append(alt.strip())
            return yomikatas
-        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
+        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return ""
    def __ikei_headwords(self):
        ikei_headwords = []
        for val in self.ikei:
-            m = re.search(r"^([^（]+)（([ぁ-ヿ]+)）$", val)
+            val = val.replace(" ", "")
            m = re.search(r"^([^（]+)（([ぁ-ヿ、]+)）$", val)
            if m:
                headword = [m.group(1), m.group(2)]
                ikei_headwords.append(headword)
            else:
-                raise Exception(f"Invalid 異形 format: {val}\n{self}")
+                print(f"Invalid 異形 format: {val}\n{self}\n")
        return ikei_headwords
    def __str__(self):
--- a/entries/jitenon_kotowaza.py
+++ b/entries/jitenon_kotowaza.py
@ -0,0 +1,34 @@
 from entries.jitenon import Jitenon
 import yomichan.grammar as Grammar
 class JitenonKotowaza(Jitenon):
    """Entry scraped from the kotowaza (proverbs and idioms) dictionary."""

    # Page column label -> [attribute name, default value].
    # NOTE(review): presumably consumed by the Jitenon base class when it
    # parses an entry table -- confirm against entries/jitenon.py.
    columns = {
        "言葉":   ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味":   ["imi", ""],
        "出典":   ["shutten", ""],
        "例文":   ["reibun", ""],
        "異形":   ["ikei", []],
        "類句":   ["ruiku", []],
    }

    def __init__(self, sequence):
        """Create an entry identified by its sequence number."""
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one Yomichan term row per headword of this entry.

        Each row is laid out as
        [expression, reading, definition tags, inflection rules, score,
        glossary, sequence, term tags]. The score is the negated headword
        position, so earlier headwords score higher; inflection rules are
        derived per headword through Grammar.sudachi_rules.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                Grammar.sudachi_rules(expression, reading),
                -position,
                self.yomichan_glossary,
                self.sequence,
                "",  # term tags
            ]
            for position, (expression, reading)
            in enumerate(self._headwords())
        ]
--- a/entries/jitenon_yoji.py
+++ b/entries/jitenon_yoji.py
@ -0,0 +1,38 @@
 from entries.jitenon import Jitenon
 class JitenonYoji(Jitenon):
    """Entry scraped from the yoji (four-character idiom) dictionary."""

    # Page column label -> [attribute name, default value].
    # NOTE(review): presumably consumed by the Jitenon base class when it
    # parses an entry table -- confirm against entries/jitenon.py.
    columns = {
        "四字熟語": ["expression", ""],
        "読み方":   ["yomikata", ""],
        "意味":     ["imi", ""],
        "出典":     ["shutten", ""],
        "漢検級":   ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形":     ["ikei", []],
        "類義語":   ["ruigigo", []],
    }

    def __init__(self, sequence):
        """Create an entry identified by its sequence number."""
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one Yomichan term row per headword of this entry.

        Each row is laid out as
        [expression, reading, definition tags, inflection rules, score,
        glossary, sequence, term tags]. The score is the negated headword
        position, so earlier headwords score higher. Inflection rules are
        always empty here; term tags come from the kankenkyuu attribute.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                "",  # inflection rules
                -position,
                self.yomichan_glossary,
                self.sequence,
                self.__term_tags(),
            ]
            for position, (expression, reading)
            in enumerate(self._headwords())
        ]

    def __term_tags(self):
        """Return the kankenkyuu value as space-separated tag names.

        ASCII spaces are dropped first, then every "/" separator becomes a
        single space.
        """
        compact = self.kankenkyuu.replace(" ", "")
        return compact.replace("/", " ")
--- a/yomichan/export.py
+++ b/yomichan/export.py
@ -0,0 +1,88 @@
 import json
 import os
 import shutil
 import uuid
 from pathlib import Path
 def jitenon_yoji(entries):
    """Export the yoji entries as a Yomichan dictionary archive.

    Collects the term rows, latest modification date and attribution from
    the entries, then hands them to __create_zip together with the
    dictionary index and one tag row per kanji-kentei level.
    """
    terms, latest_modified, credit = __terms(entries)
    kanken_levels = ("１級", "準１級", "２級", "準２級", "３級", "４級", "５級")
    # Every level shares the same row shape; only the level name differs.
    tag_rows = [
        [level, "frequent", 0, f"漢字検定（漢検）{level}の四字熟語", 0]
        for level in kanken_levels
    ]
    metadata = dict(
        title="四字熟語辞典オンライン",
        revision=f"jitenon-yoji.{latest_modified}",
        sequenced=True,
        format=3,
        url="https://yoji.jitenon.jp/",
        attribution=credit,
    )
    __create_zip(terms, metadata, tag_rows)
 def jitenon_kotowaza(entries):
    """Export the kotowaza entries as a Yomichan dictionary archive.

    Collects the term rows, latest modification date and attribution from
    the entries, then hands them to __create_zip together with the
    dictionary index. No tag bank is written for this dictionary.
    """
    terms, latest_modified, credit = __terms(entries)
    metadata = dict(
        title="故事・ことわざ・慣用句オンライン",
        revision=f"jitenon-kotowaza.{latest_modified}",
        sequenced=True,
        format=3,
        url="https://kotowaza.jitenon.jp/",
        attribution=credit,
    )
    __create_zip(terms, metadata)
 def __terms(entries):
    """Gather term rows and the newest entry's metadata.

    Args:
        entries: Iterable of objects exposing modified_date, attribution
            and yomichan_terms().

    Returns:
        A (terms, modified_date, attribution) tuple: every term row from
        every entry in iteration order, the greatest modified_date seen
        (None when there are no entries) and the attribution of the first
        entry carrying that date ("" when there are no entries).
    """
    collected = []
    newest = None
    credit = ""
    for entry in entries:
        is_newer = newest is None or entry.modified_date > newest
        if is_newer:
            # Strict comparison: on equal dates the earlier entry wins.
            newest = entry.modified_date
            credit = entry.attribution
        collected.extend(entry.yomichan_terms())
    return collected, newest, credit
 def __create_zip(terms, index, tags=None):
    """Write a Yomichan dictionary archive to ``output/<title>.zip``.

    Terms are split across ``term_bank_N.json`` files holding at most 1000
    rows each, next to ``index.json`` and, when tags are given,
    ``tag_bank_1.json``. The files are staged in a uniquely named scratch
    directory under the current working directory, zipped, and the archive
    is moved into ``output``, replacing an existing archive of that name.

    Args:
        terms: List of JSON-serializable term rows.
        index: JSON-serializable dictionary metadata; ``index["title"]``
            is used as the archive's base file name.
        tags: Optional list of JSON-serializable tag rows; no tag bank is
            written when it is empty or omitted.
    """
    tags = [] if tags is None else tags
    terms_per_file = 1000
    # Ceiling division, so an exact multiple of terms_per_file no longer
    # yields a trailing empty bank; one bank is still written for an empty
    # term list, as before.
    bank_count = max(1, -(-len(terms) // terms_per_file))
    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)
    try:
        for i in range(bank_count):
            term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
            with open(term_file, "w", encoding='utf8') as f:
                start = terms_per_file * i
                end = terms_per_file * (i + 1)
                json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
        index_file = os.path.join(build_directory, "index.json")
        with open(index_file, 'w', encoding='utf8') as f:
            json.dump(index, f, indent=4, ensure_ascii=False)
        if tags:
            tag_file = os.path.join(build_directory, "tag_bank_1.json")
            with open(tag_file, 'w', encoding='utf8') as f:
                json.dump(tags, f, indent=4, ensure_ascii=False)
        zip_filename = index["title"]
        zip_file = f"{zip_filename}.zip"
        shutil.make_archive(zip_filename, "zip", build_directory)
        out_dir = "output"
        out_file = os.path.join(out_dir, zip_file)
        if not Path(out_dir).is_dir():
            os.mkdir(out_dir)
        elif Path(out_file).is_file():
            # shutil.move refuses to overwrite, so drop the stale archive.
            os.remove(out_file)
        shutil.move(zip_file, out_dir)
    finally:
        # Remove the scratch directory even when a step above fails.
        shutil.rmtree(build_directory)
--- a/yomichan/grammar.py
+++ b/yomichan/grammar.py
@ -0,0 +1,38 @@
 from sudachipy import tokenizer
 from sudachipy import dictionary
 def sudachi_rules(expression, reading):
    """Return the Yomichan inflection rules for a headword.

    The expression is tokenized with Sudachi in split mode A. Element 4 of
    the last token's part-of-speech tuple is split on "-" into tags, which
    __sudachi_tags_to_rules maps to rule names.
    NOTE(review): element 4 is presumably the conjugation type in
    SudachiPy's part-of-speech layout -- confirm for the installed version.

    Args:
        expression: Headword text to analyse.
        reading: Kana reading of the headword.

    Returns:
        Space-separated rule names, or "" when none apply.
    """
    # Build the dictionary and tokenizer once and reuse them, rather than
    # reconstructing both for every headword.
    tokenizer_obj = getattr(sudachi_rules, "_tokenizer", None)
    if tokenizer_obj is None:
        tokenizer_obj = dictionary.Dictionary().create()
        sudachi_rules._tokenizer = tokenizer_obj
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(expression, splitmode)
    if len(tokens) == 0:
        # Nothing was tokenized (e.g. empty expression): no tags to map.
        tags = []
    else:
        pos = tokens[len(tokens) - 1].part_of_speech()[4]
        tags = pos.split("-")
    return __sudachi_tags_to_rules(tags, expression, reading)
 def __sudachi_tags_to_rules(tags, expression, reading):
    """Map Sudachi tags for a headword to a Yomichan rule string.

    Args:
        tags: Tag strings obtained by splitting a Sudachi part-of-speech
            element on "-".
        expression: Headword text; its ending gates most rules.
        reading: Kana reading, used only for the "来る" / "くる" check.

    Returns:
        The applicable rule names ("adj-i", "v1", "v5", "vs", "vz", "vk")
        joined by single spaces in sorted order, or "" when none apply.
    """
    u_endings = ("う", "く", "す", "つ", "ぬ", "ふ", "む",
                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ")
    rules = set()
    for tag in tags:
        if expression.endswith("い"):
            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
                rules.add("adj-i")
        if expression.endswith("る"):
            if "一" in tag or tag == "レル":
                rules.add("v1")
        if "二" in tag or "四" in tag or "五" in tag:
            # str.endswith accepts a tuple of candidate suffixes.
            if expression.endswith(u_endings):
                rules.add("v5")
        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
            rules.add("vs")
        if "サ" in tag and expression.endswith("ずる"):
            rules.add("vz")
    if expression.endswith("来る") and reading.endswith("くる"):
        # "来る" read "くる" overrides whatever the tags suggested.
        rules = {"vk"}
    # Sort so the output does not depend on set iteration order, which
    # varies between runs under string hash randomization.
    return " ".join(sorted(rules))
--- a/yomichan/soup.py
+++ b/yomichan/soup.py
@ -1,49 +1,9 @@
 import json
 import os
 import shutil
 import uuid
 import re
 from pathlib import Path
 from css_parser import parseStyle
-def create_zip(terms, index, tags=[]):
+def make_gloss(soup):
-    build_directory = str(uuid.uuid4())
+    __preprocess_soup(soup)
    os.mkdir(build_directory)
    terms_per_file = 1000
    max_i = int(len(terms) / terms_per_file) + 1
    for i in range(max_i):
        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
        with open(term_file, "w", encoding='utf8') as f:
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
    index_file = os.path.join(build_directory, "index.json")
    with open(index_file, 'w', encoding='utf8') as f:
        json.dump(index, f, indent=4, ensure_ascii=False)
    if len(tags) > 0:
        tag_file = os.path.join(build_directory, "tag_bank_1.json")
        with open(tag_file, 'w', encoding='utf8') as f:
            json.dump(tags, f, indent=4, ensure_ascii=False)
    zip_filename = index["title"]
    zip_file = f"{zip_filename}.zip"
    shutil.make_archive(zip_filename, "zip", build_directory)
    out_dir = "output"
    out_file = os.path.join(out_dir, zip_file)
    if not Path(out_dir).is_dir():
        os.mkdir(out_dir)
    elif Path(out_file).is_file():
        os.remove(out_file)
    shutil.move(zip_file, out_dir)
    shutil.rmtree(build_directory)
 def soup_to_gloss(soup):
    __sanitize_soup(soup)
    structured_content = __get_markup_structure(soup)
    return {
        "type": "structured-content",
@ -51,10 +11,10 @@ def soup_to_gloss(soup):
    }
-def __sanitize_soup(soup):
+def __preprocess_soup(soup):
    patterns = [
-        r"^(.+)（[ぁ-ヿ]+）$",
+        r"^(.+)（[ぁ-ヿ、\s]+）$",
-        r"^(.+)（[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+）$"
+        r"^(.+)（[ぁ-ヿ、\s]+（[ぁ-ヿ、\s]）[ぁ-ヿ、\s]+）$"
    ]
    for a in soup.find_all("a"):
        for pattern in patterns: