First version
Support for Jitenon's yoji dictionary
parent 0a155809fe
commit f9ad9e6d21
.gitignore (vendored) | 4
@@ -1,3 +1,7 @@
webcache/
output/
notes/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
config.json | 6 (new file)
@@ -0,0 +1,6 @@
{
    "http-request-headers": {
        "User-Agent": "",
        "Accept-Language": ""
    }
}
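scraper.py (later in this commit) loads these headers when it builds its requests session and applies them to every request, so the empty User-Agent and Accept-Language strings are placeholders for the user to fill in.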
crawlers.py | 41 (new file)
@@ -0,0 +1,41 @@
import re

from bs4 import BeautifulSoup

import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji


def jitenon_yoji_crawler():
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
    # select() takes only a CSS selector, so restrict to anchors
    # with an href attribute via [href] rather than a keyword filter
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            sequence = int(re.search(r"([0-9]+)\.html", kana_href).group(1))
            if sequence in entries:
                continue
            yoji = jitenon.scrape(kana_href)
            yoji_soup = BeautifulSoup(yoji, features="html5lib")
            entry = JitenonYoji(sequence)
            entry.add_soup(yoji_soup)
            entries[sequence] = entry
    terms = []
    for entry in entries.values():
        for term in entry.yomichan_terms():
            terms.append(term)
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": "test",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": "© 2012-2023 四字熟語辞典オンライン",
        "description": "",
    }
    Yomichan.create_zip(terms, index)
jitenbot.py | 22 (new file)
@@ -0,0 +1,22 @@
""" jitenbot
Copyright (C) 2023 Stephen Kraus

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""

from crawlers import jitenon_yoji_crawler

if __name__ == "__main__":
    jitenon_yoji_crawler()
jitenon_yoji.py | 116 (new file)
@@ -0,0 +1,116 @@
import re

import yomichan as Yomichan
import util as Util


class JitenonYoji:
    columns = {
        "四字熟語": ["yojijukugo", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]
        for column in self.columns.values():
            setattr(self, column[0], column[1])

    def add_soup(self, yoji_soup):
        table = yoji_soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            # rows under a rowspan'd <th> reuse the previous column name
            colname = row.th.text if row.th is not None else colname
            colval = row.td.decode_contents()
            self.__set_column(colname, colval)
        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]

    def yomichan_terms(self):
        terms = []
        for idx, headword in enumerate(self.__headwords()):
            (yoji, reading) = headword
            definition_tags = None
            inflection_rules = ""
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
            term_tags = ""
            term = [
                yoji, reading, definition_tags, inflection_rules,
                score, glossary, sequence, term_tags
            ]
            terms.append(term)
        return terms

    def __set_column(self, colname, colval):
        attr_name = self.columns[colname][0]
        attr_value = getattr(self, attr_name)
        colval = colval.replace("\n", "").replace(",", "、").strip()
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if len(attr_value) == 0:
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)
                setattr(self, attr_name, attr_value)

    def __headwords(self):
        words = []
        for yomikata in self.__yomikatas():
            headword = [self.yojijukugo, yomikata]
            if headword in words:
                words.remove(headword)
            words.append(headword)
        for headword in self.__ikei_headwords():
            if headword in words:
                words.remove(headword)
            words.append(headword)
        return words

    def __yomikatas(self):
        # the patterns below match literal fullwidth （） parentheses
        # used in the site's reading notations
        m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
        if m:
            return [self.yomikata]
        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
        if m:
            return [m.group(1)]
        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", self.yomikata)
        if m:
            return Util.expand_shouryaku(self.yomikata)
        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", self.yomikata)
        if m:
            yomikatas = [m.group(1)]
            alts = m.group(2).split("/")
            for alt in alts:
                yomikatas.append(alt.strip())
            return yomikatas
        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")

    def __ikei_headwords(self):
        ikei_headwords = []
        for val in self.ikei:
            m = re.search(r"^([^（]+)（([ぁ-ヿ]+)）$", val)
            if m:
                headword = [m.group(1), m.group(2)]
                ikei_headwords.append(headword)
            else:
                raise Exception(f"Invalid 異形 format: {val}\n{self}")
        return ikei_headwords

    def __str__(self):
        colvals = [str(self.sequence)]
        for attr in self.columns.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append(";".join(attr_val))
        return ",".join(colvals)
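For a sense of the output shape, a minimal sketch of yomichan_terms() with hypothetical field values (no page scrape involved, so the glossary is still the placeholder set in __init__):

from jitenon_yoji import JitenonYoji

entry = JitenonYoji(1)
entry.yojijukugo = "以心伝心"       # hypothetical 四字熟語
entry.yomikata = "いしんでんしん"   # plain-kana reading, first __yomikatas branch
print(entry.yomichan_terms())
# [['以心伝心', 'いしんでんしん', None, '', 0, [''], 1, '']]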
scraper.py | 103 (new file)
@@ -0,0 +1,103 @@
import time
import requests
import re
import os
import json
import hashlib

from pathlib import Path
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datetime import datetime


class Scraper():
    def __init__(self):
        self.netloc_re = \
            re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
        self.__set_session()

    def scrape(self, urlstring):
        # urlparse takes a bare scheme name, not "https://"
        url = urlparse(urlstring, scheme='https', allow_fragments=True)
        self.__validate_url(url)
        cache_path = self.__cache_path(url)
        cache_contents = self.__read_cache(cache_path)
        if cache_contents is not None:
            return cache_contents
        html = self.__get(urlstring)
        with open(cache_path, "w") as f:
            f.write(html)
        return html

    def __set_session(self):
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        with open("config.json", "r") as f:
            config = json.load(f)
        headers = config["http-request-headers"]
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.headers.update(headers)

    def __validate_url(self, url):
        valid = False
        if self.netloc_re.match(url.netloc):
            valid = True
        # may add more validators later
        if not valid:
            raise Exception(f"Invalid URL: {url.geturl()}")

    def __cache_path(self, url):
        cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
        netloc_match = self.netloc_re.match(url.netloc)
        if netloc_match.group(1) is not None:
            subdomain = netloc_match.group(1)
            cache_dir = os.path.join(cache_dir, subdomain)
        paths = re.findall(r"/([^/]+)", url.path)
        if len(paths) < 1:
            raise Exception(f"Invalid path in URL: {url.geturl()}")
        for x in paths[:len(paths)-1]:
            cache_dir = os.path.join(cache_dir, x)
        if not Path(cache_dir).is_dir():
            os.makedirs(cache_dir)
        basename = paths[-1].replace(".", "_")
        urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest()
        filename = f"{basename}-{urlstring_hash}.html"
        cache_path = os.path.join(cache_dir, filename)
        return cache_path

    def __read_cache(self, cache_path):
        if Path(cache_path).is_file():
            with open(cache_path, "r") as f:
                file_contents = f.read()
        else:
            file_contents = None
        return file_contents

    def __get(self, urlstring):
        # this method receives the raw URL string, so it can be printed
        # and passed to requests directly
        delay = 10
        time.sleep(delay)
        now = datetime.now().strftime("%H:%M:%S")
        print(f"{now} scraping {urlstring} ...", end='')
        try:
            response = self.session.get(urlstring, timeout=10)
            print("OK")
            return response.text
        except Exception:
            print("failed")
            print("resetting session and trying again")
            self.__set_session()
            response = self.session.get(urlstring, timeout=10)
            return response.text


class Jitenon(Scraper):
    def __init__(self):
        self.domain = r"jitenon\.jp"
        Scraper.__init__(self)
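A quick usage sketch (the URL is one the crawler actually visits; the cache location follows from __cache_path):

import scraper as Scraper

jitenon = Scraper.Jitenon()
# First call sleeps 10 seconds, fetches the page, and writes the HTML
# under webcache/jitenon/yoji/cat/; repeat calls are answered from cache.
html = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")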
util.py | 26 (new file)
@@ -0,0 +1,26 @@
import re


def expand_shouryaku(shouryaku):
    """Return a list of words described by a 省略 notation.

    e.g. "有（り）合（わ）せ" -> [
        "有り合わせ",
        "有合わせ",
        "有り合せ",
        "有合せ"
    ]
    """
    # literal fullwidth parentheses mark the optional segments
    groups = re.findall(r"([^（]*)(（([^（]+)）)?", shouryaku)
    forms = [""]
    for group in groups:
        new_forms = []
        for form in forms:
            new_forms.append(form + group[0])
        forms = new_forms.copy()
        if group[2] == '':
            continue
        new_forms = []
        for form in forms:
            new_forms.append(form + group[2])
        forms = new_forms.copy() + forms.copy()
    return forms
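A quick check of the docstring example (assuming the fullwidth 省略 parentheses shown above):

import util

# Each （…） group is optional, so n groups yield 2**n spelling variants.
print(util.expand_shouryaku("有（り）合（わ）せ"))
# ['有り合わせ', '有合わせ', '有り合せ', '有合せ']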
yomichan.py | 109 (new file)
@@ -0,0 +1,109 @@
import json
import os
import shutil
import uuid

from pathlib import Path
from css_parser import parseStyle


def create_zip(terms, index, tags=[]):
    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)

    # split the terms into term_bank_N.json files of 500 entries each
    terms_per_file = 500
    max_i = int(len(terms) / terms_per_file) + 1
    for i in range(max_i):
        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
        with open(term_file, "w", encoding='utf8') as f:
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)

    index_file = os.path.join(build_directory, "index.json")
    with open(index_file, 'w', encoding='utf8') as f:
        json.dump(index, f, indent=4, ensure_ascii=False)

    if len(tags) > 0:
        tag_file = os.path.join(build_directory, "tag_bank_1.json")
        with open(tag_file, 'w', encoding='utf8') as f:
            json.dump(tags, f, indent=4, ensure_ascii=False)

    zip_filename = index["title"]
    zip_file = f"{zip_filename}.zip"
    if Path(zip_file).is_file():
        os.remove(zip_file)
    shutil.make_archive(zip_filename, "zip", build_directory)
    if not Path("output").is_dir():
        os.mkdir("output")
    shutil.move(zip_file, "output")
    shutil.rmtree(build_directory)


def soup_to_gloss(soup):
    structured_content = __get_markup_structure(soup)
    return {
        "type": "structured-content",
        "content": structured_content
    }


def __get_markup_structure(soup):
    node = {}
    content = []
    for child in soup.children:
        if child.name is None:
            text = child.text.strip()
            if text != "":
                content.append(text)
        else:
            content.append(__get_markup_structure(child))

    node["tag"] = soup.name
    attributes = __get_attributes(soup.attrs)
    for key, val in attributes.items():
        node[key] = val

    if node["tag"] == "th":
        node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
    elif node["tag"] == "p":
        node["tag"] = "span"

    if len(content) == 0:
        pass
    elif len(content) == 1:
        node["content"] = content[0]
    else:
        node["content"] = content

    return node


def __get_attributes(attrs):
    attributes = {}
    if "href" in attrs:
        attributes["href"] = attrs["href"]
    if "rowspan" in attrs:
        attributes["rowSpan"] = int(attrs["rowspan"])
    if "colspan" in attrs:
        attributes["colSpan"] = int(attrs["colspan"])
    if "style" in attrs:
        attributes["style"] = __get_style(attrs["style"])
    return attributes


def __get_style(inline_style_string):
    style = {}
    parsedStyle = parseStyle(inline_style_string)
    if parsedStyle.fontSize != "":
        style["fontSize"] = parsedStyle.fontSize
    if parsedStyle.verticalAlign != "":
        style["verticalAlign"] = parsedStyle.verticalAlign
    if parsedStyle.textDecoration != "":
        style["textDecorationLine"] = parsedStyle.textDecoration
    if parsedStyle.listStyleType != "":
        style["listStyleType"] = parsedStyle.listStyleType
    if parsedStyle.fontStyle != "":
        style["fontStyle"] = parsedStyle.fontStyle
    if parsedStyle.fontWeight != "":
        style["fontWeight"] = parsedStyle.fontWeight
    return style
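For reference, each term row follows Yomichan's format-3 term-bank layout, [expression, reading, definition_tags, inflection_rules, score, glossary, sequence, term_tags]; a minimal sketch of create_zip with hypothetical data:

import yomichan as Yomichan

terms = [
    ["一石二鳥", "いっせきにちょう", None, "", 0,
     ["two birds with one stone"], 1, ""],
]
index = {"title": "example", "revision": "1", "sequenced": True, "format": 3}
Yomichan.create_zip(terms, index)   # writes output/example.zip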