From 79632843cb66eb5656976b79fffd68de7eb61d6d Mon Sep 17 00:00:00 2001
From: stephenmk <stephenmk@users.noreply.github.com>
Date: Sat, 8 Apr 2023 18:17:09 -0500
Subject: [PATCH] Finishing touches on first Jitenon Yoji version

---
 crawlers.py     | 24 ++++++++++++---------
 jitenon_yoji.py | 16 +++++++++++++-
 yomichan.py     | 56 ++++++++++++++++++++++++++++++++-----------------
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/crawlers.py b/crawlers.py
index 51017c5..76c2c84 100644
--- a/crawlers.py
+++ b/crawlers.py
@@ -9,33 +9,37 @@ from jitenon_yoji import JitenonYoji
 def jitenon_yoji_crawler():
     entries = {}
     jitenon = Scraper.Jitenon()
-    gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
+    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
     for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
         gojuon_href = gojuon_a['href']
-        kana = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana, features="html.parser")
+        kana_doc = jitenon.scrape(gojuon_href)
+        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
         for kana_a in kana_soup.select(".word_box a", href=True):
             kana_href = kana_a['href']
             sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
             if sequence in entries:
                 continue
-            yoji = jitenon.scrape(kana_href)
-            yoji_soup = BeautifulSoup(yoji, features="html5lib")
+            yoji_doc = jitenon.scrape(kana_href)
             entry = JitenonYoji(sequence)
-            entry.add_soup(yoji_soup)
+            entry.add_document(yoji_doc)
             entries[sequence] = entry
+
     terms = []
+    attribution = ""
+    modified_date = None
     for entry in entries.values():
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
         for term in entry.yomichan_terms():
             terms.append(term)
     index = {
         "title": "四字熟語辞典オンライン",
-        "revision": "test",
+        "revision": f"jitenon-yoji.{modified_date}",
         "sequenced": True,
         "format": 3,
         "url": "https://yoji.jitenon.jp/",
-        "attribution": "© 2012-2023 四字熟語辞典オンライン",
-        "description": "",
+        "attribution": attribution,
     }
     Yomichan.create_zip(terms, index)
diff --git a/jitenon_yoji.py b/jitenon_yoji.py
index ceabac2..5b48e6d 100644
--- a/jitenon_yoji.py
+++ b/jitenon_yoji.py
@@ -1,4 +1,6 @@
 import re
+from datetime import datetime, date
+from bs4 import BeautifulSoup
 
 import yomichan as Yomichan
 import util as Util
@@ -19,10 +21,15 @@ class JitenonYoji:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]
+        self.modified_date = date(1970, 1, 1)
+        self.attribution = ""
         for column in self.columns.values():
             setattr(self, column[0], column[1])
 
-    def add_soup(self, yoji_soup):
+    def add_document(self, html):
+        yoji_soup = BeautifulSoup(html, features="html5lib")
+        self.__set_modified_date(html)
+        self.attribution = yoji_soup.find(class_="copyright").text
         table = yoji_soup.find(class_="kanjirighttb")
         rows = table.find("tbody").find_all("tr")
         colname = ""
@@ -49,6 +56,13 @@ class JitenonYoji:
             terms.append(term)
         return terms
 
+    def __set_modified_date(self, html):
+        """Store the date of the first "dateModified" value found in html."""
+        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
+        if m:  # without a match, modified_date is left unchanged
+            parsed = datetime.strptime(m.group(1), '%Y-%m-%d')
+            self.modified_date = parsed.date()
+
     def __set_column(self, colname, colval):
         attr_name = self.columns[colname][0]
         attr_value = getattr(self, attr_name)
diff --git a/yomichan.py b/yomichan.py
index 9d970e2..fd928fd 100644
--- a/yomichan.py
+++ b/yomichan.py
@@ -2,6 +2,7 @@ import json
 import os
 import shutil
 import uuid
+import re
 from pathlib import Path
 from css_parser import parseStyle
 
@@ -10,7 +11,7 @@ def create_zip(terms, index, tags=[]):
     build_directory = str(uuid.uuid4())
     os.mkdir(build_directory)
 
-    terms_per_file = 500
+    terms_per_file = 1000
     max_i = int(len(terms) / terms_per_file) + 1
     for i in range(max_i):
         term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
@@ -30,16 +31,19 @@ def create_zip(terms, index, tags=[]):
 
     zip_filename = index["title"]
     zip_file = f"{zip_filename}.zip"
-    if Path(zip_file).is_file():
-        os.remove(zip_file)
     shutil.make_archive(zip_filename, "zip", build_directory)
-    if not Path("output").is_dir():
-        os.mkdir("output")
-    shutil.move(zip_file, "output")
+    out_dir = "output"
+    out_file = os.path.join(out_dir, zip_file)
+    if not Path(out_dir).is_dir():
+        os.mkdir(out_dir)
+    elif Path(out_file).is_file():
+        os.remove(out_file)
+    shutil.move(zip_file, out_dir)
     shutil.rmtree(build_directory)
 
 
 def soup_to_gloss(soup):
+    __sanitize_soup(soup)
     structured_content = __get_markup_structure(soup)
     return {
         "type": "structured-content",
@@ -47,6 +51,23 @@ def soup_to_gloss(soup):
     }
 
 
+def __sanitize_soup(soup):
+    """Rewrite link targets, <p> tags and <th> styles of soup in place."""
+    # Link text ending in a parenthesized kana run; group 1 is what precedes it.
+    readings = (r"^(.+)（[ぁ-ヿ]+）$", r"^(.+)（[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+）$")
+    for anchor in soup.find_all("a"):
+        # Patterns are tried in declared order; the first match wins.
+        hits = (re.search(regex, anchor.text) for regex in readings)
+        found = next((hit for hit in hits if hit), None)
+        if found is not None:
+            anchor['href'] = f"?query={found.group(1)}&wildcards=off"
+    for paragraph in soup.find_all("p"):
+        paragraph.name = "span"
+    centered = "vertical-align: middle; text-align: center;"
+    for header in soup.find_all("th"):
+        header['style'] = centered
+
+
 def __get_markup_structure(soup):
     node = {}
     content = []
@@ -63,11 +84,6 @@ def __get_markup_structure(soup):
     for key, val in attributes.items():
         node[key] = val
 
-    if node["tag"] == "th":
-        node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
-    elif node["tag"] == "p":
-        node["tag"] = "span"
-
     if len(content) == 0:
         pass
     elif len(content) == 1:
@@ -94,16 +110,18 @@ def __get_attributes(attrs):
 def __get_style(inline_style_string):
     style = {}
     parsedStyle = parseStyle(inline_style_string)
-    if parsedStyle.fontSize != "":
-        style["fontSize"] = parsedStyle.fontSize
-    if parsedStyle.verticalAlign != "":
-        style["verticalAlign"] = parsedStyle.verticalAlign
-    if parsedStyle.textDecoration != "":
-        style["textDecorationLine"] = parsedStyle.textDecoration
-    if parsedStyle.listStyleType != "":
-        style["listStyleType"] = parsedStyle.listStyleType
     if parsedStyle.fontStyle != "":
         style["fontStyle"] = parsedStyle.fontStyle
     if parsedStyle.fontWeight != "":
         style["fontWeight"] = parsedStyle.fontWeight
+    if parsedStyle.fontSize != "":
+        style["fontSize"] = parsedStyle.fontSize
+    if parsedStyle.textDecoration != "":
+        style["textDecorationLine"] = parsedStyle.textDecoration
+    if parsedStyle.verticalAlign != "":
+        style["verticalAlign"] = parsedStyle.verticalAlign
+    if parsedStyle.textAlign != "":
+        style["textAlign"] = parsedStyle.textAlign
+    if parsedStyle.listStyleType != "":
+        style["listStyleType"] = parsedStyle.listStyleType
     return style