From 79632843cb66eb5656976b79fffd68de7eb61d6d Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 8 Apr 2023 18:17:09 -0500 Subject: [PATCH] Finishing touches on first Jitenon Yoji version --- crawlers.py | 24 ++++++++++++--------- jitenon_yoji.py | 16 +++++++++++++- yomichan.py | 56 ++++++++++++++++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/crawlers.py b/crawlers.py index 51017c5..76c2c84 100644 --- a/crawlers.py +++ b/crawlers.py @@ -9,33 +9,37 @@ from jitenon_yoji import JitenonYoji def jitenon_yoji_crawler(): entries = {} jitenon = Scraper.Jitenon() - gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") - gojuon_soup = BeautifulSoup(gojuon, features="html.parser") + gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") for gojuon_a in gojuon_soup.select(".kana_area a", href=True): gojuon_href = gojuon_a['href'] - kana = jitenon.scrape(gojuon_href) - kana_soup = BeautifulSoup(kana, features="html.parser") + kana_doc = jitenon.scrape(gojuon_href) + kana_soup = BeautifulSoup(kana_doc, features="html.parser") for kana_a in kana_soup.select(".word_box a", href=True): kana_href = kana_a['href'] sequence = int(re.search(r"([0-9]+).html", kana_href).group(1)) if sequence in entries: continue - yoji = jitenon.scrape(kana_href) - yoji_soup = BeautifulSoup(yoji, features="html5lib") + yoji_doc = jitenon.scrape(kana_href) entry = JitenonYoji(sequence) - entry.add_soup(yoji_soup) + entry.add_document(yoji_doc) entries[sequence] = entry + terms = [] + attribution = "" + modified_date = None for entry in entries.values(): + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + attribution = entry.attribution for term in entry.yomichan_terms(): terms.append(term) index = { "title": "四字熟語辞典オンライン", - "revision": "test", + "revision": f"jitenon-yoji.{modified_date}", "sequenced": True, "format": 3, "url": "https://yoji.jitenon.jp/", - "attribution": "© 2012-2023 四字熟語辞典オンライン", - "description": "", + "attribution": attribution, } Yomichan.create_zip(terms, index) diff --git a/jitenon_yoji.py b/jitenon_yoji.py index ceabac2..5b48e6d 100644 --- a/jitenon_yoji.py +++ b/jitenon_yoji.py @@ -1,4 +1,6 @@ import re +from datetime import datetime, date +from bs4 import BeautifulSoup import yomichan as Yomichan import util as Util @@ -19,10 +21,15 @@ class JitenonYoji: def __init__(self, sequence): self.sequence = sequence self.yomichan_glossary = [""] + self.modified_date = date(1970, 1, 1) + self.attribution = "" for column in self.columns.values(): setattr(self, column[0], column[1]) - def add_soup(self, yoji_soup): + def add_document(self, html): + yoji_soup = BeautifulSoup(html, features="html5lib") + self.__set_modified_date(html) + self.attribution = yoji_soup.find(class_="copyright").text table = yoji_soup.find(class_="kanjirighttb") rows = table.find("tbody").find_all("tr") colname = "" @@ -49,6 +56,13 @@ class JitenonYoji: terms.append(term) return terms + def __set_modified_date(self, html): + m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html) + if not m: + return + date = datetime.strptime(m.group(1), '%Y-%m-%d').date() + self.modified_date = date + def __set_column(self, colname, colval): attr_name = self.columns[colname][0] attr_value = getattr(self, attr_name) diff --git a/yomichan.py b/yomichan.py index 9d970e2..fd928fd 100644 --- a/yomichan.py +++ b/yomichan.py @@ -2,6 +2,7 @@ import json import os import shutil import uuid +import re from pathlib import Path from css_parser import parseStyle @@ -10,7 +11,7 @@ def create_zip(terms, index, tags=[]): build_directory = str(uuid.uuid4()) os.mkdir(build_directory) - terms_per_file = 500 + terms_per_file = 1000 max_i = int(len(terms) / terms_per_file) + 1 for i in range(max_i): term_file = os.path.join(build_directory, f"term_bank_{i+1}.json") @@ -30,16 +31,19 @@ def create_zip(terms, index, tags=[]): zip_filename = index["title"] zip_file = f"{zip_filename}.zip" - if Path(zip_file).is_file(): - os.remove(zip_file) shutil.make_archive(zip_filename, "zip", build_directory) - if not Path("output").is_dir(): - os.mkdir("output") - shutil.move(zip_file, "output") + out_dir = "output" + out_file = os.path.join(out_dir, zip_file) + if not Path(out_dir).is_dir(): + os.mkdir(out_dir) + elif Path(out_file).is_file(): + os.remove(out_file) + shutil.move(zip_file, out_dir) shutil.rmtree(build_directory) def soup_to_gloss(soup): + __sanitize_soup(soup) structured_content = __get_markup_structure(soup) return { "type": "structured-content", @@ -47,6 +51,23 @@ def soup_to_gloss(soup): } +def __sanitize_soup(soup): + patterns = [ + r"^(.+)([ぁ-ヿ]+)$", + r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$" + ] + for a in soup.find_all("a"): + for pattern in patterns: + m = re.search(pattern, a.text) + if m: + a['href'] = f"?query={m.group(1)}&wildcards=off" + break + for p in soup.find_all("p"): + p.name = "span" + for th in soup.find_all("th"): + th['style'] = "vertical-align: middle; text-align: center;" + + def __get_markup_structure(soup): node = {} content = [] @@ -63,11 +84,6 @@ def __get_markup_structure(soup): for key, val in attributes.items(): node[key] = val - if node["tag"] == "th": - node["style"] = {"verticalAlign": "middle", "textAlign": "center"} - elif node["tag"] == "p": - node["tag"] = "span" - if len(content) == 0: pass elif len(content) == 1: @@ -94,16 +110,18 @@ def __get_attributes(attrs): def __get_style(inline_style_string): style = {} parsedStyle = parseStyle(inline_style_string) - if parsedStyle.fontSize != "": - style["fontSize"] = parsedStyle.fontSize - if parsedStyle.verticalAlign != "": - style["verticalAlign"] = parsedStyle.verticalAlign - if parsedStyle.textDecoration != "": - style["textDecorationLine"] = parsedStyle.textDecoration - if parsedStyle.listStyleType != "": - style["listStyleType"] = parsedStyle.listStyleType if parsedStyle.fontStyle != "": style["fontStyle"] = parsedStyle.fontStyle if parsedStyle.fontWeight != "": style["fontWeight"] = parsedStyle.fontWeight + if parsedStyle.fontSize != "": + style["fontSize"] = parsedStyle.fontSize + if parsedStyle.textDecoration != "": + style["textDecorationLine"] = parsedStyle.textDecoration + if parsedStyle.verticalAlign != "": + style["verticalAlign"] = parsedStyle.verticalAlign + if parsedStyle.textAlign != "": + style["textAlign"] = parsedStyle.textAlign + if parsedStyle.listStyleType != "": + style["listStyleType"] = parsedStyle.listStyleType return style