From 83a182e682063a72f682ff5891a688a3fe821856 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Mon, 10 Apr 2023 15:20:33 -0500 Subject: [PATCH] Reorganize file structure --- crawlers.py | 59 ++++-------------- jitenon_yoji.py => entries/jitenon.py | 55 +++++------------ entries/jitenon_kotowaza.py | 34 +++++++++++ entries/jitenon_yoji.py | 38 ++++++++++++ yomichan/export.py | 88 +++++++++++++++++++++++++++ yomichan/grammar.py | 38 ++++++++++++ yomichan.py => yomichan/soup.py | 50 ++------------- 7 files changed, 231 insertions(+), 131 deletions(-) rename jitenon_yoji.py => entries/jitenon.py (66%) create mode 100644 entries/jitenon_kotowaza.py create mode 100644 entries/jitenon_yoji.py create mode 100644 yomichan/export.py create mode 100644 yomichan/grammar.py rename yomichan.py => yomichan/soup.py (60%) diff --git a/crawlers.py b/crawlers.py index ebaf0ed..f923a46 100644 --- a/crawlers.py +++ b/crawlers.py @@ -2,9 +2,9 @@ import re from bs4 import BeautifulSoup import scraper as Scraper -import yomichan as Yomichan -from jitenon_yoji import JitenonYoji -from jitenon_kotowaza import JitenonKotowaza +import yomichan.export as YomichanExport +from entries.jitenon_kotowaza import JitenonKotowaza +from entries.jitenon_yoji import JitenonYoji def run_all(): @@ -13,7 +13,7 @@ def run_all(): def jitenon_yoji(): - entries = {} + seq_to_entries = {} jitenon = Scraper.Jitenon() gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") @@ -24,34 +24,18 @@ def jitenon_yoji(): for kana_a in kana_soup.select(".word_box a", href=True): kana_href = kana_a['href'] sequence = int(re.search(r"([0-9]+).html", kana_href).group(1)) - if sequence in entries: + if sequence in seq_to_entries: continue yoji_doc = jitenon.scrape(kana_href) entry = JitenonYoji(sequence) entry.add_document(yoji_doc) - entries[sequence] = entry - terms = [] - attribution = "" - modified_date = None - for entry in entries.values(): - if modified_date is None or entry.modified_date > modified_date: - modified_date = entry.modified_date - attribution = entry.attribution - for term in entry.yomichan_terms(): - terms.append(term) - index = { - "title": "四字熟語辞典オンライン", - "revision": f"jitenon-yoji.{modified_date}", - "sequenced": True, - "format": 3, - "url": "https://yoji.jitenon.jp/", - "attribution": attribution, - } - Yomichan.create_zip(terms, index) + seq_to_entries[sequence] = entry + entries = seq_to_entries.values() + YomichanExport.jitenon_yoji(entries) def jitenon_kotowaza(): - entries = {} + seq_to_entries = {} jitenon = Scraper.Jitenon() gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") @@ -65,29 +49,12 @@ def jitenon_kotowaza(): if m: sequence = int(m.group(1)) else: - # print(f"Skipping {kana_href}") continue - if sequence in entries: + if sequence in seq_to_entries: continue kotowaza_doc = jitenon.scrape(kana_href) entry = JitenonKotowaza(sequence) entry.add_document(kotowaza_doc) - entries[sequence] = entry - terms = [] - attribution = "" - modified_date = None - for entry in entries.values(): - if modified_date is None or entry.modified_date > modified_date: - modified_date = entry.modified_date - attribution = entry.attribution - for term in entry.yomichan_terms(): - terms.append(term) - index = { - "title": "故事・ことわざ・慣用句オンライン", - "revision": f"jitenon-kotowaza.{modified_date}", - "sequenced": True, - "format": 3, - "url": "https://kotowaza.jitenon.jp/", - "attribution": attribution, - } - Yomichan.create_zip(terms, index) + seq_to_entries[sequence] = entry + entries = seq_to_entries.values() + YomichanExport.jitenon_kotowaza(entries) diff --git a/jitenon_yoji.py b/entries/jitenon.py similarity index 66% rename from jitenon_yoji.py rename to entries/jitenon.py index 4345dca..d081458 100644 --- a/jitenon_yoji.py +++ b/entries/jitenon.py @@ -2,22 +2,11 @@ import re from datetime import datetime, date from bs4 import BeautifulSoup -import yomichan as Yomichan +import yomichan.soup as YomichanSoup import util as Util -class JitenonYoji: - columns = { - "四字熟語": ["expression", ""], - "読み方": ["yomikata", ""], - "意味": ["imi", ""], - "出典": ["shutten", ""], - "漢検級": ["kankenkyuu", ""], - "場面用途": ["bamenyouto", ""], - "異形": ["ikei", []], - "類義語": ["ruigigo", []], - } - +class Jitenon: def __init__(self, sequence): self.sequence = sequence self.yomichan_glossary = [""] @@ -35,26 +24,10 @@ class JitenonYoji: colname = "" for row in rows: colname = row.th.text if row.th is not None else colname - colval = row.td.decode_contents() + colval = row.td.text self.__set_column(colname, colval) - self.yomichan_glossary = [Yomichan.soup_to_gloss(table)] - - def yomichan_terms(self): - terms = [] - for idx, headword in enumerate(self.__headwords()): - (yoji, reading) = headword - definition_tags = None - inflection_rules = "" - score = -idx - glossary = self.yomichan_glossary - sequence = self.sequence - term_tags = "" - term = [ - yoji, reading, definition_tags, inflection_rules, - score, glossary, sequence, term_tags - ] - terms.append(term) - return terms + gloss = YomichanSoup.make_gloss(table) # note: modifies table + self.yomichan_glossary = [gloss] def __set_modified_date(self, html): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html) @@ -76,7 +49,7 @@ class JitenonYoji: attr_value.append(colval) setattr(self, attr_name, attr_value) - def __headwords(self): + def _headwords(self): words = [] for yomikata in self.__yomikatas(): headword = [self.expression, yomikata] @@ -91,33 +64,35 @@ class JitenonYoji: def __yomikatas(self): yomikata = self.yomikata.replace(" ", "") - m = re.search(r"^[ぁ-ヿ]+$", yomikata) + m = re.search(r"^[ぁ-ヿ、]+$", yomikata) if m: return [yomikata] - m = re.search(r"^([ぁ-ヿ]+)
", yomikata) + m = re.search(r"^([ぁ-ヿ、]+)※", yomikata) if m: return [m.group(1)] - m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata) + m = re.search(r"^[ぁ-ヿ、]+([ぁ-ヿ、])[ぁ-ヿ、]+$", yomikata) if m: return Util.expand_shouryaku(yomikata) - m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata) + m = re.search(r"^([ぁ-ヿ、]+)(([ぁ-ヿ/\s、]+))$", yomikata) if m: yomikatas = [m.group(1)] alts = m.group(2).split("/") for alt in alts: yomikatas.append(alt.strip()) return yomikatas - raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}") + print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") + return "" def __ikei_headwords(self): ikei_headwords = [] for val in self.ikei: - m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val) + val = val.replace(" ", "") + m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val) if m: headword = [m.group(1), m.group(2)] ikei_headwords.append(headword) else: - raise Exception(f"Invalid 異形 format: {val}\n{self}") + print(f"Invalid 異形 format: {val}\n{self}\n") return ikei_headwords def __str__(self): diff --git a/entries/jitenon_kotowaza.py b/entries/jitenon_kotowaza.py new file mode 100644 index 0000000..1b7b352 --- /dev/null +++ b/entries/jitenon_kotowaza.py @@ -0,0 +1,34 @@ +from entries.jitenon import Jitenon +import yomichan.grammar as Grammar + + +class JitenonKotowaza(Jitenon): + columns = { + "言葉": ["expression", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "出典": ["shutten", ""], + "例文": ["reibun", ""], + "異形": ["ikei", []], + "類句": ["ruiku", []], + } + + def __init__(self, sequence): + Jitenon.__init__(self, sequence) + + def yomichan_terms(self): + terms = [] + for idx, headword in enumerate(self._headwords()): + (expression, reading) = headword + definition_tags = None + inflection_rules = Grammar.sudachi_rules(expression, reading) + score = -idx + glossary = self.yomichan_glossary + sequence = self.sequence + term_tags = "" + term = [ + expression, reading, definition_tags, inflection_rules, + score, glossary, sequence, term_tags + ] + terms.append(term) + return terms diff --git a/entries/jitenon_yoji.py b/entries/jitenon_yoji.py new file mode 100644 index 0000000..286d44e --- /dev/null +++ b/entries/jitenon_yoji.py @@ -0,0 +1,38 @@ +from entries.jitenon import Jitenon + + +class JitenonYoji(Jitenon): + columns = { + "四字熟語": ["expression", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "出典": ["shutten", ""], + "漢検級": ["kankenkyuu", ""], + "場面用途": ["bamenyouto", ""], + "異形": ["ikei", []], + "類義語": ["ruigigo", []], + } + + def __init__(self, sequence): + Jitenon.__init__(self, sequence) + + def yomichan_terms(self): + terms = [] + for idx, headword in enumerate(self._headwords()): + (expression, reading) = headword + definition_tags = None + inflection_rules = "" + score = -idx + glossary = self.yomichan_glossary + sequence = self.sequence + term_tags = self.__term_tags() + term = [ + expression, reading, definition_tags, inflection_rules, + score, glossary, sequence, term_tags + ] + terms.append(term) + return terms + + def __term_tags(self): + tags = self.kankenkyuu.replace(" ", "").split("/") + return " ".join(tags) diff --git a/yomichan/export.py b/yomichan/export.py new file mode 100644 index 0000000..ef30db9 --- /dev/null +++ b/yomichan/export.py @@ -0,0 +1,88 @@ +import json +import os +import shutil +import uuid +from pathlib import Path + + +def jitenon_yoji(entries): + terms, modified_date, attribution = __terms(entries) + index = { + "title": "四字熟語辞典オンライン", + "revision": f"jitenon-yoji.{modified_date}", + "sequenced": True, + "format": 3, + "url": "https://yoji.jitenon.jp/", + "attribution": attribution, + } + tags = [ + ["1級", "frequent", 0, "漢字検定(漢検)1級の四字熟語", 0], + ["準1級", "frequent", 0, "漢字検定(漢検)準1級の四字熟語", 0], + ["2級", "frequent", 0, "漢字検定(漢検)2級の四字熟語", 0], + ["準2級", "frequent", 0, "漢字検定(漢検)準2級の四字熟語", 0], + ["3級", "frequent", 0, "漢字検定(漢検)3級の四字熟語", 0], + ["4級", "frequent", 0, "漢字検定(漢検)4級の四字熟語", 0], + ["5級", "frequent", 0, "漢字検定(漢検)5級の四字熟語", 0], + ] + __create_zip(terms, index, tags) + + +def jitenon_kotowaza(entries): + terms, modified_date, attribution = __terms(entries) + index = { + "title": "故事・ことわざ・慣用句オンライン", + "revision": f"jitenon-kotowaza.{modified_date}", + "sequenced": True, + "format": 3, + "url": "https://kotowaza.jitenon.jp/", + "attribution": attribution, + } + __create_zip(terms, index) + + +def __terms(entries): + terms = [] + modified_date = None + attribution = "" + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + attribution = entry.attribution + for term in entry.yomichan_terms(): + terms.append(term) + return terms, modified_date, attribution + + +def __create_zip(terms, index, tags=[]): + build_directory = str(uuid.uuid4()) + os.mkdir(build_directory) + + terms_per_file = 1000 + max_i = int(len(terms) / terms_per_file) + 1 + for i in range(max_i): + term_file = os.path.join(build_directory, f"term_bank_{i+1}.json") + with open(term_file, "w", encoding='utf8') as f: + start = terms_per_file * i + end = terms_per_file * (i + 1) + json.dump(terms[start:end], f, indent=4, ensure_ascii=False) + + index_file = os.path.join(build_directory, "index.json") + with open(index_file, 'w', encoding='utf8') as f: + json.dump(index, f, indent=4, ensure_ascii=False) + + if len(tags) > 0: + tag_file = os.path.join(build_directory, "tag_bank_1.json") + with open(tag_file, 'w', encoding='utf8') as f: + json.dump(tags, f, indent=4, ensure_ascii=False) + + zip_filename = index["title"] + zip_file = f"{zip_filename}.zip" + shutil.make_archive(zip_filename, "zip", build_directory) + out_dir = "output" + out_file = os.path.join(out_dir, zip_file) + if not Path(out_dir).is_dir(): + os.mkdir(out_dir) + elif Path(out_file).is_file(): + os.remove(out_file) + shutil.move(zip_file, out_dir) + shutil.rmtree(build_directory) diff --git a/yomichan/grammar.py b/yomichan/grammar.py new file mode 100644 index 0000000..6f30061 --- /dev/null +++ b/yomichan/grammar.py @@ -0,0 +1,38 @@ +from sudachipy import tokenizer +from sudachipy import dictionary + + +def sudachi_rules(expression, reading): + tokenizer_obj = dictionary.Dictionary().create() + splitmode = tokenizer.Tokenizer.SplitMode.A + tokens = tokenizer_obj.tokenize(expression, splitmode) + pos = tokens[len(tokens)-1].part_of_speech()[4] + tags = pos.split("-") + rules = __sudachi_tags_to_rules(tags, expression, reading) + return rules + + +def __sudachi_tags_to_rules(tags, expression, reading): + u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む", + "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"] + rules = set() + for tag in tags: + if expression.endswith("い"): + if tag == "形容詞" or "ナイ" in tag or "タイ" in tag: + rules.add("adj-i") + if expression.endswith("る"): + if "一" in tag or tag == "レル": + rules.add("v1") + if "二" in tag or "四" in tag or "五" in tag: + for u_ending in u_endings: + if expression.endswith(u_ending): + rules.add("v5") + break + if "サ" in tag and (expression.endswith("する") or expression == "為る"): + rules.add("vs") + if "サ" in tag and expression.endswith("ずる"): + rules.add("vz") + if expression.endswith("来る") and reading.endswith("くる"): + rules = set() + rules.add("vk") + return " ".join(list(rules)) diff --git a/yomichan.py b/yomichan/soup.py similarity index 60% rename from yomichan.py rename to yomichan/soup.py index fd928fd..cf12e8d 100644 --- a/yomichan.py +++ b/yomichan/soup.py @@ -1,49 +1,9 @@ -import json -import os -import shutil -import uuid import re -from pathlib import Path from css_parser import parseStyle -def create_zip(terms, index, tags=[]): - build_directory = str(uuid.uuid4()) - os.mkdir(build_directory) - - terms_per_file = 1000 - max_i = int(len(terms) / terms_per_file) + 1 - for i in range(max_i): - term_file = os.path.join(build_directory, f"term_bank_{i+1}.json") - with open(term_file, "w", encoding='utf8') as f: - start = terms_per_file * i - end = terms_per_file * (i + 1) - json.dump(terms[start:end], f, indent=4, ensure_ascii=False) - - index_file = os.path.join(build_directory, "index.json") - with open(index_file, 'w', encoding='utf8') as f: - json.dump(index, f, indent=4, ensure_ascii=False) - - if len(tags) > 0: - tag_file = os.path.join(build_directory, "tag_bank_1.json") - with open(tag_file, 'w', encoding='utf8') as f: - json.dump(tags, f, indent=4, ensure_ascii=False) - - zip_filename = index["title"] - zip_file = f"{zip_filename}.zip" - shutil.make_archive(zip_filename, "zip", build_directory) - out_dir = "output" - out_file = os.path.join(out_dir, zip_file) - if not Path(out_dir).is_dir(): - os.mkdir(out_dir) - elif Path(out_file).is_file(): - os.remove(out_file) - shutil.move(zip_file, out_dir) - shutil.rmtree(build_directory) - - -def soup_to_gloss(soup): - __sanitize_soup(soup) +def make_gloss(soup): + __preprocess_soup(soup) structured_content = __get_markup_structure(soup) return { "type": "structured-content", @@ -51,10 +11,10 @@ def soup_to_gloss(soup): } -def __sanitize_soup(soup): +def __preprocess_soup(soup): patterns = [ - r"^(.+)([ぁ-ヿ]+)$", - r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$" + r"^(.+)([ぁ-ヿ、\s]+)$", + r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" ] for a in soup.find_all("a"): for pattern in patterns: