From 6dbc8b90cec6a9a9347164cb899f30be58623eaf Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 6 May 2023 16:55:00 -0500 Subject: [PATCH] Add entry and term factories --- bot/crawlers/crawlers.py | 93 ++++++++++-------------- bot/crawlers/factory.py | 4 +- bot/entries/factory.py | 18 +++++ bot/yomichan/{ => exporters}/export.py | 53 ++++++-------- bot/yomichan/exporters/factory.py | 18 +++++ bot/yomichan/terms/daijirin2.py | 8 +- bot/yomichan/terms/factory.py | 18 +++++ bot/yomichan/terms/jitenon.py | 16 ++-- bot/yomichan/terms/smk8.py | 8 +- bot/yomichan/terms/terminator.py | 9 ++- data/yomichan_inflection_categories.json | 3 + jitenbot.py | 6 +- 12 files changed, 143 insertions(+), 111 deletions(-) create mode 100644 bot/entries/factory.py rename bot/yomichan/{ => exporters}/export.py (79%) create mode 100644 bot/yomichan/exporters/factory.py create mode 100644 bot/yomichan/terms/factory.py diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py index 7ba495c..4df33a8 100644 --- a/bot/crawlers/crawlers.py +++ b/bot/crawlers/crawlers.py @@ -1,28 +1,23 @@ import os import re +from abc import ABC, abstractmethod from bs4 import BeautifulSoup import bot.scraper as Scraper - -from bot.entries.jitenon import JitenonKokugoEntry -from bot.entries.jitenon import JitenonKotowazaEntry -from bot.entries.jitenon import JitenonYojiEntry -from bot.entries.smk8 import Smk8Entry -from bot.entries.daijirin2 import Daijirin2Entry - -from bot.yomichan.export import JitenonKokugoExporter -from bot.yomichan.export import JitenonKotowazaExporter -from bot.yomichan.export import JitenonYojiExporter -from bot.yomichan.export import Smk8Exporter -from bot.yomichan.export import Daijirin2Exporter +from bot.entries.factory import new_entry +from bot.yomichan.exporters.factory import new_exporter -class _Crawler(): - def __init__(self, args): - self._page_dir = args.page_dir - self._image_dir = args.image_dir +class Crawler(ABC): + def __init__(self, target): + self._target = target self._page_map = {} self._entries = [] + self._page_id_pattern = None + + @abstractmethod + def collect_pages(self, page_dir): + pass def read_pages(self): pages_len = len(self._page_map) @@ -30,19 +25,20 @@ class _Crawler(): for idx, (page_id, page_path) in enumerate(items): update = f"Reading page {idx+1}/{pages_len}" print(update, end='\r', flush=True) - entry = self._entry_class(page_id) + entry = new_entry(self._target, page_id) with open(page_path, "r", encoding="utf-8") as f: page = f.read() entry.set_page(page) self._entries.append(entry) print() - def make_yomichan_dictionary(self): - self._yomi_exporter.export(self._entries, self._image_dir) + def make_yomichan_dictionary(self, image_dir): + exporter = new_exporter(self._target) + exporter.export(self._entries, image_dir) def _parse_page_id(self, page_link): m = re.search(self._page_id_pattern, page_link) - if not m: + if m is None: return None page_id = int(m.group(1)) if page_id in self._page_map: @@ -50,15 +46,13 @@ class _Crawler(): return page_id -class JitenonKokugoCrawler(_Crawler): - def __init__(self, args): - super().__init__(args) - self._entry_class = JitenonKokugoEntry - self._yomi_exporter = JitenonKokugoExporter(args.target) +class JitenonKokugoCrawler(Crawler): + def __init__(self, target): + super().__init__(target) self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php" self._page_id_pattern = r"word/p([0-9]+)$" - def collect_pages(self): + def collect_pages(self, page_dir): jitenon = Scraper.Jitenon() gojuon_doc, _ = jitenon.scrape(self._gojuon_url) gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") @@ -85,11 +79,12 @@ class JitenonKokugoCrawler(_Crawler): print(f"Finished scraping {pages_len} pages") -class _JitenonCrawler(_Crawler): - def __init__(self, args): - super().__init__(args) +class _JitenonCrawler(Crawler): + def __init__(self, target): + super().__init__(target) + self._gojuon_url = None - def collect_pages(self): + def collect_pages(self, page_dir): print("Scraping jitenon.jp") jitenon = Scraper.Jitenon() gojuon_doc, _ = jitenon.scrape(self._gojuon_url) @@ -110,49 +105,41 @@ class _JitenonCrawler(_Crawler): class JitenonYojiCrawler(_JitenonCrawler): - def __init__(self, args): - super().__init__(args) - self._entry_class = JitenonYojiEntry - self._yomi_exporter = JitenonYojiExporter(args.target) + def __init__(self, target): + super().__init__(target) self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html" self._page_id_pattern = r"([0-9]+)\.html$" class JitenonKotowazaCrawler(_JitenonCrawler): - def __init__(self, args): - super().__init__(args) - self._entry_class = JitenonKotowazaEntry - self._yomi_exporter = JitenonKotowazaExporter(args.target) + def __init__(self, target): + super().__init__(target) self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php" self._page_id_pattern = r"([0-9]+)\.php$" -class _MonokakidoCrawler(_Crawler): - def __init__(self, args): - super().__init__(args) +class _MonokakidoCrawler(Crawler): + def __init__(self, target): + super().__init__(target) self._page_id_pattern = r"^([0-9]+)\.xml$" - def collect_pages(self): - print(f"Searching for page files in `{self._page_dir}`") - for pagefile in os.listdir(self._page_dir): + def collect_pages(self, page_dir): + print(f"Searching for page files in `{page_dir}`") + for pagefile in os.listdir(page_dir): page_id = self._parse_page_id(pagefile) if page_id is None or page_id == 0: continue - path = os.path.join(self._page_dir, pagefile) + path = os.path.join(page_dir, pagefile) self._page_map[page_id] = path pages_len = len(self._page_map) print(f"Found {pages_len} page files for processing") class Smk8Crawler(_MonokakidoCrawler): - def __init__(self, args): - super().__init__(args) - self._entry_class = Smk8Entry - self._yomi_exporter = Smk8Exporter(args.target) + def __init__(self, target): + super().__init__(target) class Daijirin2Crawler(_MonokakidoCrawler): - def __init__(self, args): - super().__init__(args) - self._entry_class = Daijirin2Entry - self._yomi_exporter = Daijirin2Exporter(args.target) + def __init__(self, target): + super().__init__(target) diff --git a/bot/crawlers/factory.py b/bot/crawlers/factory.py index f2af6d1..081567b 100644 --- a/bot/crawlers/factory.py +++ b/bot/crawlers/factory.py @@ -7,7 +7,7 @@ from bot.crawlers.crawlers import Smk8Crawler from bot.crawlers.crawlers import Daijirin2Crawler -def new_crawler(target, args): +def new_crawler(target): crawler_map = { Targets.JITENON_KOKUGO: JitenonKokugoCrawler, Targets.JITENON_YOJI: JitenonYojiCrawler, @@ -15,4 +15,4 @@ def new_crawler(target, args): Targets.SMK8: Smk8Crawler, Targets.DAIJIRIN2: Daijirin2Crawler, } - return crawler_map[target](args) + return crawler_map[target](target) diff --git a/bot/entries/factory.py b/bot/entries/factory.py new file mode 100644 index 0000000..23ca066 --- /dev/null +++ b/bot/entries/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.entries.jitenon import JitenonKokugoEntry +from bot.entries.jitenon import JitenonYojiEntry +from bot.entries.jitenon import JitenonKotowazaEntry +from bot.entries.smk8 import Smk8Entry +from bot.entries.daijirin2 import Daijirin2Entry + + +def new_entry(target, page_id): + entry_map = { + Targets.JITENON_KOKUGO: JitenonKokugoEntry, + Targets.JITENON_YOJI: JitenonYojiEntry, + Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry, + Targets.SMK8: Smk8Entry, + Targets.DAIJIRIN2: Daijirin2Entry, + } + return entry_map[target](page_id) diff --git a/bot/yomichan/export.py b/bot/yomichan/exporters/export.py similarity index 79% rename from bot/yomichan/export.py rename to bot/yomichan/exporters/export.py index a2acf81..4658030 100644 --- a/bot/yomichan/export.py +++ b/bot/yomichan/exporters/export.py @@ -6,27 +6,23 @@ from datetime import datetime from platformdirs import user_documents_dir, user_cache_dir from bot.data import load_yomichan_metadata - -from bot.yomichan.terms.jitenon import JitenonKokugoTerminator -from bot.yomichan.terms.jitenon import JitenonYojiTerminator -from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator -from bot.yomichan.terms.smk8 import Smk8Terminator -from bot.yomichan.terms.daijirin2 import Daijirin2Terminator +from bot.yomichan.terms.factory import new_terminator class Exporter: - def __init__(self, name): - self._name = name + def __init__(self, target): + self._target = target + self._terminator = new_terminator(target) self._build_dir = None self._terms_per_file = 2000 def export(self, entries, image_dir): self.__init_build_image_dir(image_dir) meta = load_yomichan_metadata() - index = meta[self._name]["index"] + index = meta[self._target.value]["index"] index["revision"] = self._get_revision(entries) index["attribution"] = self._get_attribution(entries) - tags = meta[self._name]["tags"] + tags = meta[self._target.value]["tags"] terms = self.__get_terms(entries) self.__make_dictionary(terms, index, tags) @@ -43,7 +39,7 @@ class Exporter: def __init_build_image_dir(self, image_dir): build_dir = self._get_build_dir() - build_img_dir = os.path.join(build_dir, self._name) + build_img_dir = os.path.join(build_dir, self._target.value) if image_dir is not None: print("Copying image files to build directory...") shutil.copytree(image_dir, build_img_dir) @@ -115,15 +111,15 @@ class Exporter: class JitenonExporter(Exporter): - def __init__(self, name): - super().__init__(name) + def __init__(self, target): + super().__init__(target) def _get_revision(self, entries): modified_date = None for entry in entries: if modified_date is None or entry.modified_date > modified_date: modified_date = entry.modified_date - revision = f"{self._name};{modified_date}" + revision = f"{self._target.value};{modified_date}" return revision def _get_attribution(self, entries): @@ -135,44 +131,39 @@ class JitenonExporter(Exporter): class JitenonKokugoExporter(JitenonExporter): - def __init__(self, name): - super().__init__(name) - self._terminator = JitenonKokugoTerminator(name) + def __init__(self, target): + super().__init__(target) class JitenonYojiExporter(JitenonExporter): - def __init__(self, name): - super().__init__(name) - self._terminator = JitenonYojiTerminator(name) + def __init__(self, target): + super().__init__(target) class JitenonKotowazaExporter(JitenonExporter): - def __init__(self, name): - super().__init__(name) - self._terminator = JitenonKotowazaTerminator(name) + def __init__(self, target): + super().__init__(target) class Smk8Exporter(Exporter): - def __init__(self, name): - super().__init__(name) - self._terminator = Smk8Terminator(name) + def __init__(self, target): + super().__init__(target) def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") - return f"{self._name};{timestamp}" + return f"{self._target.value};{timestamp}" def _get_attribution(self, entries): return "© Sanseido Co., LTD. 2020" class Daijirin2Exporter(Exporter): - def __init__(self, name): - super().__init__(name) - self._terminator = Daijirin2Terminator(name) + def __init__(self, target): + super().__init__(target) def _get_revision(self, entries): timestamp = datetime.now().strftime("%Y-%m-%d") - return f"{self._name};{timestamp}" + return f"{self._target.value};{timestamp}" def _get_attribution(self, entries): return "© Sanseido Co., LTD. 2019" diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py new file mode 100644 index 0000000..5ab9a6a --- /dev/null +++ b/bot/yomichan/exporters/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.yomichan.exporters.export import JitenonKokugoExporter +from bot.yomichan.exporters.export import JitenonYojiExporter +from bot.yomichan.exporters.export import JitenonKotowazaExporter +from bot.yomichan.exporters.export import Smk8Exporter +from bot.yomichan.exporters.export import Daijirin2Exporter + + +def new_exporter(target): + exporter_map = { + Targets.JITENON_KOKUGO: JitenonKokugoExporter, + Targets.JITENON_YOJI: JitenonYojiExporter, + Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter, + Targets.SMK8: Smk8Exporter, + Targets.DAIJIRIN2: Daijirin2Exporter, + } + return exporter_map[target](target) diff --git a/bot/yomichan/terms/daijirin2.py b/bot/yomichan/terms/daijirin2.py index 0c18426..10aaa76 100644 --- a/bot/yomichan/terms/daijirin2.py +++ b/bot/yomichan/terms/daijirin2.py @@ -1,5 +1,3 @@ -from bot.data import load_yomichan_inflection_categories - from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry from bot.yomichan.terms.terminator import Terminator @@ -8,10 +6,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules class Daijirin2Terminator(Terminator): - def __init__(self, name): - super().__init__(name) - categories = load_yomichan_inflection_categories() - self._inflection_categories = categories[name] + def __init__(self, target): + super().__init__(target) def _definition_tags(self, entry): return "" diff --git a/bot/yomichan/terms/factory.py b/bot/yomichan/terms/factory.py new file mode 100644 index 0000000..d3fc199 --- /dev/null +++ b/bot/yomichan/terms/factory.py @@ -0,0 +1,18 @@ +from bot.targets import Targets + +from bot.yomichan.terms.jitenon import JitenonKokugoTerminator +from bot.yomichan.terms.jitenon import JitenonYojiTerminator +from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator +from bot.yomichan.terms.smk8 import Smk8Terminator +from bot.yomichan.terms.daijirin2 import Daijirin2Terminator + + +def new_terminator(target): + terminator_map = { + Targets.JITENON_KOKUGO: JitenonKokugoTerminator, + Targets.JITENON_YOJI: JitenonYojiTerminator, + Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator, + Targets.SMK8: Smk8Terminator, + Targets.DAIJIRIN2: Daijirin2Terminator, + } + return terminator_map[target](target) diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py index 45f4d5b..f74abaa 100644 --- a/bot/yomichan/terms/jitenon.py +++ b/bot/yomichan/terms/jitenon.py @@ -7,8 +7,8 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary class JitenonTerminator(Terminator): - def __init__(self, name): - super().__init__(name) + def __init__(self, target): + super().__init__(target) def _definition_tags(self, entry): return None @@ -31,8 +31,8 @@ class JitenonTerminator(Terminator): class JitenonKokugoTerminator(JitenonTerminator): - def __init__(self, name): - super().__init__(name) + def __init__(self, target): + super().__init__(target) self._glossary_maker = JitenonKokugoGlossary() def _inflection_rules(self, entry, expression): @@ -43,8 +43,8 @@ class JitenonKokugoTerminator(JitenonTerminator): class JitenonYojiTerminator(JitenonTerminator): - def __init__(self, name): - super().__init__(name) + def __init__(self, target): + super().__init__(target) self._glossary_maker = JitenonYojiGlossary() def _inflection_rules(self, entry, expression): @@ -56,8 +56,8 @@ class JitenonYojiTerminator(JitenonTerminator): class JitenonKotowazaTerminator(JitenonTerminator): - def __init__(self, name): - super().__init__(name) + def __init__(self, target): + super().__init__(target) self._glossary_maker = JitenonKotowazaGlossary() def _inflection_rules(self, entry, expression): diff --git a/bot/yomichan/terms/smk8.py b/bot/yomichan/terms/smk8.py index 68e97e3..d1e3ca7 100644 --- a/bot/yomichan/terms/smk8.py +++ b/bot/yomichan/terms/smk8.py @@ -1,5 +1,3 @@ -from bot.data import load_yomichan_inflection_categories - from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry @@ -9,10 +7,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules class Smk8Terminator(Terminator): - def __init__(self, name): - super().__init__(name) - categories = load_yomichan_inflection_categories() - self._inflection_categories = categories[name] + def __init__(self, target): + super().__init__(target) def _definition_tags(self, entry): if isinstance(entry, KanjiEntry): diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py index d6c69a5..d41a50a 100644 --- a/bot/yomichan/terms/terminator.py +++ b/bot/yomichan/terms/terminator.py @@ -1,8 +1,13 @@ +from bot.data import load_yomichan_inflection_categories + + class Terminator: - def __init__(self, name): - self._name = name + def __init__(self, target): + self._target = target self._glossary_cache = {} self._image_dir = None + categories = load_yomichan_inflection_categories() + self._inflection_categories = categories[target.value] def set_image_dir(self, image_dir): self._image_dir = image_dir diff --git a/data/yomichan_inflection_categories.json b/data/yomichan_inflection_categories.json index 665b9a3..396ddb6 100644 --- a/data/yomichan_inflection_categories.json +++ b/data/yomichan_inflection_categories.json @@ -7,6 +7,9 @@ "kahen": ["カ行変格"], "sudachi": [] }, + "jitenon-kokugo": {}, + "jitenon-yoji": {}, + "jitenon-kotowaza": {}, "smk8": { "sahen": ["サ", "サ変型"], "godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"], diff --git a/jitenbot.py b/jitenbot.py index 950ab16..0a25b96 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -59,10 +59,10 @@ def main(): target_names = [x.value for x in Targets] args = parse_args(target_names) selected_target = Targets(args.target) - crawler = new_crawler(selected_target, args) - crawler.collect_pages() + crawler = new_crawler(selected_target) + crawler.collect_pages(args.page_dir) crawler.read_pages() - crawler.make_yomichan_dictionary() + crawler.make_yomichan_dictionary(args.image_dir) if __name__ == "__main__":