Add entry and term factories

This commit is contained in:
stephenmk 2023-05-06 16:55:00 -05:00
parent 3d795ab49f
commit 6dbc8b90ce
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
12 changed files with 143 additions and 111 deletions

View file

@ -1,28 +1,23 @@
import os
import re
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.yomichan.export import JitenonKokugoExporter
from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter
from bot.yomichan.export import Daijirin2Exporter
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_exporter
class _Crawler():
def __init__(self, args):
self._page_dir = args.page_dir
self._image_dir = args.image_dir
class Crawler(ABC):
def __init__(self, target):
self._target = target
self._page_map = {}
self._entries = []
self._page_id_pattern = None
@abstractmethod
def collect_pages(self, page_dir):
pass
def read_pages(self):
pages_len = len(self._page_map)
@ -30,19 +25,20 @@ class _Crawler():
for idx, (page_id, page_path) in enumerate(items):
update = f"Reading page {idx+1}/{pages_len}"
print(update, end='\r', flush=True)
entry = self._entry_class(page_id)
entry = new_entry(self._target, page_id)
with open(page_path, "r", encoding="utf-8") as f:
page = f.read()
entry.set_page(page)
self._entries.append(entry)
print()
def make_yomichan_dictionary(self):
self._yomi_exporter.export(self._entries, self._image_dir)
def make_yomichan_dictionary(self, image_dir):
exporter = new_exporter(self._target)
exporter.export(self._entries, image_dir)
def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link)
if not m:
if m is None:
return None
page_id = int(m.group(1))
if page_id in self._page_map:
@ -50,15 +46,13 @@ class _Crawler():
return page_id
class JitenonKokugoCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = JitenonKokugoEntry
self._yomi_exporter = JitenonKokugoExporter(args.target)
class JitenonKokugoCrawler(Crawler):
def __init__(self, target):
super().__init__(target)
self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
self._page_id_pattern = r"word/p([0-9]+)$"
def collect_pages(self):
def collect_pages(self, page_dir):
jitenon = Scraper.Jitenon()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -85,11 +79,12 @@ class JitenonKokugoCrawler(_Crawler):
print(f"Finished scraping {pages_len} pages")
class _JitenonCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
class _JitenonCrawler(Crawler):
def __init__(self, target):
super().__init__(target)
self._gojuon_url = None
def collect_pages(self):
def collect_pages(self, page_dir):
print("Scraping jitenon.jp")
jitenon = Scraper.Jitenon()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
@ -110,49 +105,41 @@ class _JitenonCrawler(_Crawler):
class JitenonYojiCrawler(_JitenonCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = JitenonYojiEntry
self._yomi_exporter = JitenonYojiExporter(args.target)
def __init__(self, target):
super().__init__(target)
self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
self._page_id_pattern = r"([0-9]+)\.html$"
class JitenonKotowazaCrawler(_JitenonCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = JitenonKotowazaEntry
self._yomi_exporter = JitenonKotowazaExporter(args.target)
def __init__(self, target):
super().__init__(target)
self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
self._page_id_pattern = r"([0-9]+)\.php$"
class _MonokakidoCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
class _MonokakidoCrawler(Crawler):
def __init__(self, target):
super().__init__(target)
self._page_id_pattern = r"^([0-9]+)\.xml$"
def collect_pages(self):
print(f"Searching for page files in `{self._page_dir}`")
for pagefile in os.listdir(self._page_dir):
def collect_pages(self, page_dir):
print(f"Searching for page files in `{page_dir}`")
for pagefile in os.listdir(page_dir):
page_id = self._parse_page_id(pagefile)
if page_id is None or page_id == 0:
continue
path = os.path.join(self._page_dir, pagefile)
path = os.path.join(page_dir, pagefile)
self._page_map[page_id] = path
pages_len = len(self._page_map)
print(f"Found {pages_len} page files for processing")
class Smk8Crawler(_MonokakidoCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = Smk8Entry
self._yomi_exporter = Smk8Exporter(args.target)
def __init__(self, target):
super().__init__(target)
class Daijirin2Crawler(_MonokakidoCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = Daijirin2Entry
self._yomi_exporter = Daijirin2Exporter(args.target)
def __init__(self, target):
super().__init__(target)

View file

@ -7,7 +7,7 @@ from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
def new_crawler(target, args):
def new_crawler(target):
crawler_map = {
Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
Targets.JITENON_YOJI: JitenonYojiCrawler,
@ -15,4 +15,4 @@ def new_crawler(target, args):
Targets.SMK8: Smk8Crawler,
Targets.DAIJIRIN2: Daijirin2Crawler,
}
return crawler_map[target](args)
return crawler_map[target](target)

18
bot/entries/factory.py Normal file
View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
def new_entry(target, page_id):
    """Create the entry object associated with ``target``.

    The entry class is selected from a table keyed by ``Targets`` members
    and is instantiated with ``page_id`` as its only argument.

    Raises:
        KeyError: if ``target`` is not a key of the lookup table.
    """
    entry_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoEntry,
        Targets.JITENON_YOJI: JitenonYojiEntry,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
        Targets.SMK8: Smk8Entry,
        Targets.DAIJIRIN2: Daijirin2Entry,
    }
    entry_class = entry_classes[target]
    return entry_class(page_id)

View file

@ -6,27 +6,23 @@ from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
from bot.yomichan.terms.factory import new_terminator
class Exporter:
def __init__(self, name):
self._name = name
def __init__(self, target):
self._target = target
self._terminator = new_terminator(target)
self._build_dir = None
self._terms_per_file = 2000
def export(self, entries, image_dir):
self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata()
index = meta[self._name]["index"]
index = meta[self._target.value]["index"]
index["revision"] = self._get_revision(entries)
index["attribution"] = self._get_attribution(entries)
tags = meta[self._name]["tags"]
tags = meta[self._target.value]["tags"]
terms = self.__get_terms(entries)
self.__make_dictionary(terms, index, tags)
@ -43,7 +39,7 @@ class Exporter:
def __init_build_image_dir(self, image_dir):
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._name)
build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None:
print("Copying image files to build directory...")
shutil.copytree(image_dir, build_img_dir)
@ -115,15 +111,15 @@ class Exporter:
class JitenonExporter(Exporter):
def __init__(self, name):
super().__init__(name)
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
revision = f"{self._name};{modified_date}"
revision = f"{self._target.value};{modified_date}"
return revision
def _get_attribution(self, entries):
@ -135,44 +131,39 @@ class JitenonExporter(Exporter):
class JitenonKokugoExporter(JitenonExporter):
def __init__(self, name):
super().__init__(name)
self._terminator = JitenonKokugoTerminator(name)
def __init__(self, target):
super().__init__(target)
class JitenonYojiExporter(JitenonExporter):
def __init__(self, name):
super().__init__(name)
self._terminator = JitenonYojiTerminator(name)
def __init__(self, target):
super().__init__(target)
class JitenonKotowazaExporter(JitenonExporter):
def __init__(self, name):
super().__init__(name)
self._terminator = JitenonKotowazaTerminator(name)
def __init__(self, target):
super().__init__(target)
class Smk8Exporter(Exporter):
def __init__(self, name):
super().__init__(name)
self._terminator = Smk8Terminator(name)
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}"
return f"{self._target.value};{timestamp}"
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020"
class Daijirin2Exporter(Exporter):
def __init__(self, name):
super().__init__(name)
self._terminator = Daijirin2Terminator(name)
def __init__(self, target):
super().__init__(target)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}"
return f"{self._target.value};{timestamp}"
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.yomichan.exporters.export import JitenonKokugoExporter
from bot.yomichan.exporters.export import JitenonYojiExporter
from bot.yomichan.exporters.export import JitenonKotowazaExporter
from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
def new_exporter(target):
    """Create the exporter object associated with ``target``.

    The exporter class is selected from a table keyed by ``Targets``
    members; the same ``target`` value is then passed to its constructor.

    Raises:
        KeyError: if ``target`` is not a key of the lookup table.
    """
    exporter_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
    }
    exporter_class = exporter_classes[target]
    return exporter_class(target)

View file

@ -1,5 +1,3 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
from bot.yomichan.terms.terminator import Terminator
@ -8,10 +6,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Daijirin2Terminator(Terminator):
def __init__(self, name):
super().__init__(name)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def __init__(self, target):
super().__init__(target)
def _definition_tags(self, entry):
return ""

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
def new_terminator(target):
    """Create the terminator object associated with ``target``.

    The terminator class is selected from a table keyed by ``Targets``
    members; the same ``target`` value is then passed to its constructor.

    Raises:
        KeyError: if ``target`` is not a key of the lookup table.
    """
    terminator_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
    }
    terminator_class = terminator_classes[target]
    return terminator_class(target)

View file

@ -7,8 +7,8 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
def __init__(self, name):
super().__init__(name)
def __init__(self, target):
super().__init__(target)
def _definition_tags(self, entry):
return None
@ -31,8 +31,8 @@ class JitenonTerminator(Terminator):
class JitenonKokugoTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary()
def _inflection_rules(self, entry, expression):
@ -43,8 +43,8 @@ class JitenonKokugoTerminator(JitenonTerminator):
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression):
@ -56,8 +56,8 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression):

View file

@ -1,5 +1,3 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
@ -9,10 +7,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Smk8Terminator(Terminator):
def __init__(self, name):
super().__init__(name)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def __init__(self, target):
super().__init__(target)
def _definition_tags(self, entry):
if isinstance(entry, KanjiEntry):

View file

@ -1,8 +1,13 @@
from bot.data import load_yomichan_inflection_categories
class Terminator:
def __init__(self, name):
self._name = name
def __init__(self, target):
self._target = target
self._glossary_cache = {}
self._image_dir = None
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[target.value]
def set_image_dir(self, image_dir):
self._image_dir = image_dir

View file

@ -7,6 +7,9 @@
"kahen": ["カ行変格"],
"sudachi": []
},
"jitenon-kokugo": {},
"jitenon-yoji": {},
"jitenon-kotowaza": {},
"smk8": {
"sahen": ["サ", "サ変型"],
"godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"],

View file

@ -59,10 +59,10 @@ def main():
target_names = [x.value for x in Targets]
args = parse_args(target_names)
selected_target = Targets(args.target)
crawler = new_crawler(selected_target, args)
crawler.collect_pages()
crawler = new_crawler(selected_target)
crawler.collect_pages(args.page_dir)
crawler.read_pages()
crawler.make_yomichan_dictionary()
crawler.make_yomichan_dictionary(args.image_dir)
if __name__ == "__main__":