Add entry and term factories

This commit is contained in:
stephenmk 2023-05-06 16:55:00 -05:00
parent 3d795ab49f
commit 6dbc8b90ce
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
12 changed files with 143 additions and 111 deletions

View file

@ -1,28 +1,23 @@
import os import os
import re import re
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.scraper as Scraper import bot.scraper as Scraper
from bot.entries.factory import new_entry
from bot.entries.jitenon import JitenonKokugoEntry from bot.yomichan.exporters.factory import new_exporter
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.yomichan.export import JitenonKokugoExporter
from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter
from bot.yomichan.export import Daijirin2Exporter
class _Crawler(): class Crawler(ABC):
def __init__(self, args): def __init__(self, target):
self._page_dir = args.page_dir self._target = target
self._image_dir = args.image_dir
self._page_map = {} self._page_map = {}
self._entries = [] self._entries = []
self._page_id_pattern = None
@abstractmethod
def collect_pages(self, page_dir):
pass
def read_pages(self): def read_pages(self):
pages_len = len(self._page_map) pages_len = len(self._page_map)
@ -30,19 +25,20 @@ class _Crawler():
for idx, (page_id, page_path) in enumerate(items): for idx, (page_id, page_path) in enumerate(items):
update = f"Reading page {idx+1}/{pages_len}" update = f"Reading page {idx+1}/{pages_len}"
print(update, end='\r', flush=True) print(update, end='\r', flush=True)
entry = self._entry_class(page_id) entry = new_entry(self._target, page_id)
with open(page_path, "r", encoding="utf-8") as f: with open(page_path, "r", encoding="utf-8") as f:
page = f.read() page = f.read()
entry.set_page(page) entry.set_page(page)
self._entries.append(entry) self._entries.append(entry)
print() print()
def make_yomichan_dictionary(self): def make_yomichan_dictionary(self, image_dir):
self._yomi_exporter.export(self._entries, self._image_dir) exporter = new_exporter(self._target)
exporter.export(self._entries, image_dir)
def _parse_page_id(self, page_link): def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link) m = re.search(self._page_id_pattern, page_link)
if not m: if m is None:
return None return None
page_id = int(m.group(1)) page_id = int(m.group(1))
if page_id in self._page_map: if page_id in self._page_map:
@ -50,15 +46,13 @@ class _Crawler():
return page_id return page_id
class JitenonKokugoCrawler(_Crawler): class JitenonKokugoCrawler(Crawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._entry_class = JitenonKokugoEntry
self._yomi_exporter = JitenonKokugoExporter(args.target)
self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php" self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
self._page_id_pattern = r"word/p([0-9]+)$" self._page_id_pattern = r"word/p([0-9]+)$"
def collect_pages(self): def collect_pages(self, page_dir):
jitenon = Scraper.Jitenon() jitenon = Scraper.Jitenon()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url) gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -85,11 +79,12 @@ class JitenonKokugoCrawler(_Crawler):
print(f"Finished scraping {pages_len} pages") print(f"Finished scraping {pages_len} pages")
class _JitenonCrawler(_Crawler): class _JitenonCrawler(Crawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._gojuon_url = None
def collect_pages(self): def collect_pages(self, page_dir):
print("Scraping jitenon.jp") print("Scraping jitenon.jp")
jitenon = Scraper.Jitenon() jitenon = Scraper.Jitenon()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url) gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
@ -110,49 +105,41 @@ class _JitenonCrawler(_Crawler):
class JitenonYojiCrawler(_JitenonCrawler): class JitenonYojiCrawler(_JitenonCrawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._entry_class = JitenonYojiEntry
self._yomi_exporter = JitenonYojiExporter(args.target)
self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html" self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
self._page_id_pattern = r"([0-9]+)\.html$" self._page_id_pattern = r"([0-9]+)\.html$"
class JitenonKotowazaCrawler(_JitenonCrawler): class JitenonKotowazaCrawler(_JitenonCrawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._entry_class = JitenonKotowazaEntry
self._yomi_exporter = JitenonKotowazaExporter(args.target)
self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php" self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
self._page_id_pattern = r"([0-9]+)\.php$" self._page_id_pattern = r"([0-9]+)\.php$"
class _MonokakidoCrawler(_Crawler): class _MonokakidoCrawler(Crawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._page_id_pattern = r"^([0-9]+)\.xml$" self._page_id_pattern = r"^([0-9]+)\.xml$"
def collect_pages(self): def collect_pages(self, page_dir):
print(f"Searching for page files in `{self._page_dir}`") print(f"Searching for page files in `{page_dir}`")
for pagefile in os.listdir(self._page_dir): for pagefile in os.listdir(page_dir):
page_id = self._parse_page_id(pagefile) page_id = self._parse_page_id(pagefile)
if page_id is None or page_id == 0: if page_id is None or page_id == 0:
continue continue
path = os.path.join(self._page_dir, pagefile) path = os.path.join(page_dir, pagefile)
self._page_map[page_id] = path self._page_map[page_id] = path
pages_len = len(self._page_map) pages_len = len(self._page_map)
print(f"Found {pages_len} page files for processing") print(f"Found {pages_len} page files for processing")
class Smk8Crawler(_MonokakidoCrawler): class Smk8Crawler(_MonokakidoCrawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._entry_class = Smk8Entry
self._yomi_exporter = Smk8Exporter(args.target)
class Daijirin2Crawler(_MonokakidoCrawler): class Daijirin2Crawler(_MonokakidoCrawler):
def __init__(self, args): def __init__(self, target):
super().__init__(args) super().__init__(target)
self._entry_class = Daijirin2Entry
self._yomi_exporter = Daijirin2Exporter(args.target)

View file

@ -7,7 +7,7 @@ from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler from bot.crawlers.crawlers import Daijirin2Crawler
def new_crawler(target, args): def new_crawler(target):
crawler_map = { crawler_map = {
Targets.JITENON_KOKUGO: JitenonKokugoCrawler, Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
Targets.JITENON_YOJI: JitenonYojiCrawler, Targets.JITENON_YOJI: JitenonYojiCrawler,
@ -15,4 +15,4 @@ def new_crawler(target, args):
Targets.SMK8: Smk8Crawler, Targets.SMK8: Smk8Crawler,
Targets.DAIJIRIN2: Daijirin2Crawler, Targets.DAIJIRIN2: Daijirin2Crawler,
} }
return crawler_map[target](args) return crawler_map[target](target)

18
bot/entries/factory.py Normal file
View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
def new_entry(target, page_id):
    """Create the entry object registered for ``target``.

    Args:
        target: Key selecting the entry class; the table below is keyed
            by ``Targets`` members.
        page_id: Value passed unchanged to the entry class constructor.

    Returns:
        A new instance of the entry class mapped to ``target``.

    Raises:
        KeyError: If ``target`` is not a key of the table below.
    """
    entry_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoEntry,
        Targets.JITENON_YOJI: JitenonYojiEntry,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
        Targets.SMK8: Smk8Entry,
        Targets.DAIJIRIN2: Daijirin2Entry,
    }
    # Resolve the class first, then instantiate it with the page id.
    entry_class = entry_classes[target]
    return entry_class(page_id)

View file

@ -6,27 +6,23 @@ from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
class Exporter: class Exporter:
def __init__(self, name): def __init__(self, target):
self._name = name self._target = target
self._terminator = new_terminator(target)
self._build_dir = None self._build_dir = None
self._terms_per_file = 2000 self._terms_per_file = 2000
def export(self, entries, image_dir): def export(self, entries, image_dir):
self.__init_build_image_dir(image_dir) self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata() meta = load_yomichan_metadata()
index = meta[self._name]["index"] index = meta[self._target.value]["index"]
index["revision"] = self._get_revision(entries) index["revision"] = self._get_revision(entries)
index["attribution"] = self._get_attribution(entries) index["attribution"] = self._get_attribution(entries)
tags = meta[self._name]["tags"] tags = meta[self._target.value]["tags"]
terms = self.__get_terms(entries) terms = self.__get_terms(entries)
self.__make_dictionary(terms, index, tags) self.__make_dictionary(terms, index, tags)
@ -43,7 +39,7 @@ class Exporter:
def __init_build_image_dir(self, image_dir): def __init_build_image_dir(self, image_dir):
build_dir = self._get_build_dir() build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._name) build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None: if image_dir is not None:
print("Copying image files to build directory...") print("Copying image files to build directory...")
shutil.copytree(image_dir, build_img_dir) shutil.copytree(image_dir, build_img_dir)
@ -115,15 +111,15 @@ class Exporter:
class JitenonExporter(Exporter): class JitenonExporter(Exporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
def _get_revision(self, entries): def _get_revision(self, entries):
modified_date = None modified_date = None
for entry in entries: for entry in entries:
if modified_date is None or entry.modified_date > modified_date: if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date modified_date = entry.modified_date
revision = f"{self._name};{modified_date}" revision = f"{self._target.value};{modified_date}"
return revision return revision
def _get_attribution(self, entries): def _get_attribution(self, entries):
@ -135,44 +131,39 @@ class JitenonExporter(Exporter):
class JitenonKokugoExporter(JitenonExporter): class JitenonKokugoExporter(JitenonExporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._terminator = JitenonKokugoTerminator(name)
class JitenonYojiExporter(JitenonExporter): class JitenonYojiExporter(JitenonExporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._terminator = JitenonYojiTerminator(name)
class JitenonKotowazaExporter(JitenonExporter): class JitenonKotowazaExporter(JitenonExporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._terminator = JitenonKotowazaTerminator(name)
class Smk8Exporter(Exporter): class Smk8Exporter(Exporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._terminator = Smk8Terminator(name)
def _get_revision(self, entries): def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d") timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}" return f"{self._target.value};{timestamp}"
def _get_attribution(self, entries): def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020" return "© Sanseido Co., LTD. 2020"
class Daijirin2Exporter(Exporter): class Daijirin2Exporter(Exporter):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._terminator = Daijirin2Terminator(name)
def _get_revision(self, entries): def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d") timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}" return f"{self._target.value};{timestamp}"
def _get_attribution(self, entries): def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019" return "© Sanseido Co., LTD. 2019"

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.yomichan.exporters.export import JitenonKokugoExporter
from bot.yomichan.exporters.export import JitenonYojiExporter
from bot.yomichan.exporters.export import JitenonKotowazaExporter
from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
def new_exporter(target):
    """Create the exporter object registered for ``target``.

    Args:
        target: Key selecting the exporter class; the table below is
            keyed by ``Targets`` members. The same value is also passed
            to the exporter's constructor.

    Returns:
        A new instance of the exporter class mapped to ``target``.

    Raises:
        KeyError: If ``target`` is not a key of the table below.
    """
    exporter_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
    }
    # Resolve the class first, then hand it the target it was chosen for.
    exporter_class = exporter_classes[target]
    return exporter_class(target)

View file

@ -1,5 +1,3 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
from bot.yomichan.terms.terminator import Terminator from bot.yomichan.terms.terminator import Terminator
@ -8,10 +6,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Daijirin2Terminator(Terminator): class Daijirin2Terminator(Terminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def _definition_tags(self, entry): def _definition_tags(self, entry):
return "" return ""

View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
def new_terminator(target):
    """Create the terminator object registered for ``target``.

    Args:
        target: Key selecting the terminator class; the table below is
            keyed by ``Targets`` members. The same value is also passed
            to the terminator's constructor.

    Returns:
        A new instance of the terminator class mapped to ``target``.

    Raises:
        KeyError: If ``target`` is not a key of the table below.
    """
    terminator_classes = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
    }
    # Resolve the class first, then hand it the target it was chosen for.
    terminator_class = terminator_classes[target]
    return terminator_class(target)

View file

@ -7,8 +7,8 @@ from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator): class JitenonTerminator(Terminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
def _definition_tags(self, entry): def _definition_tags(self, entry):
return None return None
@ -31,8 +31,8 @@ class JitenonTerminator(Terminator):
class JitenonKokugoTerminator(JitenonTerminator): class JitenonKokugoTerminator(JitenonTerminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary() self._glossary_maker = JitenonKokugoGlossary()
def _inflection_rules(self, entry, expression): def _inflection_rules(self, entry, expression):
@ -43,8 +43,8 @@ class JitenonKokugoTerminator(JitenonTerminator):
class JitenonYojiTerminator(JitenonTerminator): class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._glossary_maker = JitenonYojiGlossary() self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression): def _inflection_rules(self, entry, expression):
@ -56,8 +56,8 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator): class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary() self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression): def _inflection_rules(self, entry, expression):

View file

@ -1,5 +1,3 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
@ -9,10 +7,8 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Smk8Terminator(Terminator): class Smk8Terminator(Terminator):
def __init__(self, name): def __init__(self, target):
super().__init__(name) super().__init__(target)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def _definition_tags(self, entry): def _definition_tags(self, entry):
if isinstance(entry, KanjiEntry): if isinstance(entry, KanjiEntry):

View file

@ -1,8 +1,13 @@
from bot.data import load_yomichan_inflection_categories
class Terminator: class Terminator:
def __init__(self, name): def __init__(self, target):
self._name = name self._target = target
self._glossary_cache = {} self._glossary_cache = {}
self._image_dir = None self._image_dir = None
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[target.value]
def set_image_dir(self, image_dir): def set_image_dir(self, image_dir):
self._image_dir = image_dir self._image_dir = image_dir

View file

@ -7,6 +7,9 @@
"kahen": ["カ行変格"], "kahen": ["カ行変格"],
"sudachi": [] "sudachi": []
}, },
"jitenon-kokugo": {},
"jitenon-yoji": {},
"jitenon-kotowaza": {},
"smk8": { "smk8": {
"sahen": ["サ", "サ変型"], "sahen": ["サ", "サ変型"],
"godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"], "godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"],

View file

@ -59,10 +59,10 @@ def main():
target_names = [x.value for x in Targets] target_names = [x.value for x in Targets]
args = parse_args(target_names) args = parse_args(target_names)
selected_target = Targets(args.target) selected_target = Targets(args.target)
crawler = new_crawler(selected_target, args) crawler = new_crawler(selected_target)
crawler.collect_pages() crawler.collect_pages(args.page_dir)
crawler.read_pages() crawler.read_pages()
crawler.make_yomichan_dictionary() crawler.make_yomichan_dictionary(args.image_dir)
if __name__ == "__main__": if __name__ == "__main__":