From 8868383a08786c496bcac0093ae9ec56d3e722b4 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sat, 22 Apr 2023 17:56:52 -0500 Subject: [PATCH] Organize crawler logic into classes --- bot/crawlers.py | 144 +++++++++++++++++--------------- bot/entries/jitenon.py | 2 +- bot/entries/jitenon_kotowaza.py | 8 +- bot/entries/jitenon_yoji.py | 6 +- jitenbot.py | 23 ++--- 5 files changed, 96 insertions(+), 87 deletions(-) diff --git a/bot/crawlers.py b/bot/crawlers.py index 5a8940c..9b1512f 100644 --- a/bot/crawlers.py +++ b/bot/crawlers.py @@ -2,78 +2,84 @@ import re from bs4 import BeautifulSoup import bot.scraper as Scraper -import bot.yomichan.export as YomichanExport -from bot.entries.jitenon_kotowaza import JitenonKotowaza -from bot.entries.jitenon_yoji import JitenonYoji + +from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry +from bot.yomichan.export import JitenonKotowazaExporter + +from bot.entries.jitenon_yoji import JitenonYojiEntry +from bot.yomichan.export import JitenonYojiExporter -def run_all(): - jitenon_yoji() - jitenon_kotowaza() +class Crawler(): + def __init__(self): + self.crawl_map = {} + self.entries = [] + + def make_entries(self): + entries_len = len(self.crawl_map) + items = self.crawl_map.items() + for idx, (entry_id, entry_path) in enumerate(items): + update = f"Reading entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + entry = self.entry_class(entry_id) + entry.add_document(entry_path) + self.entries.append(entry) + print() + + def make_yomichan_dictionary(self): + self.yomi_exporter.export(self.entries) -def jitenon_yoji(): - print("Scraping jitenon-yoji...") - entry_id_to_entry_path = {} - jitenon = Scraper.Jitenon() - gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") - gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") - for gojuon_a in gojuon_soup.select(".kana_area a", href=True): - gojuon_href = gojuon_a['href'] - kana_doc, _ = jitenon.scrape(gojuon_href) - kana_soup = 
BeautifulSoup(kana_doc, features="html.parser") - for kana_a in kana_soup.select(".word_box a", href=True): - kana_href = kana_a['href'] - entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1)) - if entry_id in entry_id_to_entry_path: - continue - _, entry_path = jitenon.scrape(kana_href) - entry_id_to_entry_path[entry_id] = entry_path - entries_len = len(entry_id_to_entry_path) - print(f"Finished scraping {entries_len} entries") - entries = [] - items = entry_id_to_entry_path.items() - for idx, (entry_id, entry_path) in enumerate(items): - update = f"Reading entry {idx+1}/{entries_len}" - print(update, end='\r', flush=True) - entry = JitenonYoji(entry_id) - entry.add_document(entry_path) - entries.append(entry) - print() - exporter = YomichanExport.JitenonYojiExporter() - exporter.export(entries) +class JitenonYojiCrawler(Crawler): + def __init__(self): + super().__init__() + self.entry_class = JitenonYojiEntry + self.yomi_exporter = JitenonYojiExporter() + + def crawl(self): + print("Scraping jitenon-yoji...") + jitenon = Scraper.Jitenon() + gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") + for gojuon_a in gojuon_soup.select(".kana_area a", href=True): + gojuon_href = gojuon_a['href'] + kana_doc, _ = jitenon.scrape(gojuon_href) + kana_soup = BeautifulSoup(kana_doc, features="html.parser") + for kana_a in kana_soup.select(".word_box a", href=True): + kana_href = kana_a['href'] + entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1)) + if entry_id in self.crawl_map: + continue + _, entry_path = jitenon.scrape(kana_href) + self.crawl_map[entry_id] = entry_path + entries_len = len(self.crawl_map) + print(f"Finished scraping {entries_len} entries") -def jitenon_kotowaza(): - print("Scraping jitenon-kotowaza...") - entry_id_to_entry_path = {} - jitenon = Scraper.Jitenon() - gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") - 
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") - for gojuon_a in gojuon_soup.select(".kana_area a", href=True): - gojuon_href = gojuon_a['href'] - kana_doc, _ = jitenon.scrape(gojuon_href) - kana_soup = BeautifulSoup(kana_doc, features="html.parser") - for kana_a in kana_soup.select(".word_box a", href=True): - kana_href = kana_a['href'] - m = re.search(r"([0-9]+).php", kana_href) - if not m: - continue - entry_id = int(m.group(1)) - if entry_id in entry_id_to_entry_path: - continue - _, entry_path = jitenon.scrape(kana_href) - entry_id_to_entry_path[entry_id] = entry_path - entries_len = len(entry_id_to_entry_path) - print(f"Finished scraping {entries_len} entries") - entries = [] - items = entry_id_to_entry_path.items() - for idx, (entry_id, entry_path) in enumerate(items): - update = f"Reading entry {idx+1}/{entries_len}" - print(update, end='\r', flush=True) - entry = JitenonKotowaza(entry_id) - entry.add_document(entry_path) - entries.append(entry) - print() - exporter = YomichanExport.JitenonKotowazaExporter() - exporter.export(entries) +class JitenonKotowazaCrawler(Crawler): + def __init__(self): + super().__init__() + self.entry_class = JitenonKotowazaEntry + self.yomi_exporter = JitenonKotowazaExporter() + + def crawl(self): + print("Scraping jitenon-kotowaza...") + jitenon = Scraper.Jitenon() + gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") + for gojuon_a in gojuon_soup.select(".kana_area a", href=True): + gojuon_href = gojuon_a['href'] + kana_doc, _ = jitenon.scrape(gojuon_href) + kana_soup = BeautifulSoup(kana_doc, features="html.parser") + for kana_a in kana_soup.select(".word_box a", href=True): + kana_href = kana_a['href'] + m = re.search(r"([0-9]+).php", kana_href) + if not m: + continue + entry_id = int(m.group(1)) + if entry_id in self.crawl_map: + continue + _, entry_path = jitenon.scrape(kana_href) + self.crawl_map[entry_id] = 
entry_path + entries_len = len(self.crawl_map) + print(f"Finished scraping {entries_len} entries") diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index 71efe49..65dc647 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss import bot.util as Util -class Jitenon: +class JitenonEntry: def __init__(self, sequence): self.sequence = sequence self.yomichan_glossary = [""] diff --git a/bot/entries/jitenon_kotowaza.py b/bot/entries/jitenon_kotowaza.py index 6019c02..23a4c21 100644 --- a/bot/entries/jitenon_kotowaza.py +++ b/bot/entries/jitenon_kotowaza.py @@ -1,8 +1,8 @@ -from bot.entries.jitenon import Jitenon +from bot.entries.jitenon import JitenonEntry import bot.yomichan.grammar as Grammar -class JitenonKotowaza(Jitenon): +class JitenonKotowazaEntry(JitenonEntry): columns = { "言葉": ["expression", ""], "読み方": ["yomikata", ""], @@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon): } def __init__(self, sequence): - Jitenon.__init__(self, sequence) + super().__init__(sequence) def yomichan_terms(self): terms = [] @@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon): return [["金棒引き", "かなぼうひき"], ["鉄棒引き", "かなぼうひき"]] else: - return Jitenon._headwords(self) + return super()._headwords() diff --git a/bot/entries/jitenon_yoji.py b/bot/entries/jitenon_yoji.py index 1dc9792..d08d607 100644 --- a/bot/entries/jitenon_yoji.py +++ b/bot/entries/jitenon_yoji.py @@ -1,7 +1,7 @@ -from bot.entries.jitenon import Jitenon +from bot.entries.jitenon import JitenonEntry -class JitenonYoji(Jitenon): +class JitenonYojiEntry(JitenonEntry): columns = { "四字熟語": ["expression", ""], "読み方": ["yomikata", ""], @@ -14,7 +14,7 @@ class JitenonYoji(Jitenon): } def __init__(self, sequence): - Jitenon.__init__(self, sequence) + super().__init__(sequence) def yomichan_terms(self): terms = [] diff --git a/jitenbot.py b/jitenbot.py index aaf213a..304fb64 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -17,32 +17,35 @@ along with this 
program. If not, see <https://www.gnu.org/licenses/>. """ import argparse -import bot.crawlers as Crawlers +from bot.crawlers import JitenonYojiCrawler +from bot.crawlers import JitenonKotowazaCrawler -choices = { - 'all': Crawlers.run_all, - 'jitenon-yoji': Crawlers.jitenon_yoji, - 'jitenon-kotowaza': Crawlers.jitenon_kotowaza, +crawlers = { + 'jitenon-yoji': JitenonYojiCrawler, + 'jitenon-kotowaza': JitenonKotowazaCrawler, } def parse_args(): parser = argparse.ArgumentParser( prog='jitenbot', - description='Crawl and convert Japanese web dictionaries.') + description='Convert Japanese dictionary files to new formats.') parser.add_argument( 'target', - choices=choices.keys(), - help='website to crawl') + choices=crawlers.keys(), + help='Dictionary to convert.') args = parser.parse_args() return args def main(): args = parse_args() - crawler = choices[args.target] - crawler() + crawler_class = crawlers[args.target] + crawler = crawler_class() + crawler.crawl() + crawler.make_entries() + crawler.make_yomichan_dictionary() if __name__ == "__main__":