Organize crawler logic into classes

stephenmk 2023-04-22 17:56:52 -05:00
parent 4721eed4c6
commit 8868383a08
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
5 changed files with 96 additions and 87 deletions

bot/crawlers.py

@@ -2,78 +2,84 @@ import re
 from bs4 import BeautifulSoup
 import bot.scraper as Scraper
-import bot.yomichan.export as YomichanExport
-from bot.entries.jitenon_kotowaza import JitenonKotowaza
-from bot.entries.jitenon_yoji import JitenonYoji
+from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.yomichan.export import JitenonKotowazaExporter
+from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.yomichan.export import JitenonYojiExporter


-def run_all():
-    jitenon_yoji()
-    jitenon_kotowaza()
+class Crawler():
+    def __init__(self):
+        self.crawl_map = {}
+        self.entries = []
+
+    def make_entries(self):
+        entries_len = len(self.crawl_map)
+        items = self.crawl_map.items()
+        for idx, (entry_id, entry_path) in enumerate(items):
+            update = f"Reading entry {idx+1}/{entries_len}"
+            print(update, end='\r', flush=True)
+            entry = self.entry_class(entry_id)
+            entry.add_document(entry_path)
+            self.entries.append(entry)
+        print()
+
+    def make_yomichan_dictionary(self):
+        self.yomi_exporter.export(self.entries)


-def jitenon_yoji():
-    print("Scraping jitenon-yoji...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonYoji(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonYojiExporter()
-    exporter.export(entries)
+class JitenonYojiCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonYojiEntry
+        self.yomi_exporter = JitenonYojiExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-yoji...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")


-def jitenon_kotowaza():
-    print("Scraping jitenon-kotowaza...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            m = re.search(r"([0-9]+).php", kana_href)
-            if not m:
-                continue
-            entry_id = int(m.group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonKotowaza(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonKotowazaExporter()
-    exporter.export(entries)
+class JitenonKotowazaCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonKotowazaEntry
+        self.yomi_exporter = JitenonKotowazaExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-kotowaza...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                m = re.search(r"([0-9]+).php", kana_href)
+                if not m:
+                    continue
+                entry_id = int(m.group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
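
The new base class is a small template-method arrangement: Crawler.make_entries() and make_yomichan_dictionary() assume each subclass assigns self.entry_class and self.yomi_exporter in __init__ and fills self.crawl_map with {entry_id: entry_path} pairs during crawl(). A rough sketch of how a further source could plug in; the "Kokugo" names below are hypothetical and not part of this commit:

# Hypothetical third crawler under the new pattern (illustrative only).
class JitenonKokugoCrawler(Crawler):
    def __init__(self):
        super().__init__()
        self.entry_class = JitenonKokugoEntry         # hypothetical entry class
        self.yomi_exporter = JitenonKokugoExporter()  # hypothetical exporter

    def crawl(self):
        # Fill self.crawl_map with {entry_id: entry_path} pairs; the
        # inherited make_entries() and make_yomichan_dictionary() then
        # handle parsing and export with no further subclass code.
        ...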

bot/entries/jitenon.py

@@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util


-class Jitenon:
+class JitenonEntry:
    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]

bot/entries/jitenon_kotowaza.py

@@ -1,8 +1,8 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 import bot.yomichan.grammar as Grammar


-class JitenonKotowaza(Jitenon):
+class JitenonKotowazaEntry(JitenonEntry):
     columns = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []
@@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
             return [["金棒引き", "かなぼうひき"],
                     ["鉄棒引き", "かなぼうひき"]]
         else:
-            return Jitenon._headwords(self)
+            return super()._headwords()

bot/entries/jitenon_yoji.py

@@ -1,7 +1,7 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry


-class JitenonYoji(Jitenon):
+class JitenonYojiEntry(JitenonEntry):
     columns = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []

jitenbot.py

@@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 import argparse
-import bot.crawlers as Crawlers
+from bot.crawlers import JitenonYojiCrawler
+from bot.crawlers import JitenonKotowazaCrawler


-choices = {
-    'all': Crawlers.run_all,
-    'jitenon-yoji': Crawlers.jitenon_yoji,
-    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+crawlers = {
+    'jitenon-yoji': JitenonYojiCrawler,
+    'jitenon-kotowaza': JitenonKotowazaCrawler,
 }


 def parse_args():
     parser = argparse.ArgumentParser(
         prog='jitenbot',
-        description='Crawl and convert Japanese web dictionaries.')
+        description='Convert Japanese dictionary files to new formats.')
     parser.add_argument(
         'target',
-        choices=choices.keys(),
-        help='website to crawl')
+        choices=crawlers.keys(),
+        help='Dictionary to convert.')
     args = parser.parse_args()
     return args


 def main():
     args = parse_args()
-    crawler = choices[args.target]
-    crawler()
+    crawler_class = crawlers[args.target]
+    crawler = crawler_class()
+    crawler.crawl()
+    crawler.make_entries()
+    crawler.make_yomichan_dictionary()


 if __name__ == "__main__":
     main()
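
With the dictionary of classes in place, main() drives the pipeline in three explicit phases. Driving the same pipeline programmatically would look roughly like this; a sketch equivalent to running "python jitenbot.py jitenon-yoji", where the script name is assumed from prog='jitenbot':

# Sketch: using the new crawler API directly instead of via argparse.
from bot.crawlers import JitenonYojiCrawler

crawler = JitenonYojiCrawler()
crawler.crawl()                     # scrape jitenon.jp, filling crawler.crawl_map
crawler.make_entries()              # parse each saved document into an entry
crawler.make_yomichan_dictionary()  # export the entries via JitenonYojiExporter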