Organize crawler logic into classes
parent 4721eed4c6
commit 8868383a08
bot/crawlers.py (144 lines changed)
@@ -2,78 +2,84 @@ import re
 
 from bs4 import BeautifulSoup
 
 import bot.scraper as Scraper
-import bot.yomichan.export as YomichanExport
-from bot.entries.jitenon_kotowaza import JitenonKotowaza
-from bot.entries.jitenon_yoji import JitenonYoji
+
+from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.yomichan.export import JitenonKotowazaExporter
+
+from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.yomichan.export import JitenonYojiExporter
 
 
-def run_all():
-    jitenon_yoji()
-    jitenon_kotowaza()
+class Crawler():
+    def __init__(self):
+        self.crawl_map = {}
+        self.entries = []
+
+    def make_entries(self):
+        entries_len = len(self.crawl_map)
+        items = self.crawl_map.items()
+        for idx, (entry_id, entry_path) in enumerate(items):
+            update = f"Reading entry {idx+1}/{entries_len}"
+            print(update, end='\r', flush=True)
+            entry = self.entry_class(entry_id)
+            entry.add_document(entry_path)
+            self.entries.append(entry)
+        print()
+
+    def make_yomichan_dictionary(self):
+        self.yomi_exporter.export(self.entries)
 
 
-def jitenon_yoji():
-    print("Scraping jitenon-yoji...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonYoji(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonYojiExporter()
-    exporter.export(entries)
+class JitenonYojiCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonYojiEntry
+        self.yomi_exporter = JitenonYojiExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-yoji...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
 
 
-def jitenon_kotowaza():
-    print("Scraping jitenon-kotowaza...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            m = re.search(r"([0-9]+).php", kana_href)
-            if not m:
-                continue
-            entry_id = int(m.group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonKotowaza(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonKotowazaExporter()
-    exporter.export(entries)
+class JitenonKotowazaCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonKotowazaEntry
+        self.yomi_exporter = JitenonKotowazaExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-kotowaza...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                m = re.search(r"([0-9]+).php", kana_href)
+                if not m:
+                    continue
+                entry_id = int(m.group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
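As a design note, the new Crawler base class is a small template-method pattern: the generic pipeline (crawl map -> entry objects -> export) lives in the base class, and each subclass supplies only its own crawl() plus an entry_class and a yomi_exporter. Below is a minimal, self-contained sketch of that pattern; DemoEntry, DemoExporter, and DemoCrawler are hypothetical stand-ins, not classes from this repository.

class DemoEntry:
    def __init__(self, entry_id):
        self.entry_id = entry_id

    def add_document(self, path):
        self.path = path  # the real entry classes parse the scraped page here


class DemoExporter:
    def export(self, entries):
        print(f"Exported {len(entries)} entries")


class Crawler:
    def __init__(self):
        self.crawl_map = {}  # entry_id -> path of the scraped document
        self.entries = []

    def make_entries(self):
        # generic step: turn every crawled document into an entry object
        for entry_id, entry_path in self.crawl_map.items():
            entry = self.entry_class(entry_id)
            entry.add_document(entry_path)
            self.entries.append(entry)

    def make_yomichan_dictionary(self):
        self.yomi_exporter.export(self.entries)


class DemoCrawler(Crawler):
    def __init__(self):
        super().__init__()
        self.entry_class = DemoEntry
        self.yomi_exporter = DemoExporter()

    def crawl(self):
        # the real subclasses scrape jitenon.jp here
        self.crawl_map = {1: "entry1.html", 2: "entry2.html"}


crawler = DemoCrawler()
crawler.crawl()
crawler.make_entries()
crawler.make_yomichan_dictionary()  # prints: Exported 2 entries

Under this scheme, adding a new dictionary source amounts to writing one subclass with a crawl() method and those two attributes.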
bot/entries/jitenon.py

@@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util
 
 
-class Jitenon:
+class JitenonEntry:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]
bot/entries/jitenon_kotowaza.py

@@ -1,8 +1,8 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 import bot.yomichan.grammar as Grammar
 
 
-class JitenonKotowaza(Jitenon):
+class JitenonKotowazaEntry(JitenonEntry):
     columns = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
     }
 
     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)
 
     def yomichan_terms(self):
         terms = []
@@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
             return [["金棒引き", "かなぼうひき"],
                     ["鉄棒引き", "かなぼうひき"]]
         else:
-            return Jitenon._headwords(self)
+            return super()._headwords()
bot/entries/jitenon_yoji.py

@@ -1,7 +1,7 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 
 
-class JitenonYoji(Jitenon):
+class JitenonYojiEntry(JitenonEntry):
     columns = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
     }
 
     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)
 
     def yomichan_terms(self):
         terms = []
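Besides the *Entry renames, the entry-file hunks above all make the same mechanical change: explicit base-class calls such as Jitenon.__init__(self, sequence) become super() calls. A tiny sketch of the difference, using hypothetical Base, OldStyle, and NewStyle names:

class Base:
    def __init__(self, sequence):
        self.sequence = sequence

class OldStyle(Base):
    def __init__(self, sequence):
        Base.__init__(self, sequence)  # pre-commit style: names the base directly

class NewStyle(Base):
    def __init__(self, sequence):
        super().__init__(sequence)     # post-commit style: resolved via the MRO

With super(), the parent is looked up through the method resolution order rather than hard-coded, so a base-class rename like Jitenon -> JitenonEntry only needs to touch the class statement and the import.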
jitenbot.py (23 lines changed)
@@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
 import argparse
-import bot.crawlers as Crawlers
+from bot.crawlers import JitenonYojiCrawler
+from bot.crawlers import JitenonKotowazaCrawler
 
 
-choices = {
-    'all': Crawlers.run_all,
-    'jitenon-yoji': Crawlers.jitenon_yoji,
-    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+crawlers = {
+    'jitenon-yoji': JitenonYojiCrawler,
+    'jitenon-kotowaza': JitenonKotowazaCrawler,
 }
 
 
 def parse_args():
     parser = argparse.ArgumentParser(
         prog='jitenbot',
-        description='Crawl and convert Japanese web dictionaries.')
+        description='Convert Japanese dictionary files to new formats.')
     parser.add_argument(
         'target',
-        choices=choices.keys(),
-        help='website to crawl')
+        choices=crawlers.keys(),
+        help='Dictionary to convert.')
     args = parser.parse_args()
     return args
 
 
 def main():
     args = parse_args()
-    crawler = choices[args.target]
-    crawler()
+    crawler_class = crawlers[args.target]
+    crawler = crawler_class()
+    crawler.crawl()
+    crawler.make_entries()
+    crawler.make_yomichan_dictionary()
 
 
 if __name__ == "__main__":
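With the dispatch table now mapping each target name to a crawler class, a run such as python jitenbot.py jitenon-yoji walks the three-phase pipeline explicitly. The equivalent programmatic use, assuming the repository's bot package is importable, would be:

from bot.crawlers import JitenonYojiCrawler

# equivalent to `python jitenbot.py jitenon-yoji` on the command line
crawler = JitenonYojiCrawler()
crawler.crawl()                     # scrape jitenon.jp pages into crawler.crawl_map
crawler.make_entries()              # parse each scraped page into an entry object
crawler.make_yomichan_dictionary()  # export the entries via the Yomichan exporter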