Split jitenon crawler class into subclasses

2023-04-22 18:32:11 -05:00 · 2023-04-22 18:32:11 -05:00 · 7d7e32ba45
parent 8868383a08
commit 7d7e32ba45
2 changed files with 42 additions and 44 deletions
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -12,74 +12,72 @@ from bot.yomichan.export import JitenonYojiExporter

 class Crawler():
    def __init__(self):
-        self.crawl_map = {}
-        self.entries = []
+        self._crawl_map = {}
+        self.__entries = []

    def make_entries(self):
-        entries_len = len(self.crawl_map)
-        items = self.crawl_map.items()
+        entries_len = len(self._crawl_map)
+        items = self._crawl_map.items()
        for idx, (entry_id, entry_path) in enumerate(items):
            update = f"Reading entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
-            entry = self.entry_class(entry_id)
+            entry = self._entry_class(entry_id)
            entry.add_document(entry_path)
-            self.entries.append(entry)
+            self.__entries.append(entry)
        print()

    def make_yomichan_dictionary(self):
-        self.yomi_exporter.export(self.entries)
+        self._yomi_exporter.export(self.__entries)


-class JitenonYojiCrawler(Crawler):
+class JitenonCrawler(Crawler):
    def __init__(self):
        super().__init__()
-        self.entry_class = JitenonYojiEntry
-        self.yomi_exporter = JitenonYojiExporter()

    def crawl(self):
-        print("Scraping jitenon-yoji...")
+        print(f"Scraping {self._name}...")
        jitenon = Scraper.Jitenon()
-        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
-                kana_href = kana_a['href']
-                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-                if entry_id in self.crawl_map:
+                entry_link = kana_a['href']
+                entry_id = self.__parse_entry_id(entry_link)
+                if entry_id is None:
                    continue
-                _, entry_path = jitenon.scrape(kana_href)
-                self.crawl_map[entry_id] = entry_path
-        entries_len = len(self.crawl_map)
+                _, entry_path = jitenon.scrape(entry_link)
+                self._crawl_map[entry_id] = entry_path
+        entries_len = len(self._crawl_map)
        print(f"Finished scraping {entries_len} entries")

+    def __parse_entry_id(self, entry_link):
+        m = re.search(self._entry_id_pattern, entry_link)
+        if not m:
+            return None
+        entry_id = int(m.group(1))
+        if entry_id in self._crawl_map:
+            return None
+        return entry_id

-class JitenonKotowazaCrawler(Crawler):
+
+class JitenonYojiCrawler(JitenonCrawler):
    def __init__(self):
        super().__init__()
-        self.entry_class = JitenonKotowazaEntry
-        self.yomi_exporter = JitenonKotowazaExporter()
+        self._entry_class = JitenonYojiEntry
+        self._yomi_exporter = JitenonYojiExporter()
+        self._name = "jitenon-yoji"
+        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
+        self._entry_id_pattern = r"([0-9]+).html"

-    def crawl(self):
-        print("Scraping jitenon-kotowaza...")
-        jitenon = Scraper.Jitenon()
-        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-            gojuon_href = gojuon_a['href']
-            kana_doc, _ = jitenon.scrape(gojuon_href)
-            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-            for kana_a in kana_soup.select(".word_box a", href=True):
-                kana_href = kana_a['href']
-                m = re.search(r"([0-9]+).php", kana_href)
-                if not m:
-                    continue
-                entry_id = int(m.group(1))
-                if entry_id in self.crawl_map:
-                    continue
-                _, entry_path = jitenon.scrape(kana_href)
-                self.crawl_map[entry_id] = entry_path
-        entries_len = len(self.crawl_map)
-        print(f"Finished scraping {entries_len} entries")
+
+class JitenonKotowazaCrawler(JitenonCrawler):
+    def __init__(self):
+        super().__init__()
+        self._entry_class = JitenonKotowazaEntry
+        self._yomi_exporter = JitenonKotowazaExporter()
+        self._name = "jitenon-kotowaza"
+        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
+        self._entry_id_pattern = r"([0-9]+).php"
--- a/bot/yomichan/export.py
+++ b/bot/yomichan/export.py
@@ -46,7 +46,7 @@ class Exporter:
        return terms

    def __make_dictionary(self, terms, index, tags):
-        print(f"Exporting {len(terms)} Yomichan terms to zip file...")
+        print(f"Exporting {len(terms)} Yomichan terms...")
        self.__write_term_banks(terms)
        self.__write_index(index)
        self.__write_tag_bank(tags)
@@ -89,7 +89,7 @@ class Exporter:
        base_filename = os.path.join(out_dir, filename)
        build_dir = self._get_build_dir()
        shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file exported to {out_filepath}")
+        print(f"Dictionary file saved to {out_filepath}")

    def __rm_build_dir(self):
        build_dir = self._get_build_dir()