Organize Yomichan export logic into classes

2023-04-22 16:49:29 -05:00 · 2023-04-22 16:49:29 -05:00 · 4721eed4c6
parent e73c4d3d7f
commit 4721eed4c6
4 changed files with 162 additions and 94 deletions
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@ -3,7 +3,6 @@ from bs4 import BeautifulSoup

 import bot.scraper as Scraper
 import bot.yomichan.export as YomichanExport
-
 from bot.entries.jitenon_kotowaza import JitenonKotowaza
 from bot.entries.jitenon_yoji import JitenonYoji

@ -14,48 +13,67 @@ def run_all():


 def jitenon_yoji():
+    """Scrape all jitenon-yoji entries and export them for Yomichan.
+
+    Walks the gojuon index page, then every kana page it links to, and
+    scrapes each entry page found there.  scrape() returns a
+    (document, cache path) pair; only the cache path of each entry page is
+    kept, and the entries are read back from those paths afterwards.
+    """
-    seq_to_entries = {}
+    print("Scraping jitenon-yoji...")
+    entry_id_to_entry_path = {}
    jitenon = Scraper.Jitenon()
-    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
        gojuon_href = gojuon_a['href']
-        kana_doc = jitenon.scrape(gojuon_href)
+        kana_doc, _ = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
-            sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if sequence in seq_to_entries:
+            # Skip links that carry no numeric entry id instead of crashing
+            # on a failed match (same guard as jitenon_kotowaza).  The dot
+            # is escaped so that it only matches a literal ".html".
+            m = re.search(r"([0-9]+)\.html", kana_href)
+            if not m:
+                continue
+            entry_id = int(m.group(1))
+            if entry_id in entry_id_to_entry_path:
                continue
-            yoji_doc = jitenon.scrape(kana_href)
-            entry = JitenonYoji(sequence)
-            entry.add_document(yoji_doc)
-            seq_to_entries[sequence] = entry
-    entries = seq_to_entries.values()
-    YomichanExport.jitenon_yoji(entries)
+            _, entry_path = jitenon.scrape(kana_href)
+            entry_id_to_entry_path[entry_id] = entry_path
+    entries_len = len(entry_id_to_entry_path)
+    print(f"Finished scraping {entries_len} entries")
+    entries = []
+    items = entry_id_to_entry_path.items()
+    for idx, (entry_id, entry_path) in enumerate(items):
+        update = f"Reading entry {idx+1}/{entries_len}"
+        print(update, end='\r', flush=True)
+        entry = JitenonYoji(entry_id)
+        entry.add_document(entry_path)
+        entries.append(entry)
+    print()
+    exporter = YomichanExport.JitenonYojiExporter()
+    exporter.export(entries)


 def jitenon_kotowaza():
+    """Scrape all jitenon-kotowaza entries and export them for Yomichan.
+
+    Walks the gojuon index page, then every kana page it links to, and
+    scrapes each entry page found there.  scrape() returns a
+    (document, cache path) pair; only the cache path of each entry page is
+    kept, and the entries are read back from those paths afterwards.
+    """
-    seq_to_entries = {}
+    print("Scraping jitenon-kotowaza...")
+    entry_id_to_entry_path = {}
    jitenon = Scraper.Jitenon()
-    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
        gojuon_href = gojuon_a['href']
-        kana_doc = jitenon.scrape(gojuon_href)
+        kana_doc, _ = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
+            # The numeric part of the link is used as the entry id; links
+            # without one are skipped.
            m = re.search(r"([0-9]+).php", kana_href)
-            if m:
-                sequence = int(m.group(1))
-            else:
+            if not m:
                continue
-            if sequence in seq_to_entries:
+            entry_id = int(m.group(1))
+            # An entry can be linked more than once; scrape it only once.
+            if entry_id in entry_id_to_entry_path:
                continue
-            kotowaza_doc = jitenon.scrape(kana_href)
-            entry = JitenonKotowaza(sequence)
-            entry.add_document(kotowaza_doc)
-            seq_to_entries[sequence] = entry
-    entries = seq_to_entries.values()
-    YomichanExport.jitenon_kotowaza(entries)
+            _, entry_path = jitenon.scrape(kana_href)
+            entry_id_to_entry_path[entry_id] = entry_path
+    entries_len = len(entry_id_to_entry_path)
+    print(f"Finished scraping {entries_len} entries")
+    entries = []
+    items = entry_id_to_entry_path.items()
+    for idx, (entry_id, entry_path) in enumerate(items):
+        update = f"Reading entry {idx+1}/{entries_len}"
+        print(update, end='\r', flush=True)
+        entry = JitenonKotowaza(entry_id)
+        entry.add_document(entry_path)
+        entries.append(entry)
+    print()
+    exporter = YomichanExport.JitenonKotowazaExporter()
+    exporter.export(entries)
--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@ -15,7 +15,9 @@ class Jitenon:
        for column in self.columns.values():
            setattr(self, column[0], column[1])

-    def add_document(self, html):
+    def add_document(self, path):
+        with open(path, "r") as f:
+            html = f.read()
        yoji_soup = BeautifulSoup(html, features="html5lib")
        self.__set_modified_date(html)
        self.attribution = yoji_soup.find(class_="copyright").text
--- a/bot/scraper.py
+++ b/bot/scraper.py
@ -28,13 +28,14 @@ class Scraper():
        url = urlparse(urlstring, scheme='https://', allow_fragments=True)
        self.__validate_url(url)
        cache_path = self.__cache_path(url)
-        cache_contents = self.__read_cache(cache_path)
-        if cache_contents is not None:
-            return cache_contents
-        html = self.__get(urlstring)
-        with open(cache_path, "w") as f:
-            f.write(html)
-        return html
+        html = self.__read_cache(cache_path)
+        if html is None:
+            html = self.__get(urlstring)
+            with open(cache_path, "w") as f:
+                f.write(html)
+        else:
+            print("Discovering cached files...", end='\r', flush=True)
+        return html, cache_path

    def __set_session(self):
        retry_strategy = Retry(
@ -106,4 +107,4 @@ class Scraper():
 class Jitenon(Scraper):
    def __init__(self):
+        # NOTE(review): domain is assigned before the base initializer runs;
+        # presumably Scraper.__init__ (not visible in this hunk) or the URL
+        # validation relies on it - confirm before reordering.
        self.domain = r"jitenon\.jp"
-        Scraper.__init__(self)
+        super().__init__()
--- a/bot/yomichan/export.py
+++ b/bot/yomichan/export.py
@ -8,74 +8,121 @@ from platformdirs import user_documents_dir, user_cache_dir
 import bot.data as Data


-def jitenon_yoji(entries):
-    __jitenon(entries, "jitenon-yoji")
+class Exporter:
+    def __init__(self):
+        # Path of the temporary build directory; stays None until
+        # _get_build_dir() creates the directory on first use.
+        self._build_dir = None
+        # Maximum number of terms written to each term_bank_N.json file.
+        self._terms_per_file = 2000

+    def export(self, entries):
+        """Build a Yomichan dictionary archive from `entries`.
+
+        The index and tag metadata are looked up under self._name, and the
+        index revision/attribution come from _get_revision() and
+        _get_attribution(); none of these are defined by Exporter itself in
+        the code shown here, so a subclass has to provide them.
+        """
+        dictionary_meta = Data.yomichan_metadata()[self._name]
+        index = dictionary_meta["index"]
+        index["revision"] = self._get_revision(entries)
+        index["attribution"] = self._get_attribution(entries)
+        tags = dictionary_meta["tags"]
+        self.__make_dictionary(self.__get_terms(entries), index, tags)

-def jitenon_kotowaza(entries):
-    __jitenon(entries, "jitenon-kotowaza")
+    def _get_build_dir(self):
+        """Return the temporary build directory, creating it on first use.
+
+        The directory is a timestamped "build_<YYYYmmddHHMMSS>" folder in
+        the jitenbot user cache dir; its path is remembered in
+        self._build_dir and returned unchanged on later calls.
+        """
+        if self._build_dir is None:
+            stamp = datetime.now().strftime("%Y%m%d%H%M%S")
+            fresh_dir = os.path.join(
+                user_cache_dir("jitenbot"), f"build_{stamp}")
+            # Start from an empty directory if one with this name exists.
+            if Path(fresh_dir).is_dir():
+                shutil.rmtree(fresh_dir)
+            os.makedirs(fresh_dir)
+            self._build_dir = fresh_dir
+        return self._build_dir

+    def __get_terms(self, entries):
+        """Collect the Yomichan terms of every entry into one flat list.
+
+        Prints a progress line (overwritten in place via '\\r') per entry
+        and a final newline once all entries have been processed.
+        """
+        total = len(entries)
+        collected = []
+        for position, entry in enumerate(entries, start=1):
+            progress = (
+                f"Creating Yomichan terms for entry {position}/{total}")
+            print(progress, end='\r', flush=True)
+            collected.extend(entry.yomichan_terms())
+        print()
+        return collected

-def __jitenon(entries, name):
-    terms, modified_date, attribution = __terms(entries)
-    meta = Data.yomichan_metadata()
+    def __make_dictionary(self, terms, index, tags):
+        """Write all dictionary files, archive them, then clean up.
+
+        The archive is named after index["title"].
+        """
+        print(f"Exporting {len(terms)} Yomichan terms to zip file...")
+        self.__write_term_banks(terms)
+        self.__write_index(index)
+        self.__write_tag_bank(tags)
+        # Archiving has to happen after every file above has been written
+        # and before the build directory is removed.
+        self.__write_archive(index["title"])
+        self.__rm_build_dir()

-    index = meta[name]["index"]
-    index["revision"] = f"{name}.{modified_date}"
-    index["attribution"] = attribution
-    tags = meta[name]["tags"]
+    def __write_term_banks(self, terms):
+        """Write `terms` to numbered term_bank_N.json files in the build dir.
+
+        Each file holds at most self._terms_per_file terms.  At least one
+        term bank is always written, even when `terms` is empty.
+        """
+        build_dir = self._get_build_dir()
+        per_file = self._terms_per_file
+        # Ceiling division.  int(len / per_file) + 1 would add an extra,
+        # empty term bank whenever len(terms) is an exact multiple of
+        # per_file.
+        bank_count = max(1, -(-len(terms) // per_file))
+        for i in range(bank_count):
+            term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
+            start = per_file * i
+            chunk = terms[start:start + per_file]
+            with open(term_file, "w", encoding='utf8') as f:
+                json.dump(chunk, f, indent=4, ensure_ascii=False)

-    __create_zip(terms, index, tags)
+    def __write_index(self, index):
+        """Serialize the index metadata to index.json in the build dir."""
+        index_path = os.path.join(self._get_build_dir(), "index.json")
+        with open(index_path, 'w', encoding='utf8') as index_file:
+            json.dump(index, index_file, indent=4, ensure_ascii=False)

-
-def __terms(entries):
-    terms = []
-    modified_date = None
-    attribution = ""
-    for entry in entries:
-        if modified_date is None or entry.modified_date > modified_date:
-            modified_date = entry.modified_date
-            attribution = entry.attribution
-        for term in entry.yomichan_terms():
-            terms.append(term)
-    return terms, modified_date, attribution
-
-
-def __create_zip(terms, index, tags):
-    cache_dir = user_cache_dir("jitenbot")
-    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-    build_directory = os.path.join(cache_dir, f"build_{timestamp}")
-    if Path(build_directory).is_dir():
-        shutil.rmtree(build_directory)
-    os.makedirs(build_directory)
-
-    terms_per_file = 1000
-    max_i = int(len(terms) / terms_per_file) + 1
-    for i in range(max_i):
-        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
-        with open(term_file, "w", encoding='utf8') as f:
-            start = terms_per_file * i
-            end = terms_per_file * (i + 1)
-            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
-
-    index_file = os.path.join(build_directory, "index.json")
-    with open(index_file, 'w', encoding='utf8') as f:
-        json.dump(index, f, indent=4, ensure_ascii=False)
-
-    if len(tags) > 0:
-        tag_file = os.path.join(build_directory, "tag_bank_1.json")
+    def __write_tag_bank(self, tags):
+        """Write `tags` to tag_bank_1.json in the build dir.
+
+        No file is written when `tags` is empty.
+        """
+        if len(tags) == 0:
+            return
+        build_dir = self._get_build_dir()
+        tag_file = os.path.join(build_dir, "tag_bank_1.json")
        with open(tag_file, 'w', encoding='utf8') as f:
            json.dump(tags, f, indent=4, ensure_ascii=False)

-    zip_filename = index["title"]
-    zip_file = f"{zip_filename}.zip"
-    shutil.make_archive(zip_filename, "zip", build_directory)
+    def __write_archive(self, filename):
+        """Zip the build directory into "<documents>/jitenbot/<filename>.zip".
+
+        The output directory is created when missing, and an existing
+        archive file with the same name is removed before the new one is
+        made.
+        """
+        fmt = "zip"
+        destination_dir = os.path.join(user_documents_dir(), "jitenbot")
+        if not Path(destination_dir).is_dir():
+            os.makedirs(destination_dir)
+        # make_archive() takes the target path without the extension.
+        archive_stem = os.path.join(destination_dir, filename)
+        archive_path = os.path.join(destination_dir, f"{filename}.{fmt}")
+        if Path(archive_path).is_file():
+            os.remove(archive_path)
+        shutil.make_archive(archive_stem, fmt, self._get_build_dir())
+        print(f"Dictionary file exported to {archive_path}")

-    out_dir = os.path.join(user_documents_dir(), "jitenbot")
-    out_file = os.path.join(out_dir, zip_file)
-    if not Path(out_dir).is_dir():
-        os.mkdir(out_dir)
-    elif Path(out_file).is_file():
-        os.remove(out_file)
-    shutil.move(zip_file, out_dir)
-    shutil.rmtree(build_directory)
+    def __rm_build_dir(self):
+        """Delete the temporary build directory and forget its path.
+
+        _get_build_dir() returns the remembered path without checking that
+        it still exists, so the path is cleared here; otherwise a second
+        export() on the same instance would try to write into the
+        directory that was just removed.
+        """
+        build_dir = self._get_build_dir()
+        shutil.rmtree(build_dir)
+        self._build_dir = None
+
+
+class JitenonExporter(Exporter):
+    """Revision and attribution logic shared by the Jitenon exporters.
+
+    _get_revision() reads self._name, which this class does not set;
+    the concrete subclasses assign it in their __init__.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def _get_revision(self, entries):
+        """Return "<name>.<date>" using the latest entry.modified_date.
+
+        With no entries the date part is the string "None".
+        """
+        modified_date = None
+        for entry in entries:
+            if modified_date is None or entry.modified_date > modified_date:
+                modified_date = entry.modified_date
+        revision = f"{self._name}.{modified_date}"
+        return revision
+
+    def _get_attribution(self, entries):
+        """Return the attribution of the most recently modified entry.
+
+        Returns an empty string when `entries` is empty.
+        """
+        modified_date = None
+        attribution = ""
+        for entry in entries:
+            if modified_date is None or entry.modified_date > modified_date:
+                # Track the newest date seen so far; without this update
+                # the comparison is never evaluated and the last entry
+                # always wins, whatever its date.
+                modified_date = entry.modified_date
+                attribution = entry.attribution
+        return attribution
+
+
+class JitenonYojiExporter(JitenonExporter):
+    """Exporter for the "jitenon-yoji" dictionary."""
+    def __init__(self):
+        super().__init__()
+        # Key used to look up this dictionary's metadata and to prefix the
+        # revision string.
+        self._name = "jitenon-yoji"
+
+
+class JitenonKotowazaExporter(JitenonExporter):
+    """Exporter for the "jitenon-kotowaza" dictionary."""
+    def __init__(self):
+        super().__init__()
+        # Key used to look up this dictionary's metadata and to prefix the
+        # revision string.
+        self._name = "jitenon-kotowaza"