diff --git a/bot/crawlers.py b/bot/crawlers.py index c55896e..5a8940c 100644 --- a/bot/crawlers.py +++ b/bot/crawlers.py @@ -3,7 +3,6 @@ from bs4 import BeautifulSoup import bot.scraper as Scraper import bot.yomichan.export as YomichanExport - from bot.entries.jitenon_kotowaza import JitenonKotowaza from bot.entries.jitenon_yoji import JitenonYoji @@ -14,48 +13,67 @@ def run_all(): def jitenon_yoji(): - seq_to_entries = {} + print("Scraping jitenon-yoji...") + entry_id_to_entry_path = {} jitenon = Scraper.Jitenon() - gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") + gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") for gojuon_a in gojuon_soup.select(".kana_area a", href=True): gojuon_href = gojuon_a['href'] - kana_doc = jitenon.scrape(gojuon_href) + kana_doc, _ = jitenon.scrape(gojuon_href) kana_soup = BeautifulSoup(kana_doc, features="html.parser") for kana_a in kana_soup.select(".word_box a", href=True): kana_href = kana_a['href'] - sequence = int(re.search(r"([0-9]+).html", kana_href).group(1)) - if sequence in seq_to_entries: + entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1)) + if entry_id in entry_id_to_entry_path: continue - yoji_doc = jitenon.scrape(kana_href) - entry = JitenonYoji(sequence) - entry.add_document(yoji_doc) - seq_to_entries[sequence] = entry - entries = seq_to_entries.values() - YomichanExport.jitenon_yoji(entries) + _, entry_path = jitenon.scrape(kana_href) + entry_id_to_entry_path[entry_id] = entry_path + entries_len = len(entry_id_to_entry_path) + print(f"Finished scraping {entries_len} entries") + entries = [] + items = entry_id_to_entry_path.items() + for idx, (entry_id, entry_path) in enumerate(items): + update = f"Reading entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + entry = JitenonYoji(entry_id) + entry.add_document(entry_path) + entries.append(entry) + print() + exporter = 
YomichanExport.JitenonYojiExporter() + exporter.export(entries) def jitenon_kotowaza(): - seq_to_entries = {} + print("Scraping jitenon-kotowaza...") + entry_id_to_entry_path = {} jitenon = Scraper.Jitenon() - gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") + gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") for gojuon_a in gojuon_soup.select(".kana_area a", href=True): gojuon_href = gojuon_a['href'] - kana_doc = jitenon.scrape(gojuon_href) + kana_doc, _ = jitenon.scrape(gojuon_href) kana_soup = BeautifulSoup(kana_doc, features="html.parser") for kana_a in kana_soup.select(".word_box a", href=True): kana_href = kana_a['href'] m = re.search(r"([0-9]+).php", kana_href) - if m: - sequence = int(m.group(1)) - else: + if not m: continue - if sequence in seq_to_entries: + entry_id = int(m.group(1)) + if entry_id in entry_id_to_entry_path: continue - kotowaza_doc = jitenon.scrape(kana_href) - entry = JitenonKotowaza(sequence) - entry.add_document(kotowaza_doc) - seq_to_entries[sequence] = entry - entries = seq_to_entries.values() - YomichanExport.jitenon_kotowaza(entries) + _, entry_path = jitenon.scrape(kana_href) + entry_id_to_entry_path[entry_id] = entry_path + entries_len = len(entry_id_to_entry_path) + print(f"Finished scraping {entries_len} entries") + entries = [] + items = entry_id_to_entry_path.items() + for idx, (entry_id, entry_path) in enumerate(items): + update = f"Reading entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + entry = JitenonKotowaza(entry_id) + entry.add_document(entry_path) + entries.append(entry) + print() + exporter = YomichanExport.JitenonKotowazaExporter() + exporter.export(entries) diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index aac1929..71efe49 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -15,7 +15,9 @@ class Jitenon: for column in self.columns.values(): 
setattr(self, column[0], column[1]) - def add_document(self, html): + def add_document(self, path): + with open(path, "r") as f: + html = f.read() yoji_soup = BeautifulSoup(html, features="html5lib") self.__set_modified_date(html) self.attribution = yoji_soup.find(class_="copyright").text diff --git a/bot/scraper.py b/bot/scraper.py index a8c4905..25060fa 100644 --- a/bot/scraper.py +++ b/bot/scraper.py @@ -28,13 +28,14 @@ class Scraper(): url = urlparse(urlstring, scheme='https://', allow_fragments=True) self.__validate_url(url) cache_path = self.__cache_path(url) - cache_contents = self.__read_cache(cache_path) - if cache_contents is not None: - return cache_contents - html = self.__get(urlstring) - with open(cache_path, "w") as f: - f.write(html) - return html + html = self.__read_cache(cache_path) + if html is None: + html = self.__get(urlstring) + with open(cache_path, "w") as f: + f.write(html) + else: + print("Discovering cached files...", end='\r', flush=True) + return html, cache_path def __set_session(self): retry_strategy = Retry( @@ -106,4 +107,4 @@ class Scraper(): class Jitenon(Scraper): def __init__(self): self.domain = r"jitenon\.jp" - Scraper.__init__(self) + super().__init__() diff --git a/bot/yomichan/export.py b/bot/yomichan/export.py index c413b84..a3c099d 100644 --- a/bot/yomichan/export.py +++ b/bot/yomichan/export.py @@ -8,74 +8,121 @@ from platformdirs import user_documents_dir, user_cache_dir import bot.data as Data -def jitenon_yoji(entries): - __jitenon(entries, "jitenon-yoji") +class Exporter: + def __init__(self): + self._build_dir = None + self._terms_per_file = 2000 + def export(self, entries): + meta = Data.yomichan_metadata() + index = meta[self._name]["index"] + index["revision"] = self._get_revision(entries) + index["attribution"] = self._get_attribution(entries) + tags = meta[self._name]["tags"] + terms = self.__get_terms(entries) + self.__make_dictionary(terms, index, tags) -def jitenon_kotowaza(entries): - __jitenon(entries, 
"jitenon-kotowaza") + def _get_build_dir(self): + if self._build_dir is not None: + return self._build_dir + cache_dir = user_cache_dir("jitenbot") + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + build_directory = os.path.join(cache_dir, f"build_{timestamp}") + if Path(build_directory).is_dir(): + shutil.rmtree(build_directory) + os.makedirs(build_directory) + self._build_dir = build_directory + return self._build_dir + def __get_terms(self, entries): + terms = [] + entries_len = len(entries) + for idx, entry in enumerate(entries): + update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}" + print(update, end='\r', flush=True) + for term in entry.yomichan_terms(): + terms.append(term) + print() + return terms -def __jitenon(entries, name): - terms, modified_date, attribution = __terms(entries) - meta = Data.yomichan_metadata() + def __make_dictionary(self, terms, index, tags): + print(f"Exporting {len(terms)} Yomichan terms to zip file...") + self.__write_term_banks(terms) + self.__write_index(index) + self.__write_tag_bank(tags) + self.__write_archive(index["title"]) + self.__rm_build_dir() - index = meta[name]["index"] - index["revision"] = f"{name}.{modified_date}" - index["attribution"] = attribution - tags = meta[name]["tags"] + def __write_term_banks(self, terms): + build_dir = self._get_build_dir() + max_i = int(len(terms) / self._terms_per_file) + 1 + for i in range(max_i): + term_file = os.path.join(build_dir, f"term_bank_{i+1}.json") + with open(term_file, "w", encoding='utf8') as f: + start = self._terms_per_file * i + end = self._terms_per_file * (i + 1) + json.dump(terms[start:end], f, indent=4, ensure_ascii=False) - __create_zip(terms, index, tags) + def __write_index(self, index): + build_dir = self._get_build_dir() + index_file = os.path.join(build_dir, "index.json") + with open(index_file, 'w', encoding='utf8') as f: + json.dump(index, f, indent=4, ensure_ascii=False) - -def __terms(entries): - terms = [] - modified_date = None - 
attribution = "" -    for entry in entries: -        if modified_date is None or entry.modified_date > modified_date: -            modified_date = entry.modified_date -            attribution = entry.attribution -        for term in entry.yomichan_terms(): -            terms.append(term) -    return terms, modified_date, attribution - - -def __create_zip(terms, index, tags): -    cache_dir = user_cache_dir("jitenbot") -    timestamp = datetime.now().strftime("%Y%m%d%H%M%S") -    build_directory = os.path.join(cache_dir, f"build_{timestamp}") -    if Path(build_directory).is_dir(): -        shutil.rmtree(build_directory) -    os.makedirs(build_directory) - -    terms_per_file = 1000 -    max_i = int(len(terms) / terms_per_file) + 1 -    for i in range(max_i): -        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json") -        with open(term_file, "w", encoding='utf8') as f: -            start = terms_per_file * i -            end = terms_per_file * (i + 1) -            json.dump(terms[start:end], f, indent=4, ensure_ascii=False) - -    index_file = os.path.join(build_directory, "index.json") -    with open(index_file, 'w', encoding='utf8') as f: -        json.dump(index, f, indent=4, ensure_ascii=False) - -    if len(tags) > 0: -        tag_file = os.path.join(build_directory, "tag_bank_1.json") +    def __write_tag_bank(self, tags): +        if len(tags) == 0: +            return +        build_dir = self._get_build_dir() +        tag_file = os.path.join(build_dir, "tag_bank_1.json")         with open(tag_file, 'w', encoding='utf8') as f:             json.dump(tags, f, indent=4, ensure_ascii=False) -    zip_filename = index["title"] -    zip_file = f"{zip_filename}.zip" -    shutil.make_archive(zip_filename, "zip", build_directory) +    def __write_archive(self, filename): +        archive_format = "zip" +        out_dir = os.path.join(user_documents_dir(), "jitenbot") +        if not Path(out_dir).is_dir(): +            os.makedirs(out_dir) +        out_file = f"{filename}.{archive_format}" +        out_filepath = os.path.join(out_dir, out_file) +        if Path(out_filepath).is_file(): +            os.remove(out_filepath) +        base_filename = os.path.join(out_dir, filename) +        build_dir = self._get_build_dir() + 
shutil.make_archive(base_filename, archive_format, build_dir) +        print(f"Dictionary file exported to {out_filepath}") -    out_dir = os.path.join(user_documents_dir(), "jitenbot") -    out_file = os.path.join(out_dir, zip_file) -    if not Path(out_dir).is_dir(): -        os.mkdir(out_dir) -    elif Path(out_file).is_file(): -        os.remove(out_file) -    shutil.move(zip_file, out_dir) -    shutil.rmtree(build_directory) +    def __rm_build_dir(self): +        build_dir = self._get_build_dir() +        shutil.rmtree(build_dir) + + + +class JitenonExporter(Exporter): +    def __init__(self): +        super().__init__() + +    def _get_revision(self, entries): +        modified_date = None +        for entry in entries: +            if modified_date is None or entry.modified_date > modified_date: +                modified_date = entry.modified_date +        revision = f"{self._name}.{modified_date}" +        return revision + +    def _get_attribution(self, entries): +        modified_date = None +        for entry in entries: +            if modified_date is None or entry.modified_date > modified_date: +                modified_date, attribution = entry.modified_date, entry.attribution +        return attribution + + + +class JitenonYojiExporter(JitenonExporter): +    def __init__(self): +        super().__init__() +        self._name = "jitenon-yoji" + + + +class JitenonKotowazaExporter(JitenonExporter): +    def __init__(self): +        super().__init__() +        self._name = "jitenon-kotowaza"