diff --git a/bot/crawlers.py b/bot/crawlers.py
index 9b1512f..37af503 100644
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -12,74 +12,72 @@ from bot.yomichan.export import JitenonYojiExporter
 
 class Crawler():
     def __init__(self):
-        self.crawl_map = {}
-        self.entries = []
+        self._crawl_map = {}
+        self.__entries = []
 
     def make_entries(self):
-        entries_len = len(self.crawl_map)
-        items = self.crawl_map.items()
+        entries_len = len(self._crawl_map)
+        items = self._crawl_map.items()
         for idx, (entry_id, entry_path) in enumerate(items):
             update = f"Reading entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
-            entry = self.entry_class(entry_id)
+            entry = self._entry_class(entry_id)
             entry.add_document(entry_path)
-            self.entries.append(entry)
+            self.__entries.append(entry)
         print()
 
     def make_yomichan_dictionary(self):
-        self.yomi_exporter.export(self.entries)
+        self._yomi_exporter.export(self.__entries)
 
 
-class JitenonYojiCrawler(Crawler):
+class JitenonCrawler(Crawler):
     def __init__(self):
         super().__init__()
-        self.entry_class = JitenonYojiEntry
-        self.yomi_exporter = JitenonYojiExporter()
 
     def crawl(self):
-        print("Scraping jitenon-yoji...")
+        print(f"Scraping {self._name}...")
         jitenon = Scraper.Jitenon()
-        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
         for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
             gojuon_href = gojuon_a['href']
             kana_doc, _ = jitenon.scrape(gojuon_href)
             kana_soup = BeautifulSoup(kana_doc, features="html.parser")
             for kana_a in kana_soup.select(".word_box a", href=True):
-                kana_href = kana_a['href']
-                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-                if entry_id in self.crawl_map:
+                entry_link = kana_a['href']
+                entry_id = self.__parse_entry_id(entry_link)
+                if entry_id is None:
                     continue
-                _, entry_path = jitenon.scrape(kana_href)
-                self.crawl_map[entry_id] = entry_path
-        entries_len = len(self.crawl_map)
+                _, entry_path = jitenon.scrape(entry_link)
+                self._crawl_map[entry_id] = entry_path
+        entries_len = len(self._crawl_map)
         print(f"Finished scraping {entries_len} entries")
 
+    def __parse_entry_id(self, entry_link):
+        m = re.search(self._entry_id_pattern, entry_link)
+        if not m:
+            return None
+        entry_id = int(m.group(1))
+        if entry_id in self._crawl_map:
+            return None
+        return entry_id
 
-class JitenonKotowazaCrawler(Crawler):
+
+class JitenonYojiCrawler(JitenonCrawler):
     def __init__(self):
         super().__init__()
-        self.entry_class = JitenonKotowazaEntry
-        self.yomi_exporter = JitenonKotowazaExporter()
+        self._entry_class = JitenonYojiEntry
+        self._yomi_exporter = JitenonYojiExporter()
+        self._name = "jitenon-yoji"
+        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
+        self._entry_id_pattern = r"([0-9]+).html"
 
-    def crawl(self):
-        print("Scraping jitenon-kotowaza...")
-        jitenon = Scraper.Jitenon()
-        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-            gojuon_href = gojuon_a['href']
-            kana_doc, _ = jitenon.scrape(gojuon_href)
-            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-            for kana_a in kana_soup.select(".word_box a", href=True):
-                kana_href = kana_a['href']
-                m = re.search(r"([0-9]+).php", kana_href)
-                if not m:
-                    continue
-                entry_id = int(m.group(1))
-                if entry_id in self.crawl_map:
-                    continue
-                _, entry_path = jitenon.scrape(kana_href)
-                self.crawl_map[entry_id] = entry_path
-        entries_len = len(self.crawl_map)
-        print(f"Finished scraping {entries_len} entries")
+
+class JitenonKotowazaCrawler(JitenonCrawler):
+    def __init__(self):
+        super().__init__()
+        self._entry_class = JitenonKotowazaEntry
+        self._yomi_exporter = JitenonKotowazaExporter()
+        self._name = "jitenon-kotowaza"
+        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
+        self._entry_id_pattern = r"([0-9]+).php"
diff --git a/bot/yomichan/export.py b/bot/yomichan/export.py
index a3c099d..ab24eaf 100644
--- a/bot/yomichan/export.py
+++ b/bot/yomichan/export.py
@@ -46,7 +46,7 @@ class Exporter:
         return terms
 
     def __make_dictionary(self, terms, index, tags):
-        print(f"Exporting {len(terms)} Yomichan terms to zip file...")
+        print(f"Exporting {len(terms)} Yomichan terms...")
         self.__write_term_banks(terms)
         self.__write_index(index)
         self.__write_tag_bank(tags)
@@ -89,7 +89,7 @@ class Exporter:
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file exported to {out_filepath}")
+        print(f"Dictionary file saved to {out_filepath}")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()