Split jitenon crawler class into subclasses
This commit is contained in:
parent
8868383a08
commit
7d7e32ba45
|
@ -12,74 +12,72 @@ from bot.yomichan.export import JitenonYojiExporter
|
||||||
|
|
||||||
class Crawler():
    """Base crawler: turns a map of scraped documents into exported entries.

    Subclasses are expected to provide (before use):
      _entry_class    -- callable invoked with an entry id to build an entry
      _yomi_exporter  -- object with an ``export(entries)`` method
    and to populate ``_crawl_map`` (entry id -> scraped document path).
    """

    def __init__(self):
        # entry id -> filesystem path of the scraped document for that entry
        self._crawl_map = {}
        # entry objects built by make_entries(), in crawl order
        self.__entries = []

    def make_entries(self):
        """Build one entry object per crawled document, with progress output."""
        total = len(self._crawl_map)
        for count, (entry_id, entry_path) in enumerate(self._crawl_map.items(), start=1):
            # '\r' keeps the progress counter on a single console line
            print(f"Reading entry {count}/{total}", end='\r', flush=True)
            entry = self._entry_class(entry_id)
            entry.add_document(entry_path)
            self.__entries.append(entry)
        print()

    def make_yomichan_dictionary(self):
        """Export every parsed entry through the configured Yomichan exporter."""
        self._yomi_exporter.export(self.__entries)
class JitenonCrawler(Crawler):
    """Shared crawl logic for jitenon.jp dictionary sites.

    Subclasses must set:
      _name             -- site label used in progress messages
      _gojuon_url       -- URL of the site's top-level gojuon index page
      _entry_id_pattern -- regex whose group 1 captures the numeric entry id
    """

    def __init__(self):
        super().__init__()

    def crawl(self):
        """Walk gojuon index -> kana pages -> entry pages, caching each entry.

        Fills ``_crawl_map`` with entry id -> scraped document path.
        """
        print(f"Scraping {self._name}...")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # NOTE(review): select() is a CSS-selector API; the href=True keyword
        # looks carried over from find_all() — verify it filters as intended.
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            kana_doc, _ = jitenon.scrape(gojuon_a['href'])
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                entry_link = kana_a['href']
                entry_id = self.__parse_entry_id(entry_link)
                if entry_id is None:
                    # unparseable link, or entry already crawled
                    continue
                _, entry_path = jitenon.scrape(entry_link)
                self._crawl_map[entry_id] = entry_path
        print(f"Finished scraping {len(self._crawl_map)} entries")

    def __parse_entry_id(self, entry_link):
        """Return the numeric id from an entry URL, or None.

        None means the link does not match ``_entry_id_pattern`` or the id
        was already recorded in ``_crawl_map`` (deduplication).
        """
        m = re.search(self._entry_id_pattern, entry_link)
        if not m:
            return None
        entry_id = int(m.group(1))
        if entry_id in self._crawl_map:
            return None
        return entry_id
class JitenonYojiCrawler(JitenonCrawler):
    """Crawler for the jitenon yoji (four-character idiom) dictionary site."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonYojiEntry
        self._yomi_exporter = JitenonYojiExporter()
        self._name = "jitenon-yoji"
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        # Fix: escape the dot so only a literal ".html" suffix matches
        # (an unescaped "." would also match e.g. "1234xhtml").
        self._entry_id_pattern = r"([0-9]+)\.html"
class JitenonKotowazaCrawler(JitenonCrawler):
    """Crawler for the jitenon kotowaza (proverb) dictionary site."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonKotowazaEntry
        self._yomi_exporter = JitenonKotowazaExporter()
        self._name = "jitenon-kotowaza"
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        # Fix: escape the dot so only a literal ".php" suffix matches
        # (an unescaped "." would also match e.g. "1234xphp").
        self._entry_id_pattern = r"([0-9]+)\.php"
|
|
@ -46,7 +46,7 @@ class Exporter:
|
||||||
return terms
|
return terms
|
||||||
|
|
||||||
def __make_dictionary(self, terms, index, tags):
|
def __make_dictionary(self, terms, index, tags):
|
||||||
print(f"Exporting {len(terms)} Yomichan terms to zip file...")
|
print(f"Exporting {len(terms)} Yomichan terms...")
|
||||||
self.__write_term_banks(terms)
|
self.__write_term_banks(terms)
|
||||||
self.__write_index(index)
|
self.__write_index(index)
|
||||||
self.__write_tag_bank(tags)
|
self.__write_tag_bank(tags)
|
||||||
|
@ -89,7 +89,7 @@ class Exporter:
|
||||||
base_filename = os.path.join(out_dir, filename)
|
base_filename = os.path.join(out_dir, filename)
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
shutil.make_archive(base_filename, archive_format, build_dir)
|
shutil.make_archive(base_filename, archive_format, build_dir)
|
||||||
print(f"Dictionary file exported to {out_filepath}")
|
print(f"Dictionary file saved to {out_filepath}")
|
||||||
|
|
||||||
def __rm_build_dir(self):
|
def __rm_build_dir(self):
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
|
|
Loading…
Reference in a new issue