# jitenbot/bot/crawlers.py
import re
from bs4 import BeautifulSoup

import bot.scraper as Scraper
from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
from bot.yomichan.export import JitenonKotowazaExporter
from bot.entries.jitenon_yoji import JitenonYojiEntry
from bot.yomichan.export import JitenonYojiExporter

class Crawler():
    """Base crawler: collects scraped documents keyed by entry ID, then
    turns them into entry objects and exports them.

    Subclasses are expected to set ``_entry_class`` (constructor taking an
    entry ID, with an ``add_document`` method) and ``_yomi_exporter``
    (object with an ``export`` method) before these methods are called.
    """

    def __init__(self):
        # entry_id -> scraped document path, filled in by a subclass crawl()
        self._crawl_map = {}
        self.__entries = []

    def make_entries(self):
        """Instantiate one entry per crawled document, with progress output."""
        entries_len = len(self._crawl_map)
        for idx, (entry_id, entry_path) in enumerate(self._crawl_map.items()):
            print(f"Reading entry {idx+1}/{entries_len}", end='\r', flush=True)
            new_entry = self._entry_class(entry_id)
            new_entry.add_document(entry_path)
            self.__entries.append(new_entry)
        # Terminate the carriage-return progress line.
        print()

    def make_yomichan_dictionary(self):
        """Hand every parsed entry to the Yomichan exporter."""
        self._yomi_exporter.export(self.__entries)
class JitenonCrawler(Crawler):
    """Crawler for jitenon.jp dictionary sites.

    Walks the gojuon index page to each kana page, then to each entry
    page, recording entry_id -> scraped document path in ``_crawl_map``.
    Subclasses must set ``_name``, ``_gojuon_url`` and
    ``_entry_id_pattern`` (regex with the numeric ID in group 1).
    """

    def __init__(self):
        super().__init__()

    def crawl(self):
        """Scrape every entry reachable from the gojuon index."""
        print(f"Scraping {self._name}...")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # BUGFIX: `select()` accepts a CSS selector only — it has no
        # `href=True` keyword (that is a `find_all` idiom). Restrict to
        # anchors that actually carry a link via the `[href]` attribute
        # selector instead.
        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a[href]"):
                entry_link = kana_a['href']
                entry_id = self.__parse_entry_id(entry_link)
                if entry_id is None:
                    # No ID in the link, or entry already crawled.
                    continue
                _, entry_path = jitenon.scrape(entry_link)
                self._crawl_map[entry_id] = entry_path
        entries_len = len(self._crawl_map)
        print(f"Finished scraping {entries_len} entries")

    def __parse_entry_id(self, entry_link):
        """Return the numeric entry ID in entry_link, or None when the
        link does not match the pattern or the ID was already seen."""
        m = re.search(self._entry_id_pattern, entry_link)
        if not m:
            return None
        entry_id = int(m.group(1))
        if entry_id in self._crawl_map:
            return None
        return entry_id
class JitenonYojiCrawler(JitenonCrawler):
    """Crawler for the yoji.jitenon.jp four-character-idiom dictionary."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonYojiEntry
        self._yomi_exporter = JitenonYojiExporter()
        self._name = "jitenon-yoji"
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        # BUGFIX: escape the dot — the previous r"([0-9]+).html" let any
        # character stand before "html"; only a literal ".html" suffix
        # should match.
        self._entry_id_pattern = r"([0-9]+)\.html"
class JitenonKotowazaCrawler(JitenonCrawler):
    """Crawler for the kotowaza.jitenon.jp proverb dictionary."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonKotowazaEntry
        self._yomi_exporter = JitenonKotowazaExporter()
        self._name = "jitenon-kotowaza"
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        # BUGFIX: escape the dot — the previous r"([0-9]+).php" let any
        # character stand before "php"; only a literal ".php" suffix
        # should match.
        self._entry_id_pattern = r"([0-9]+)\.php"