jitenbot/bot/crawlers.py
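"""Crawlers for the jitenon.jp four-character idiom (yoji) and proverb
(kotowaza) dictionaries.

Each crawler walks the site's gojuon index pages, fetches every entry page
through bot.scraper.Jitenon, builds entry objects from the saved documents,
and hands them to the matching Yomichan exporter. Typical usage is assumed
to be calling run_all(), or one of the per-dictionary functions directly.
"""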
import re

from bs4 import BeautifulSoup

import bot.scraper as Scraper
import bot.yomichan.export as YomichanExport
from bot.entries.jitenon_kotowaza import JitenonKotowaza
from bot.entries.jitenon_yoji import JitenonYoji


def run_all():
    jitenon_yoji()
    jitenon_kotowaza()


def jitenon_yoji():
    """Scrape yoji.jitenon.jp and export it as a Yomichan dictionary."""
    print("Scraping jitenon-yoji...")
    entry_id_to_entry_path = {}
    jitenon = Scraper.Jitenon()

    # The gojuon index links to one page per kana, and each kana page links
    # to the individual entry pages.
    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana_doc, _ = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            # Entry URLs end in "<id>.html"; skip anything that doesn't match.
            m = re.search(r"([0-9]+)\.html", kana_href)
            if not m:
                continue
            entry_id = int(m.group(1))
            if entry_id in entry_id_to_entry_path:
                continue
            _, entry_path = jitenon.scrape(kana_href)
            entry_id_to_entry_path[entry_id] = entry_path

    entries_len = len(entry_id_to_entry_path)
    print(f"Finished scraping {entries_len} entries")

    entries = []
    items = entry_id_to_entry_path.items()
    for idx, (entry_id, entry_path) in enumerate(items):
        update = f"Reading entry {idx+1}/{entries_len}"
        print(update, end='\r', flush=True)
        entry = JitenonYoji(entry_id)
        entry.add_document(entry_path)
        entries.append(entry)
    print()

    exporter = YomichanExport.JitenonYojiExporter()
    exporter.export(entries)


def jitenon_kotowaza():
    """Scrape kotowaza.jitenon.jp and export it as a Yomichan dictionary."""
    print("Scraping jitenon-kotowaza...")
    entry_id_to_entry_path = {}
    jitenon = Scraper.Jitenon()

    # Same index structure as the yoji site, but entry pages end in ".php".
    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana_doc, _ = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            m = re.search(r"([0-9]+)\.php", kana_href)
            if not m:
                continue
            entry_id = int(m.group(1))
            if entry_id in entry_id_to_entry_path:
                continue
            _, entry_path = jitenon.scrape(kana_href)
            entry_id_to_entry_path[entry_id] = entry_path

    entries_len = len(entry_id_to_entry_path)
    print(f"Finished scraping {entries_len} entries")

    entries = []
    items = entry_id_to_entry_path.items()
    for idx, (entry_id, entry_path) in enumerate(items):
        update = f"Reading entry {idx+1}/{entries_len}"
        print(update, end='\r', flush=True)
        entry = JitenonKotowaza(entry_id)
        entry.add_document(entry_path)
        entries.append(entry)
    print()

    exporter = YomichanExport.JitenonKotowazaExporter()
    exporter.export(entries)