jitenbot/bot/crawlers.py

import re
from bs4 import BeautifulSoup

import bot.scraper as Scraper

from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
from bot.yomichan.export import JitenonKotowazaExporter

from bot.entries.jitenon_yoji import JitenonYojiEntry
from bot.yomichan.export import JitenonYojiExporter


class Crawler():
    """Base class that builds entry objects from crawled documents.

    Subclasses are expected to populate ``_crawl_map`` and to set
    ``_entry_class`` (called with an entry id to create an entry) and
    ``_yomi_exporter`` (an object with an ``export(entries)`` method).
    """

    def __init__(self):
        # Maps entry id -> the value later handed to ``entry.add_document``.
        self._crawl_map = {}
        # Entries produced by the most recent ``make_entries`` call.
        self.__entries = []

    def make_entries(self):
        """Create one entry per item in ``_crawl_map``.

        The entry list is rebuilt from scratch on every call, so calling
        this method more than once does not duplicate entries.
        """
        entries_len = len(self._crawl_map)
        entries = []
        for idx, (entry_id, entry_path) in enumerate(self._crawl_map.items()):
            # '\r' keeps the progress counter on a single console line.
            update = f"Reading entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
            entry = self._entry_class(entry_id)
            entry.add_document(entry_path)
            entries.append(entry)
        # Move past the progress line.
        print()
        self.__entries = entries

    def make_yomichan_dictionary(self):
        """Pass the entries built by ``make_entries`` to the exporter."""
        self._yomi_exporter.export(self.__entries)


class JitenonCrawler(Crawler):
    """Crawler shared by the Jitenon dictionary sites.

    Subclasses are expected to set ``_name`` (label used in console
    output), ``_gojuon_url`` (index page the crawl starts from) and
    ``_entry_id_pattern`` (regex whose first group is the numeric entry
    id) before ``crawl`` is called.
    """

    def __init__(self):
        super().__init__()

    def crawl(self):
        """Scrape every entry page reachable from the gojuon index.

        Follows each link under ``.kana_area`` on the index page, then each
        link under ``.word_box`` on the resulting pages. For every link
        that yields a new entry id, the page is scraped and the second
        item returned by the scraper is stored in ``_crawl_map`` under
        that id.
        """
        print(f"Scraping {self._name}...")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # ``select`` takes a CSS selector; a ``href=True`` keyword (a
        # ``find_all`` idiom) does not filter here, so the href requirement
        # is expressed in the selector to avoid a KeyError on bare anchors.
        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a[href]"):
                entry_link = kana_a['href']
                entry_id = self.__parse_entry_id(entry_link)
                if entry_id is None:
                    # Not an entry link, or this entry was already crawled.
                    continue
                _, entry_path = jitenon.scrape(entry_link)
                self._crawl_map[entry_id] = entry_path
        entries_len = len(self._crawl_map)
        print(f"Finished scraping {entries_len} entries")

    def __parse_entry_id(self, entry_link):
        """Return the integer entry id found in ``entry_link``.

        Returns ``None`` when the link does not match
        ``_entry_id_pattern`` or when the id is already in ``_crawl_map``.
        """
        m = re.search(self._entry_id_pattern, entry_link)
        if not m:
            return None
        entry_id = int(m.group(1))
        if entry_id in self._crawl_map:
            return None
        return entry_id


class JitenonYojiCrawler(JitenonCrawler):
    """Crawler configured for Jitenon's yoji dictionary."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonYojiEntry
        self._yomi_exporter = JitenonYojiExporter()
        self._name = "jitenon-yoji"
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        # The dot is escaped so only a literal ".html" suffix matches; an
        # unescaped "." would accept any character in that position.
        self._entry_id_pattern = r"([0-9]+)\.html"


class JitenonKotowazaCrawler(JitenonCrawler):
    """Crawler configured for Jitenon's kotowaza dictionary."""

    def __init__(self):
        super().__init__()
        self._entry_class = JitenonKotowazaEntry
        self._yomi_exporter = JitenonKotowazaExporter()
        self._name = "jitenon-kotowaza"
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        # The dot is escaped so only a literal ".php" suffix matches; an
        # unescaped "." would accept any character in that position.
        self._entry_id_pattern = r"([0-9]+)\.php"
First version Support for Jitenon's yoji dictionary 2023-04-08 03:05:36 +00:00			`import re`
			`from bs4 import BeautifulSoup`

Reorganize file structure 2023-04-11 17:01:23 +00:00			`import bot.scraper as Scraper`
First version Support for Jitenon's yoji dictionary 2023-04-08 03:05:36 +00:00
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry`
			`from bot.yomichan.export import JitenonKotowazaExporter`
First version Support for Jitenon's yoji dictionary 2023-04-08 03:05:36 +00:00
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`from bot.entries.jitenon_yoji import JitenonYojiEntry`
			`from bot.yomichan.export import JitenonYojiExporter`
Add support Jitenon Kotowaza 2023-04-10 16:14:52 +00:00

Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`class Crawler():`
			`def __init__(self):`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`self._crawl_map = {}`
			`self.__entries = []`
Add support Jitenon Kotowaza 2023-04-10 16:14:52 +00:00
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`def make_entries(self):`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`entries_len = len(self._crawl_map)`
			`items = self._crawl_map.items()`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`for idx, (entry_id, entry_path) in enumerate(items):`
			`update = f"Reading entry {idx+1}/{entries_len}"`
			`print(update, end='\r', flush=True)`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`entry = self._entry_class(entry_id)`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`entry.add_document(entry_path)`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`self.__entries.append(entry)`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`print()`
Add support Jitenon Kotowaza 2023-04-10 16:14:52 +00:00
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`def make_yomichan_dictionary(self):`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`self._yomi_exporter.export(self.__entries)`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00

Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`class JitenonCrawler(Crawler):`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`def __init__(self):`
			`super().__init__()`

			`def crawl(self):`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`print(f"Scraping {self._name}...")`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`jitenon = Scraper.Jitenon()`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`gojuon_doc, _ = jitenon.scrape(self._gojuon_url)`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")`
			`for gojuon_a in gojuon_soup.select(".kana_area a", href=True):`
			`gojuon_href = gojuon_a['href']`
			`kana_doc, _ = jitenon.scrape(gojuon_href)`
			`kana_soup = BeautifulSoup(kana_doc, features="html.parser")`
			`for kana_a in kana_soup.select(".word_box a", href=True):`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`entry_link = kana_a['href']`
			`entry_id = self.__parse_entry_id(entry_link)`
			`if entry_id is None:`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`continue`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`_, entry_path = jitenon.scrape(entry_link)`
			`self._crawl_map[entry_id] = entry_path`
			`entries_len = len(self._crawl_map)`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`print(f"Finished scraping {entries_len} entries")`

Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`def __parse_entry_id(self, entry_link):`
			`m = re.search(self._entry_id_pattern, entry_link)`
			`if not m:`
			`return None`
			`entry_id = int(m.group(1))`
			`if entry_id in self._crawl_map:`
			`return None`
			`return entry_id`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00
			`class JitenonYojiCrawler(JitenonCrawler):`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00			`def __init__(self):`
			`super().__init__()`
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00			`self._entry_class = JitenonYojiEntry`
			`self._yomi_exporter = JitenonYojiExporter()`
			`self._name = "jitenon-yoji"`
			`self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"`
			`self._entry_id_pattern = r"([0-9]+).html"`
Organize crawler logic into classes 2023-04-22 22:56:52 +00:00
Split jitenon crawler class into subclasses 2023-04-22 23:32:11 +00:00
			`class JitenonKotowazaCrawler(JitenonCrawler):`
			`def __init__(self):`
			`super().__init__()`
			`self._entry_class = JitenonKotowazaEntry`
			`self._yomi_exporter = JitenonKotowazaExporter()`
			`self._name = "jitenon-kotowaza"`
			`self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"`
			`self._entry_id_pattern = r"([0-9]+).php"`