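"""Crawler for the kokugo.jitenon.jp dictionary.

Starting from the gojūon index, it follows every kana listing, pages
through each listing, and scrapes each linked entry page with the Jitenon
scraper, storing the path returned by the scraper under the entry's
numeric page ID.
"""
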
import re

from bs4 import BeautifulSoup

from bot.crawlers.base.crawler import BaseCrawler
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper


class Crawler(BaseCrawler):
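    """Collects entry pages from kokugo.jitenon.jp.

    `self._page_map` and `self._parse_page_id` are not defined in this
    module and are assumed to be provided by `BaseCrawler`.
    """
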
    def __init__(self, target):
        super().__init__(target)
        # Top-level gojūon (kana) index of the dictionary.
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        # Entry URLs end in "word/p<number>"; that number serves as the page ID.
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
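        """Scrape every dictionary entry reachable from the gojūon index.

        `page_dir` is not used in this method; it is presumably part of
        the `BaseCrawler.collect_pages` interface.
        """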
        jitenon = JitenonScraper()
        # Fetch and parse the gojūon index page.
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
|
gojuon_href = gojuon_a['href']
|
|
max_kana_page = 1
|
|
current_kana_page = 1
|
|
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                # The listing header reports its total as 全N件; that figure is
                # used as the upper bound for pagination.
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
for kana_a in kana_soup.select(".word_box a", href=True):
|
|
page_link = kana_a['href']
|
|
page_id = self._parse_page_id(page_link)
|
|
if page_id is None:
|
|
continue
|
|
_, page_path = jitenon.scrape(page_link)
|
|
self._page_map[page_id] = page_path
|
|
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")
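
# A hypothetical usage sketch (the real entry point lives elsewhere in the
# bot package; the value of `target` depends on what BaseCrawler expects):
#
#     crawler = Crawler(target)
#     crawler.collect_pages(page_dir)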