b03978d1f7
This is a clumsy way of doing it (since it would be better to have a wrapper function append the timestamp), but that will be taken care of when the logging logic is all overhauled anyway.
41 lines
1.7 KiB
Python
41 lines
1.7 KiB
Python
import re
|
|
from bs4 import BeautifulSoup
|
|
|
|
from bot.time import timestamp
|
|
from bot.crawlers.base.crawler import BaseCrawler
|
|
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
|
|
|
|
|
|
class Crawler(BaseCrawler):
    """Crawler for kokugo.jitenon.jp dictionary entry pages.

    Walks the gojuon (kana syllabary) index, follows each kana group's
    paginated word lists, and records every entry page it scrapes into
    the inherited ``self._page_map`` (page id -> local file path).
    """

    def __init__(self, target):
        super().__init__(target)
        # Top-level index listing all kana groups.
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        # Used by the inherited _parse_page_id to extract the numeric id
        # from an entry URL like .../word/p12345
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        """Scrape all entry pages reachable from the gojuon index.

        ``page_dir`` is part of the BaseCrawler interface; the scraper
        decides the on-disk location, so it is not used directly here.
        Populates ``self._page_map`` with {page_id: page_path}.
        """
        print(f"{timestamp()} Scraping {self._gojuon_url}")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # FIX: the original passed href=True to select(); that keyword
        # belongs to find_all(), and select() forwards unknown kwargs to
        # soupsieve, which rejects them. The CSS attribute selector
        # [href] expresses the same "anchor with an href" filter.
        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
            gojuon_href = gojuon_a['href']
            # Each kana group is paginated; the true page count is only
            # known after fetching page 1, so start with max = 1 and
            # update it from the 全N件 ("N results total") banner.
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                # NOTE(review): assumes a .page_total element is always
                # present; a missing element would raise AttributeError.
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                # Same find_all-vs-select fix as above.
                for kana_a in kana_soup.select(".word_box a[href]"):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        # Link does not match _page_id_pattern; not an
                        # entry page.
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"\n{timestamp()} Found {pages_len} entry pages")