jitenbot/crawlers.py

46 lines
1.6 KiB
Python
Raw Normal View History

import re
from bs4 import BeautifulSoup
import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji
def jitenon_yoji_crawler():
entries = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
gojuon_href = gojuon_a['href']
kana_doc = jitenon.scrape(gojuon_href)
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
for kana_a in kana_soup.select(".word_box a", href=True):
kana_href = kana_a['href']
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
if sequence in entries:
continue
yoji_doc = jitenon.scrape(kana_href)
entry = JitenonYoji(sequence)
entry.add_document(yoji_doc)
entries[sequence] = entry
terms = []
attribution = ""
modified_date = None
for entry in entries.values():
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
index = {
"title": "四字熟語辞典オンライン",
"revision": f"jitenon-yoji.{modified_date}",
"sequenced": True,
"format": 3,
"url": "https://yoji.jitenon.jp/",
"attribution": attribution,
}
Yomichan.create_zip(terms, index)