jitenbot/bot/crawlers.py

62 lines
2.2 KiB
Python
Raw Normal View History

import re
from bs4 import BeautifulSoup
2023-04-11 17:01:23 +00:00
import bot.scraper as Scraper
import bot.yomichan.export as YomichanExport
2023-04-22 17:03:00 +00:00
2023-04-11 17:01:23 +00:00
from bot.entries.jitenon_kotowaza import JitenonKotowaza
from bot.entries.jitenon_yoji import JitenonYoji
2023-04-10 16:14:52 +00:00
def run_all():
    """Run every crawler defined in this module, yoji first, then kotowaza."""
    for crawl in (jitenon_yoji, jitenon_kotowaza):
        crawl()
def jitenon_yoji():
    """Crawl yoji.jitenon.jp and export the collected entries.

    Starts at the gojuon index page, follows every link found under
    ``.kana_area`` to a word-list page, then scrapes each entry linked
    under ``.word_box``.  Entries are keyed by the number that precedes
    ``.html`` in the entry URL, so a page reachable from several lists is
    scraped only once.  The resulting ``JitenonYoji`` objects are passed to
    ``YomichanExport.jitenon_yoji``.
    """
    # Compiled once outside the loops; the dot is escaped so only a literal
    # ".html" suffix is accepted.
    seq_pattern = re.compile(r"([0-9]+)\.html")
    seq_to_entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    # "a[href]" restricts the match to anchors that actually carry an href,
    # so the subscript lookups below cannot raise KeyError.
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        kana_doc = jitenon.scrape(gojuon_a["href"])
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a["href"]
            match = seq_pattern.search(kana_href)
            if match is None:
                # Not an entry link (no numeric page name); skip it instead
                # of crashing, as jitenon_kotowaza does.
                continue
            sequence = int(match.group(1))
            if sequence in seq_to_entries:
                # Already scraped via another word list.
                continue
            yoji_doc = jitenon.scrape(kana_href)
            entry = JitenonYoji(sequence)
            entry.add_document(yoji_doc)
            seq_to_entries[sequence] = entry
    entries = seq_to_entries.values()
    YomichanExport.jitenon_yoji(entries)
2023-04-10 16:14:52 +00:00
def jitenon_kotowaza():
    """Crawl kotowaza.jitenon.jp and export the collected entries.

    Starts at the gojuon index page, follows every link found under
    ``.kana_area`` to a word-list page, then scrapes each entry linked
    under ``.word_box``.  Entries are keyed by the number that precedes
    ``.php`` in the entry URL, so a page reachable from several lists is
    scraped only once; links without such a number are skipped.  The
    resulting ``JitenonKotowaza`` objects are passed to
    ``YomichanExport.jitenon_kotowaza``.
    """
    # Compiled once outside the loops; the dot is escaped so only a literal
    # ".php" suffix is accepted.
    seq_pattern = re.compile(r"([0-9]+)\.php")
    seq_to_entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    # "a[href]" restricts the match to anchors that actually carry an href,
    # so the subscript lookups below cannot raise KeyError.
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        kana_doc = jitenon.scrape(gojuon_a["href"])
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a["href"]
            match = seq_pattern.search(kana_href)
            if match is None:
                # Not an entry link (no numeric page name).
                continue
            sequence = int(match.group(1))
            if sequence in seq_to_entries:
                # Already scraped via another word list.
                continue
            kotowaza_doc = jitenon.scrape(kana_href)
            entry = JitenonKotowaza(sequence)
            entry.add_document(kotowaza_doc)
            seq_to_entries[sequence] = entry
    entries = seq_to_entries.values()
    YomichanExport.jitenon_kotowaza(entries)