80 lines
3.1 KiB
Python
80 lines
3.1 KiB
Python
import re
|
|
from bs4 import BeautifulSoup
|
|
|
|
import bot.scraper as Scraper
|
|
import bot.yomichan.export as YomichanExport
|
|
from bot.entries.jitenon_kotowaza import JitenonKotowaza
|
|
from bot.entries.jitenon_yoji import JitenonYoji
|
|
|
|
|
|
def run_all():
|
|
jitenon_yoji()
|
|
jitenon_kotowaza()
|
|
|
|
|
|
def jitenon_yoji():
|
|
print("Scraping jitenon-yoji...")
|
|
entry_id_to_entry_path = {}
|
|
jitenon = Scraper.Jitenon()
|
|
gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
|
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
|
gojuon_href = gojuon_a['href']
|
|
kana_doc, _ = jitenon.scrape(gojuon_href)
|
|
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
|
|
for kana_a in kana_soup.select(".word_box a", href=True):
|
|
kana_href = kana_a['href']
|
|
entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
|
if entry_id in entry_id_to_entry_path:
|
|
continue
|
|
_, entry_path = jitenon.scrape(kana_href)
|
|
entry_id_to_entry_path[entry_id] = entry_path
|
|
entries_len = len(entry_id_to_entry_path)
|
|
print(f"Finished scraping {entries_len} entries")
|
|
entries = []
|
|
items = entry_id_to_entry_path.items()
|
|
for idx, (entry_id, entry_path) in enumerate(items):
|
|
update = f"Reading entry {idx+1}/{entries_len}"
|
|
print(update, end='\r', flush=True)
|
|
entry = JitenonYoji(entry_id)
|
|
entry.add_document(entry_path)
|
|
entries.append(entry)
|
|
print()
|
|
exporter = YomichanExport.JitenonYojiExporter()
|
|
exporter.export(entries)
|
|
|
|
|
|
def jitenon_kotowaza():
|
|
print("Scraping jitenon-kotowaza...")
|
|
entry_id_to_entry_path = {}
|
|
jitenon = Scraper.Jitenon()
|
|
gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
|
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
|
gojuon_href = gojuon_a['href']
|
|
kana_doc, _ = jitenon.scrape(gojuon_href)
|
|
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
|
|
for kana_a in kana_soup.select(".word_box a", href=True):
|
|
kana_href = kana_a['href']
|
|
m = re.search(r"([0-9]+).php", kana_href)
|
|
if not m:
|
|
continue
|
|
entry_id = int(m.group(1))
|
|
if entry_id in entry_id_to_entry_path:
|
|
continue
|
|
_, entry_path = jitenon.scrape(kana_href)
|
|
entry_id_to_entry_path[entry_id] = entry_path
|
|
entries_len = len(entry_id_to_entry_path)
|
|
print(f"Finished scraping {entries_len} entries")
|
|
entries = []
|
|
items = entry_id_to_entry_path.items()
|
|
for idx, (entry_id, entry_path) in enumerate(items):
|
|
update = f"Reading entry {idx+1}/{entries_len}"
|
|
print(update, end='\r', flush=True)
|
|
entry = JitenonKotowaza(entry_id)
|
|
entry.add_document(entry_path)
|
|
entries.append(entry)
|
|
print()
|
|
exporter = YomichanExport.JitenonKotowazaExporter()
|
|
exporter.export(entries)
|