2023-04-08 03:05:36 +00:00
|
|
|
import re
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import scraper as Scraper
|
|
|
|
import yomichan as Yomichan
|
|
|
|
from jitenon_yoji import JitenonYoji
|
2023-04-10 16:14:52 +00:00
|
|
|
from jitenon_kotowaza import JitenonKotowaza
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
|
|
|
2023-04-10 16:14:52 +00:00
|
|
|
def run_all():
    """Run every dictionary builder in order: yoji first, then kotowaza."""
    for build_dictionary in (jitenon_yoji, jitenon_kotowaza):
        build_dictionary()
|
|
|
|
|
|
|
|
|
|
|
|
def jitenon_yoji():
    """Scrape yoji.jitenon.jp and pass the collected terms to Yomichan.

    Walks the gojuon index page, follows every kana link found there, and
    then every word link on each kana page. Each word page is wrapped in a
    ``JitenonYoji`` entry keyed by the numeric id taken from its URL.
    Finally the terms of all entries, together with an index dictionary,
    are handed to ``Yomichan.create_zip``.

    The index ``revision`` and ``attribution`` are taken from the entry
    with the greatest ``modified_date``.
    """
    # Compiled once; the dot is escaped so only a literal ".html" matches.
    sequence_pattern = re.compile(r"([0-9]+)\.html")

    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
        gojuon_href = gojuon_a['href']
        kana_doc = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
            match = sequence_pattern.search(kana_href)
            if match is None:
                # No numeric page id in this href, so it cannot be keyed as
                # an entry. Skip it instead of crashing on ``None.group``.
                continue
            sequence = int(match.group(1))
            if sequence in entries:
                # Same page already scraped via another link.
                continue
            yoji_doc = jitenon.scrape(kana_href)
            entry = JitenonYoji(sequence)
            entry.add_document(yoji_doc)
            entries[sequence] = entry

    terms = []
    attribution = ""
    modified_date = None
    for entry in entries.values():
        # Track the most recently modified entry; its date and attribution
        # are used for the dictionary index.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())

    index = {
        "title": "四字熟語辞典オンライン",
        "revision": f"jitenon-yoji.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
|
2023-04-10 16:14:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
def jitenon_kotowaza():
    """Scrape kotowaza.jitenon.jp and pass the collected terms to Yomichan.

    Walks the gojuon index page, follows every kana link found there, and
    then every word link on each kana page. Each word page is wrapped in a
    ``JitenonKotowaza`` entry keyed by the numeric id taken from its URL;
    links without such an id are skipped. Finally the terms of all entries,
    together with an index dictionary, are handed to
    ``Yomichan.create_zip``.

    The index ``revision`` and ``attribution`` are taken from the entry
    with the greatest ``modified_date``.
    """
    # Compiled once; the dot is escaped so only a literal ".php" matches.
    sequence_pattern = re.compile(r"([0-9]+)\.php")

    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
        gojuon_href = gojuon_a['href']
        kana_doc = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
            match = sequence_pattern.search(kana_href)
            if match is None:
                # No numeric page id in this href, so it cannot be keyed as
                # an entry; skip it.
                continue
            sequence = int(match.group(1))
            if sequence in entries:
                # Same page already scraped via another link.
                continue
            kotowaza_doc = jitenon.scrape(kana_href)
            entry = JitenonKotowaza(sequence)
            entry.add_document(kotowaza_doc)
            entries[sequence] = entry

    terms = []
    attribution = ""
    modified_date = None
    for entry in entries.values():
        # Track the most recently modified entry; its date and attribution
        # are used for the dictionary index.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())

    index = {
        "title": "故事・ことわざ・慣用句オンライン",
        "revision": f"jitenon-kotowaza.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://kotowaza.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
|