Add support for Jitenon Kotowaza

stephenmk 2023-04-10 11:14:52 -05:00
parent 79632843cb
commit 16d694d2d2
4 changed files with 94 additions and 18 deletions

crawlers.py

@@ -4,9 +4,15 @@ from bs4 import BeautifulSoup
 import scraper as Scraper
 import yomichan as Yomichan
 from jitenon_yoji import JitenonYoji
+from jitenon_kotowaza import JitenonKotowaza
 
 
-def jitenon_yoji_crawler():
+def run_all():
+    jitenon_yoji()
+    jitenon_kotowaza()
+
+
+def jitenon_yoji():
     entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
@@ -24,7 +30,6 @@ def jitenon_yoji_crawler():
             entry = JitenonYoji(sequence)
             entry.add_document(yoji_doc)
             entries[sequence] = entry
-
     terms = []
     attribution = ""
     modified_date = None
@@ -43,3 +48,46 @@ def jitenon_yoji_crawler():
         "attribution": attribution,
     }
     Yomichan.create_zip(terms, index)
+
+
+def jitenon_kotowaza():
+    entries = {}
+    jitenon = Scraper.Jitenon()
+    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+        gojuon_href = gojuon_a['href']
+        kana_doc = jitenon.scrape(gojuon_href)
+        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+        for kana_a in kana_soup.select(".word_box a", href=True):
+            kana_href = kana_a['href']
+            m = re.search(r"([0-9]+).php", kana_href)
+            if m:
+                sequence = int(m.group(1))
+            else:
+                # print(f"Skipping {kana_href}")
+                continue
+            if sequence in entries:
+                continue
+            kotowaza_doc = jitenon.scrape(kana_href)
+            entry = JitenonKotowaza(sequence)
+            entry.add_document(kotowaza_doc)
+            entries[sequence] = entry
+    terms = []
+    attribution = ""
+    modified_date = None
+    for entry in entries.values():
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    index = {
+        "title": "故事・ことわざ・慣用句オンライン",
+        "revision": f"jitenon-kotowaza.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://kotowaza.jitenon.jp/",
+        "attribution": attribution,
+    }
+    Yomichan.create_zip(terms, index)
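
Note: jitenon_kotowaza() keys each entry by the numeric ID embedded in its URL via re.search(r"([0-9]+).php", kana_href); the unescaped dot matches any single character before "php", which is harmless for these URLs. A standalone check of that pattern (the URLs below are made-up examples, not scraped data):

    import re

    # Made-up URLs in the shape the crawler encounters
    hrefs = [
        "https://kotowaza.jitenon.jp/kotowaza/102.php",
        "https://kotowaza.jitenon.jp/cat/gojuon.php",  # no numeric ID: skipped
    ]
    for href in hrefs:
        m = re.search(r"([0-9]+).php", href)
        print(href, "->", int(m.group(1)) if m else "skip")
    # .../kotowaza/102.php -> 102
    # .../cat/gojuon.php -> skip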

jitenbot.py

@@ -16,7 +16,34 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
-from crawlers import jitenon_yoji_crawler
+import argparse
+import crawlers as Crawlers
+
+choices = {
+    'all': Crawlers.run_all,
+    'jitenon-yoji': Crawlers.jitenon_yoji,
+    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog='jitenbot',
+        description='Crawl and convert Japanese web dictionaries.')
+    parser.add_argument(
+        'target',
+        choices=choices.keys(),
+        help='website to crawl')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    crawler = choices[args.target]
+    crawler()
+
 
 if __name__ == "__main__":
-    jitenon_yoji_crawler()
+    main()
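
With the argparse front end in place, each dictionary is selected by target name, and 'all' dispatches to both crawlers through a plain dict lookup. Assuming the entry-point script is named jitenbot.py (the filename is not visible in this view), the new interface would be invoked as:

    python jitenbot.py jitenon-kotowaza
    python jitenbot.py all
    python jitenbot.py --help    # argparse generates this automatically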

jitenon_yoji.py

@@ -8,7 +8,7 @@ import util as Util
 class JitenonYoji:
     columns = {
-        "四字熟語": ["yojijukugo", ""],
+        "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
         "意味": ["imi", ""],
         "出典": ["shutten", ""],
@@ -79,7 +79,7 @@ class JitenonYoji:
     def __headwords(self):
         words = []
         for yomikata in self.__yomikatas():
-            headword = [self.yojijukugo, yomikata]
+            headword = [self.expression, yomikata]
             if headword in words:
                 words.remove(headword)
             words.append(headword)
@@ -90,16 +90,17 @@ class JitenonYoji:
         return words
 
     def __yomikatas(self):
-        m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
+        yomikata = self.yomikata.replace(" ", "")
+        m = re.search(r"^[ぁ-ヿ]+$", yomikata)
         if m:
-            return [self.yomikata]
-        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
+            return [yomikata]
+        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
         if m:
             return [m.group(1)]
-        m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", self.yomikata)
+        m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
         if m:
-            return Util.expand_shouryaku(self.yomikata)
-        m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", self.yomikata)
+            return Util.expand_shouryaku(yomikata)
+        m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
         if m:
             yomikatas = [m.group(1)]
             alts = m.group(2).split("/")
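
This hunk normalizes the reading once (yomikata = self.yomikata.replace(" ", "")) and then matches every pattern against the local variable instead of self.yomikata, so stray spaces no longer defeat the kana-only patterns. A standalone re-creation of the first two branches, fed made-up readings, to show the effect:

    import re

    def first_branches(yomikata):
        # mirrors the start of __yomikatas() after this commit
        yomikata = yomikata.replace(" ", "")
        if re.search(r"^[ぁ-ヿ]+$", yomikata):
            return [yomikata]
        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
        if m:
            return [m.group(1)]
        return None

    print(first_branches("いし ばし"))                # -> ['いしばし']
    print(first_branches("あいうえお<br/>かきくけこ"))  # -> ['あいうえお']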

scraper.py

@@ -14,8 +14,8 @@ from datetime import datetime
 class Scraper():
     def __init__(self):
-        self.netloc_re = \
-            re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
+        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
+        self.netloc_re = re.compile(pattern)
         self.__set_session()
 
     def scrape(self, urlstring):
@@ -80,20 +80,20 @@ class Scraper():
             file_contents = None
         return file_contents
 
-    def __get(self, url):
+    def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
         now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {url.geturl()} ...", end='')
+        print(f"{now} scraping {urlstring} ...", end='')
         try:
-            response = self.session.get(url, timeout=10)
+            response = self.session.get(urlstring, timeout=10)
             print("OK")
             return response.text
         except Exception:
             print("failed")
             print("resetting session and trying again")
             self.__set_session()
-            response = self.session.get(url, timeout=10)
+            response = self.session.get(urlstring, timeout=10)
             return response.text
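
Splitting the netloc regex into a pattern variable plus re.compile() keeps the line length down without changing behavior: the pattern still accepts the bare domain or any subdomain. Likewise, renaming __get()'s parameter from url to urlstring reflects that the method now receives and logs a plain string rather than a parsed URL object (the old code called url.geturl()). A standalone check of the pattern, assuming self.domain holds the regex-escaped site domain, e.g. r"jitenon\.jp" (the subclass that sets it is not shown in this diff):

    import re

    domain = r"jitenon\.jp"  # assumed value of self.domain
    pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
    netloc_re = re.compile(pattern)

    for netloc in ["jitenon.jp", "yoji.jitenon.jp",
                   "kotowaza.jitenon.jp", "example.com"]:
        print(netloc, "->", bool(netloc_re.match(netloc)))
    # jitenon.jp -> True, yoji.jitenon.jp -> True,
    # kotowaza.jitenon.jp -> True, example.com -> False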