Add support for Jitenon Kotowaza

This commit is contained in:
stephenmk 2023-04-10 11:14:52 -05:00
parent 79632843cb
commit 16d694d2d2
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
4 changed files with 94 additions and 18 deletions

View file

@ -4,9 +4,15 @@ from bs4 import BeautifulSoup
import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji
from jitenon_kotowaza import JitenonKotowaza
def jitenon_yoji_crawler():
def run_all():
    """Run every supported crawler, one after another."""
    for crawler in (jitenon_yoji, jitenon_kotowaza):
        crawler()
def jitenon_yoji():
entries = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
@ -24,7 +30,6 @@ def jitenon_yoji_crawler():
entry = JitenonYoji(sequence)
entry.add_document(yoji_doc)
entries[sequence] = entry
terms = []
attribution = ""
modified_date = None
@ -43,3 +48,46 @@ def jitenon_yoji_crawler():
"attribution": attribution,
}
Yomichan.create_zip(terms, index)
def jitenon_kotowaza():
    """Crawl 故事・ことわざ・慣用句オンライン and export it as a Yomichan zip.

    Walks the gojuon index page, follows each kana index, scrapes every
    entry page exactly once (deduplicated by its numeric sequence id),
    then builds the term list and index metadata for Yomichan.create_zip.
    """
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    # BUGFIX: select() takes a CSS selector only; the find_all-style kwarg
    # href=True is not supported there. Use the [href] attribute selector
    # so anchors without an href can never raise KeyError below.
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana_doc = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            # Entry URLs end in "<sequence>.php"; the dot is escaped so it
            # cannot match an arbitrary character.
            m = re.search(r"([0-9]+)\.php", kana_href)
            if m is None:
                # Not an entry page (e.g. navigation link); skip it.
                continue
            sequence = int(m.group(1))
            if sequence in entries:
                # Already scraped via another kana index page.
                continue
            kotowaza_doc = jitenon.scrape(kana_href)
            entry = JitenonKotowaza(sequence)
            entry.add_document(kotowaza_doc)
            entries[sequence] = entry
    terms = []
    attribution = ""
    modified_date = None
    for entry in entries.values():
        # The dictionary revision/attribution come from the most recently
        # modified entry.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())
    index = {
        "title": "故事・ことわざ・慣用句オンライン",
        "revision": f"jitenon-kotowaza.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://kotowaza.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)

View file

@ -16,7 +16,34 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from crawlers import jitenon_yoji_crawler
import argparse
import crawlers as Crawlers
# Maps each CLI `target` value to the crawler function that handles it;
# keys double as the argparse choices in parse_args().
choices = {
    'all': Crawlers.run_all,
    'jitenon-yoji': Crawlers.jitenon_yoji,
    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
}
def parse_args():
    """Parse the command line; ``target`` names the crawler to run."""
    arg_parser = argparse.ArgumentParser(
        prog='jitenbot',
        description='Crawl and convert Japanese web dictionaries.')
    arg_parser.add_argument(
        'target',
        choices=choices.keys(),
        help='website to crawl')
    return arg_parser.parse_args()
def main():
    """Entry point: run the crawler selected on the command line."""
    target = parse_args().target
    choices[target]()
if __name__ == "__main__":
jitenon_yoji_crawler()
main()

View file

@ -8,7 +8,7 @@ import util as Util
class JitenonYoji:
columns = {
"四字熟語": ["yojijukugo", ""],
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
@ -79,7 +79,7 @@ class JitenonYoji:
def __headwords(self):
words = []
for yomikata in self.__yomikatas():
headword = [self.yojijukugo, yomikata]
headword = [self.expression, yomikata]
if headword in words:
words.remove(headword)
words.append(headword)
@ -90,16 +90,17 @@ class JitenonYoji:
return words
def __yomikatas(self):
m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
yomikata = self.yomikata.replace(" ", "")
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
if m:
return [self.yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
return [yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
if m:
return [m.group(1)]
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", self.yomikata)
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
if m:
return Util.expand_shouryaku(self.yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", self.yomikata)
return Util.expand_shouryaku(yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
if m:
yomikatas = [m.group(1)]
alts = m.group(2).split("/")

View file

@ -14,8 +14,8 @@ from datetime import datetime
class Scraper():
def __init__(self):
    """Compile the allowed-netloc regex for self.domain and open a session.

    The pattern accepts the bare domain or any subdomain of it; the
    subclass is expected to define ``self.domain`` before this runs.
    """
    self.netloc_re = re.compile(
        r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
    )
    self.__set_session()
def scrape(self, urlstring):
@ -80,20 +80,20 @@ class Scraper():
file_contents = None
return file_contents
def __get(self, url):
def __get(self, urlstring):
delay = 10
time.sleep(delay)
now = datetime.now().strftime("%H:%M:%S")
print(f"{now} scraping {url.geturl()} ...", end='')
print(f"{now} scraping {urlstring} ...", end='')
try:
response = self.session.get(url, timeout=10)
response = self.session.get(urlstring, timeout=10)
print("OK")
return response.text
except Exception:
print("failed")
print("resetting session and trying again")
self.__set_session()
response = self.session.get(url, timeout=10)
response = self.session.get(urlstring, timeout=10)
return response.text