From 16d694d2d25145f7b331dd31d5087c0c05feb592 Mon Sep 17 00:00:00 2001
From: stephenmk
Date: Mon, 10 Apr 2023 11:14:52 -0500
Subject: [PATCH] Add support for Jitenon Kotowaza

---
 crawlers.py     | 52 ++++++++++++++++++++++++++++++++++++++++++++++--
 jitenbot.py     | 31 +++++++++++++++++++++++++++--
 jitenon_yoji.py | 17 ++++++++--------
 scraper.py      | 12 ++++++------
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/crawlers.py b/crawlers.py
index 76c2c84..ebaf0ed 100644
--- a/crawlers.py
+++ b/crawlers.py
@@ -4,9 +4,15 @@ from bs4 import BeautifulSoup
 import scraper as Scraper
 import yomichan as Yomichan
 from jitenon_yoji import JitenonYoji
+from jitenon_kotowaza import JitenonKotowaza
 
 
-def jitenon_yoji_crawler():
+def run_all():
+    jitenon_yoji()
+    jitenon_kotowaza()
+
+
+def jitenon_yoji():
     entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
@@ -24,7 +30,6 @@ def jitenon_yoji_crawler():
             entry = JitenonYoji(sequence)
             entry.add_document(yoji_doc)
             entries[sequence] = entry
-
     terms = []
     attribution = ""
     modified_date = None
@@ -43,3 +48,46 @@
             "attribution": attribution,
         }
     Yomichan.create_zip(terms, index)
+
+
+def jitenon_kotowaza():
+    entries = {}
+    jitenon = Scraper.Jitenon()
+    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
+        gojuon_href = gojuon_a['href']
+        kana_doc = jitenon.scrape(gojuon_href)
+        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+        for kana_a in kana_soup.select(".word_box a[href]"):
+            kana_href = kana_a['href']
+            m = re.search(r"([0-9]+)\.php", kana_href)
+            if m:
+                sequence = int(m.group(1))
+            else:
+                # print(f"Skipping {kana_href}")
+                continue
+            if sequence in entries:
+                continue
+            kotowaza_doc = jitenon.scrape(kana_href)
+            entry = JitenonKotowaza(sequence)
+            entry.add_document(kotowaza_doc)
+            entries[sequence] = entry
+    terms = []
+    attribution = ""
+    modified_date = None
+    for entry in entries.values():
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    index = {
+        "title": "故事・ことわざ・慣用句オンライン",
+        "revision": f"jitenon-kotowaza.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://kotowaza.jitenon.jp/",
+        "attribution": attribution,
+    }
+    Yomichan.create_zip(terms, index)
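Note: both crawlers key entries by the numeric ID embedded in each entry URL,
so pages already seen within a crawl are skipped. A quick illustration of the
extraction (the href below is a made-up example of jitenon's link format, not
a real entry):

    import re

    # Hypothetical entry link as found in a ".word_box" list.
    kana_href = "https://kotowaza.jitenon.jp/kotowaza/1234.php"
    m = re.search(r"([0-9]+)\.php", kana_href)
    if m:
        sequence = int(m.group(1))  # 1234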
""" -from crawlers import jitenon_yoji_crawler +import argparse +import crawlers as Crawlers + + +choices = { + 'all': Crawlers.run_all, + 'jitenon-yoji': Crawlers.jitenon_yoji, + 'jitenon-kotowaza': Crawlers.jitenon_kotowaza, +} + + +def parse_args(): + parser = argparse.ArgumentParser( + prog='jitenbot', + description='Crawl and convert Japanese web dictionaries.') + parser.add_argument( + 'target', + choices=choices.keys(), + help='website to crawl') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + crawler = choices[args.target] + crawler() + if __name__ == "__main__": - jitenon_yoji_crawler() + main() diff --git a/jitenon_yoji.py b/jitenon_yoji.py index 5b48e6d..4345dca 100644 --- a/jitenon_yoji.py +++ b/jitenon_yoji.py @@ -8,7 +8,7 @@ import util as Util class JitenonYoji: columns = { - "四字熟語": ["yojijukugo", ""], + "四字熟語": ["expression", ""], "読み方": ["yomikata", ""], "意味": ["imi", ""], "出典": ["shutten", ""], @@ -79,7 +79,7 @@ class JitenonYoji: def __headwords(self): words = [] for yomikata in self.__yomikatas(): - headword = [self.yojijukugo, yomikata] + headword = [self.expression, yomikata] if headword in words: words.remove(headword) words.append(headword) @@ -90,16 +90,17 @@ class JitenonYoji: return words def __yomikatas(self): - m = re.search(r"^[ぁ-ヿ]+$", self.yomikata) + yomikata = self.yomikata.replace(" ", "") + m = re.search(r"^[ぁ-ヿ]+$", yomikata) if m: - return [self.yomikata] - m = re.search(r"^([ぁ-ヿ]+)
", self.yomikata) + return [yomikata] + m = re.search(r"^([ぁ-ヿ]+)
", yomikata) if m: return [m.group(1)] - m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", self.yomikata) + m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata) if m: - return Util.expand_shouryaku(self.yomikata) - m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", self.yomikata) + return Util.expand_shouryaku(yomikata) + m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata) if m: yomikatas = [m.group(1)] alts = m.group(2).split("/") diff --git a/scraper.py b/scraper.py index 42cd4b8..a8de968 100644 --- a/scraper.py +++ b/scraper.py @@ -14,8 +14,8 @@ from datetime import datetime class Scraper(): def __init__(self): - self.netloc_re = \ - re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$") + pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$" + self.netloc_re = re.compile(pattern) self.__set_session() def scrape(self, urlstring): @@ -80,20 +80,20 @@ class Scraper(): file_contents = None return file_contents - def __get(self, url): + def __get(self, urlstring): delay = 10 time.sleep(delay) now = datetime.now().strftime("%H:%M:%S") - print(f"{now} scraping {url.geturl()} ...", end='') + print(f"{now} scraping {urlstring} ...", end='') try: - response = self.session.get(url, timeout=10) + response = self.session.get(urlstring, timeout=10) print("OK") return response.text except Exception: print("failed") print("resetting session and trying again") self.__set_session() - response = self.session.get(url, timeout=10) + response = self.session.get(urlstring, timeout=10) return response.text