From 08b180f4426b94e9c0b71185f280894b41000396 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Sun, 23 Apr 2023 00:17:42 -0500 Subject: [PATCH] Redesign Jitenon glossaries for yomichan --- README.md | 35 ------------------ bot/entries/jitenon.py | 11 ++++++ bot/yomichan/glossary/gloss.py | 8 +--- bot/yomichan/glossary/jitenon.py | 63 ++++++++++++++++++++++++++++++-- bot/yomichan/terms/jitenon.py | 2 +- bot/yomichan/terms/terminator.py | 2 +- jitenbot.py | 30 ++++++++++----- 7 files changed, 95 insertions(+), 56 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 5fddea1..0000000 --- a/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# jitenbot -Jitenbot is a program for scraping Japanese dictionary websites and converting the scraped data into structured dictionary files. - -### Target Websites - -* [四字熟語辞典オンライン](https://yoji.jitenon.jp/) -* [故事・ことわざ・慣用句オンライン](https://kotowaza.jitenon.jp/) - -### Export Formats - -* [Yomichan](https://github.com/foosoft/yomichan) - -# Usage -Add your desired HTTP request headers to [config.json](https://github.com/stephenmk/jitenbot/blob/main/config.json) -and ensure that all [requirements](https://github.com/stephenmk/jitenbot/blob/main/requirements.txt) -are installed. - -``` -jitenbot [-h] {all,jitenon-yoji,jitenon-kotowaza} - -positional arguments: - {all,jitenon-yoji,jitenon-kotowaza} - website to crawl - -options: - -h, --help show this help message and exit -``` - -Scraped webpages are written to a `webcache` directory. Each page may be as large as 100 KiB, -and a single dictionary may include thousands of pages. Ensure that adequate disk space is available. - -Jitenbot will pause for at least 10 seconds between each web request. Depending upon the size of -the target dictionary, it make take hours or days to finish scraping. - -Exported dictionary files will be saved in an `output` directory. diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index f9f2a1a..24f2ad2 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -36,6 +36,17 @@ class JitenonEntry: self._set_headwords() return self._headwords + def get_first_expression(self): + headwords = self.get_headwords() + expressions = next(iter(headwords.values())) + expression = expressions[0] + return expression + + def get_first_reading(self): + headwords = self.get_headwords() + reading = next(iter(headwords.keys())) + return reading + def _set_headwords(self): headwords = {} for yomikata in self.__yomikatas(): diff --git a/bot/yomichan/glossary/gloss.py b/bot/yomichan/glossary/gloss.py index 1a0e743..ddea607 100644 --- a/bot/yomichan/glossary/gloss.py +++ b/bot/yomichan/glossary/gloss.py @@ -15,7 +15,7 @@ def __get_markup_structure(soup): content = [] for child in soup.children: if child.name is None: - text = __clean(child.text) + text = child.text.strip() if text != "": content.append(text) else: @@ -35,12 +35,6 @@ def __get_markup_structure(soup): return node -def __clean(text): - text = text.replace("/", "/") - text = text.strip() - return text - - def __get_attributes(attrs): attributes = {} if "href" in attrs: diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py index 3407d01..1b58da5 100644 --- a/bot/yomichan/glossary/jitenon.py +++ b/bot/yomichan/glossary/jitenon.py @@ -6,6 +6,31 @@ from bot.yomichan.glossary.gloss import make_gloss def make_glossary(entry): soup = BeautifulSoup(entry.markup, "html5lib") + __replace_punctuation(soup) + __add_internal_links(soup) + __convert_paragraphs(soup) + __style_table_headers(soup) + __unwrap_table_body(soup) + __decompose_table_rows(soup, entry) + __insert_headword_line(soup, entry) + gloss = make_gloss(soup.body) + glossary = [gloss] + return glossary + + +def __replace_punctuation(soup): + punctuation = { + "/": "/", + ",": "、", + } + for el in soup.find_all(string=True): + text = el.text + for old, new in punctuation.items(): + text = text.replace(old, new) + el.replace_with(text) + + +def __add_internal_links(soup): patterns = [ r"^(.+)([ぁ-ヿ、\s]+)$", r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" @@ -16,10 +41,42 @@ def make_glossary(entry): if m: a['href'] = f"?query={m.group(1)}&wildcards=off" break + + +def __convert_paragraphs(soup): for p in soup.find_all("p"): p.name = "span" + + +def __style_table_headers(soup): for th in soup.find_all("th"): th['style'] = "vertical-align: middle; text-align: center;" - gloss = make_gloss(soup.body) - glossary = [gloss] - return glossary + + +def __unwrap_table_body(soup): + if soup.find("tbody") is not None: + soup.tbody.unwrap() + + +def __decompose_table_rows(soup, entry): + for tr in soup.find_all("tr"): + if tr.find("th") is None: + continue + elif tr.th.text in ["四字熟語", "言葉"]: + tr.decompose() + elif tr.th.text == "読み方": + if re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): + tr.decompose() + elif tr.th.text == "意味": + imi = tr.td + imi.name = "div" + soup.body.insert(0, imi) + tr.decompose() + if soup.find("tr") is None: + soup.table.decompose() + + +def __insert_headword_line(soup, entry): + headword_line = soup.new_tag("span") + headword_line.string = f"{entry.get_first_reading()}【{entry.expression}】" + soup.body.insert(0, headword_line) diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py index ace79c8..56b8841 100644 --- a/bot/yomichan/terms/jitenon.py +++ b/bot/yomichan/terms/jitenon.py @@ -35,7 +35,7 @@ class JitenonYojiTerminator(JitenonTerminator): return "" def _term_tags(self, entry): - tags = entry.kankenkyuu.replace(" ", "").split("/") + tags = entry.kankenkyuu.split("/") return " ".join(tags) diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/terminator.py index b3b2fc0..af15c3f 100644 --- a/bot/yomichan/terms/terminator.py +++ b/bot/yomichan/terms/terminator.py @@ -12,7 +12,7 @@ class Terminator: score = -len(terms) glossary = self._glossary(entry) sequence = self._sequence(entry) - term_tags = "" + term_tags = self._term_tags(entry) term = [ expression, reading, definition_tags, inflection_rules, score, glossary, sequence, term_tags diff --git a/jitenbot.py b/jitenbot.py index 8496467..1cf84ce 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -22,19 +22,31 @@ from bot.crawlers import JitenonKotowazaCrawler crawlers = { - 'jitenon-yoji': JitenonYojiCrawler, - 'jitenon-kotowaza': JitenonKotowazaCrawler, + "jitenon-yoji": JitenonYojiCrawler, + "jitenon-kotowaza": JitenonKotowazaCrawler, } +def add_target_argument(parser): + target_argument_params = { + "choices": crawlers.keys(), + "help": "Dictionary to convert." + } + parser.add_argument("target", **target_argument_params) + + +def make_parser(): + argument_parser_params = { + "prog": "jitenbot", + "description": "Convert Japanese dictionary files to new formats.", + } + parser = argparse.ArgumentParser(**argument_parser_params) + return parser + + def parse_args(): - parser = argparse.ArgumentParser( - prog='jitenbot', - description='Convert Japanese dictionary files to new formats.') - parser.add_argument( - 'target', - choices=crawlers.keys(), - help='Dictionary to convert.') + parser = make_parser() + add_target_argument(parser) args = parser.parse_args() return args