From 9b3fdc86d13eb8117e876df9d836dd22413e87a1 Mon Sep 17 00:00:00 2001 From: stephenmk Date: Wed, 26 Jul 2023 19:28:50 -0500 Subject: [PATCH] Reorganize file structure of entries modules --- bot/entries/{ => base}/entry.py | 0 bot/entries/{ => base}/expressions.py | 37 --- .../{jitenon.py => base/jitenon_entry.py} | 107 +------ bot/entries/base/sanseido_entry.py | 60 ++++ bot/entries/daijirin2.py | 231 ---------------- bot/entries/daijirin2/base_entry.py | 88 ++++++ bot/entries/daijirin2/child_entry.py | 9 + bot/entries/daijirin2/entry.py | 50 ++++ bot/entries/daijirin2/phrase_entry.py | 67 +++++ .../preprocess.py} | 0 bot/entries/factory.py | 21 +- bot/entries/jitenon_kokugo/entry.py | 45 +++ bot/entries/jitenon_kotowaza/entry.py | 35 +++ bot/entries/jitenon_yoji/entry.py | 27 ++ bot/entries/sankoku8.py | 260 ------------------ bot/entries/sankoku8/base_entry.py | 97 +++++++ bot/entries/sankoku8/child_entry.py | 8 + bot/entries/sankoku8/entry.py | 14 + bot/entries/sankoku8/parse.py | 65 +++++ bot/entries/sankoku8/phrase_entry.py | 37 +++ .../preprocess.py} | 0 bot/entries/smk8.py | 221 --------------- bot/entries/smk8/base_entry.py | 73 +++++ bot/entries/smk8/child_entry.py | 17 ++ bot/entries/smk8/entry.py | 26 ++ bot/entries/smk8/kanji_entry.py | 22 ++ bot/entries/smk8/phrase_entry.py | 64 +++++ .../preprocess.py} | 0 bot/yomichan/terms/daijirin2.py | 5 +- bot/yomichan/terms/sankoku8.py | 5 +- bot/yomichan/terms/smk8.py | 4 +- tests/test_daijirin_phrases.py | 21 ++ tests/test_expressions.py | 27 +- tests/test_sankoku_phrases.py | 14 +- tests/test_smk_phrases.py | 19 ++ 35 files changed, 863 insertions(+), 913 deletions(-) rename bot/entries/{ => base}/entry.py (100%) rename bot/entries/{ => base}/expressions.py (67%) rename bot/entries/{jitenon.py => base/jitenon_entry.py} (58%) create mode 100644 bot/entries/base/sanseido_entry.py delete mode 100644 bot/entries/daijirin2.py create mode 100644 bot/entries/daijirin2/base_entry.py create mode 100644 bot/entries/daijirin2/child_entry.py create mode 100644 bot/entries/daijirin2/entry.py create mode 100644 bot/entries/daijirin2/phrase_entry.py rename bot/entries/{daijirin2_preprocess.py => daijirin2/preprocess.py} (100%) create mode 100644 bot/entries/jitenon_kokugo/entry.py create mode 100644 bot/entries/jitenon_kotowaza/entry.py create mode 100644 bot/entries/jitenon_yoji/entry.py delete mode 100644 bot/entries/sankoku8.py create mode 100644 bot/entries/sankoku8/base_entry.py create mode 100644 bot/entries/sankoku8/child_entry.py create mode 100644 bot/entries/sankoku8/entry.py create mode 100644 bot/entries/sankoku8/parse.py create mode 100644 bot/entries/sankoku8/phrase_entry.py rename bot/entries/{sankoku8_preprocess.py => sankoku8/preprocess.py} (100%) delete mode 100644 bot/entries/smk8.py create mode 100644 bot/entries/smk8/base_entry.py create mode 100644 bot/entries/smk8/child_entry.py create mode 100644 bot/entries/smk8/entry.py create mode 100644 bot/entries/smk8/kanji_entry.py create mode 100644 bot/entries/smk8/phrase_entry.py rename bot/entries/{smk8_preprocess.py => smk8/preprocess.py} (100%) create mode 100644 tests/test_daijirin_phrases.py create mode 100644 tests/test_smk_phrases.py diff --git a/bot/entries/entry.py b/bot/entries/base/entry.py similarity index 100% rename from bot/entries/entry.py rename to bot/entries/base/entry.py diff --git a/bot/entries/expressions.py b/bot/entries/base/expressions.py similarity index 67% rename from bot/entries/expressions.py rename to bot/entries/base/expressions.py index 687a325..7d20891 100644 --- a/bot/entries/expressions.py +++ b/bot/entries/base/expressions.py @@ -85,40 +85,3 @@ def expand_abbreviation_list(expressions): if new_exp not in new_exps: new_exps.append(new_exp) return new_exps - - -def expand_smk_alternatives(text): - """Return a list of strings described by △ notation.""" - m = re.search(r"△([^(]+)(([^(]+))", text) - if m is None: - return [text] - alt_parts = [m.group(1)] - for alt_part in m.group(2).split("・"): - alt_parts.append(alt_part) - alts = [] - for alt_part in alt_parts: - alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text) - alts.append(alt_exp) - return alts - - -def expand_daijirin_alternatives(text): - """Return a list of strings described by = notation.""" - group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?" - groups = re.findall(group_pattern, text) - expressions = [""] - for group in groups: - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[0]) - expressions = new_exps.copy() - if group[1] == "": - continue - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[2]) - for expression in expressions: - for alt in group[3].split("・"): - new_exps.append(expression + alt) - expressions = new_exps.copy() - return expressions diff --git a/bot/entries/jitenon.py b/bot/entries/base/jitenon_entry.py similarity index 58% rename from bot/entries/jitenon.py rename to bot/entries/base/jitenon_entry.py index 65c4d2e..7af845b 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/base/jitenon_entry.py @@ -3,11 +3,11 @@ from abc import abstractmethod from datetime import datetime, date from bs4 import BeautifulSoup -from bot.entries.entry import Entry -import bot.entries.expressions as Expressions +from bot.entries.base.entry import Entry +import bot.entries.base.expressions as Expressions -class _JitenonEntry(Entry): +class JitenonEntry(Entry): def __init__(self, target, entry_id): super().__init__(target, entry_id) self.expression = "" @@ -140,104 +140,3 @@ class _JitenonEntry(Entry): elif isinstance(attr_val, list): colvals.append(";".join(attr_val)) return ",".join(colvals) - - -class JitenonYojiEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.origin = "" - self.kanken_level = "" - self.category = "" - self.related_expressions = [] - - def _get_column_map(self): - return { - "四字熟語": "expression", - "読み方": "yomikata", - "意味": "definition", - "異形": "other_forms", - "出典": "origin", - "漢検級": "kanken_level", - "場面用途": "category", - "類義語": "related_expressions", - } - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - - -class JitenonKotowazaEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.origin = "" - self.example = "" - self.related_expressions = [] - - def _get_column_map(self): - return { - "言葉": "expression", - "読み方": "yomikata", - "意味": "definition", - "異形": "other_forms", - "出典": "origin", - "例文": "example", - "類句": "related_expressions", - } - - def _get_headwords(self): - if self.expression == "金棒引き・鉄棒引き": - headwords = { - "かなぼうひき": ["金棒引き", "鉄棒引き"] - } - else: - headwords = super()._get_headwords() - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - - -class JitenonKokugoEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.example = "" - self.alt_expression = "" - self.antonym = "" - self.attachments = "" - self.compounds = "" - self.related_words = "" - - def _get_column_map(self): - return { - "言葉": "expression", - "読み方": "yomikata", - "意味": "definition", - "例文": "example", - "別表記": "alt_expression", - "対義語": "antonym", - "活用": "attachments", - "用例": "compounds", - "類語": "related_words", - } - - def _get_headwords(self): - headwords = {} - for reading in self.yomikata.split("・"): - if reading not in headwords: - headwords[reading] = [] - for expression in self.expression.split("・"): - headwords[reading].append(expression) - if self.alt_expression.strip() != "": - for expression in self.alt_expression.split("・"): - headwords[reading].append(expression) - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) diff --git a/bot/entries/base/sanseido_entry.py b/bot/entries/base/sanseido_entry.py new file mode 100644 index 0000000..4e1098d --- /dev/null +++ b/bot/entries/base/sanseido_entry.py @@ -0,0 +1,60 @@ +from abc import abstractmethod +from bs4 import BeautifulSoup + +from bot.entries.base.entry import Entry +import bot.entries.base.expressions as Expressions + + +class SanseidoEntry(Entry): + def set_page(self, page): + page = self._decompose_subentries(page) + self._page = page + + def get_page_soup(self): + soup = BeautifulSoup(self._page, "xml") + return soup + + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + + def _decompose_subentries(self, page): + soup = BeautifulSoup(page, features="xml") + for x in self._get_subentry_parameters(): + subentry_class, tags, subentry_list = x + for tag in tags: + tag_soup = soup.find(tag) + while tag_soup is not None: + tag_soup.name = "項目" + subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) + self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id + subentry = subentry_class(self.target, subentry_id) + page = tag_soup.decode() + subentry.set_page(page) + subentry_list.append(subentry) + tag_soup.decompose() + tag_soup = soup.find(tag) + return soup.decode() + + @abstractmethod + def _get_subentry_parameters(self): + pass + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) + Expressions.remove_iteration_mark(expressions) + Expressions.add_iteration_mark(expressions) + + @staticmethod + def id_string_to_entry_id(id_string): + parts = id_string.split("-") + if len(parts) == 1: + return (int(parts[0]), 0) + elif len(parts) == 2: + # subentries have a hexadecimal part + return (int(parts[0]), int(parts[1], 16)) + else: + raise Exception(f"Invalid entry ID: {id_string}") diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py deleted file mode 100644 index f7a629c..0000000 --- a/bot/entries/daijirin2.py +++ /dev/null @@ -1,231 +0,0 @@ -from bs4 import BeautifulSoup - -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.data import load_phrase_readings -from bot.data import load_daijirin2_kana_abbreviations -from bot.entries.entry import Entry -from bot.entries.daijirin2_preprocess import preprocess_page - - -class _BaseDaijirin2Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self._kana_abbreviations = load_daijirin2_kana_abbreviations() - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - for pos_group in soup.find_all("品詞G"): - if pos_group.parent.name == "大語義": - self._set_part_of_speech_tags(pos_group) - return self._part_of_speech_tags - - def _set_part_of_speech_tags(self, el): - pos_names = ["品詞", "品詞活用", "品詞行", "用法"] - for child in el.children: - if child.name is not None: - self._set_part_of_speech_tags(child) - continue - pos = str(child) - if el.name not in pos_names: - continue - elif pos in ["[", "]"]: - continue - elif pos in self._part_of_speech_tags: - continue - else: - self._part_of_speech_tags.append(pos) - - def _get_regular_headwords(self, soup): - self._fill_alts(soup) - reading = soup.find("見出仮名").text - expressions = [] - for el in soup.find_all("標準表記"): - expression = self._clean_expression(el.text) - if "—" in expression: - kana_abbrs = self._kana_abbreviations[self.entry_id] - for abbr in kana_abbrs: - expression = expression.replace("—", abbr, 1) - expressions.append(expression) - expressions = Expressions.expand_abbreviation_list(expressions) - if len(expressions) == 0: - expressions.append(reading) - headwords = {reading: expressions} - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Daijirin2ChildEntry, ["子項目"], self.children], - [Daijirin2PhraseEntry, ["句項目"], self.phrases], - ] - for x in subentry_parameters: - subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "漢字音logo", "活用分節", "連語句活用分節", "語構成", - "表外字マーク", "表外字マーク", "ルビG" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - @staticmethod - def _clean_expression(expression): - for x in ["〈", "〉", "《", "》", " "]: - expression = expression.replace(x, "") - return expression - - @staticmethod - def _fill_alts(soup): - for gaiji in soup.find_all(class_="gaiji"): - if gaiji.name == "img" and gaiji.has_attr("alt"): - gaiji.name = "span" - gaiji.string = gaiji.attrs["alt"] - - -class Daijirin2Entry(_BaseDaijirin2Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - if soup.find("漢字見出") is not None: - headwords = self._get_kanji_headwords(soup) - elif soup.find("略語G") is not None: - headwords = self._get_acronym_headwords(soup) - else: - headwords = self._get_regular_headwords(soup) - return headwords - - def _get_kanji_headwords(self, soup): - readings = [] - for el in soup.find_all("漢字音"): - hira = Expressions.kata_to_hira(el.text) - readings.append(hira) - if soup.find("漢字音") is None: - readings.append("") - expressions = [] - for el in soup.find_all("漢字見出"): - expressions.append(el.text) - headwords = {} - for reading in readings: - headwords[reading] = expressions - return headwords - - def _get_acronym_headwords(self, soup): - expressions = [] - for el in soup.find_all("略語"): - expression_parts = [] - for part in el.find_all(["欧字", "和字"]): - expression_parts.append(part.text) - expression = "".join(expression_parts) - expressions.append(expression) - headwords = {"": expressions} - return headwords - - -class Daijirin2ChildEntry(_BaseDaijirin2Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - headwords = self._get_regular_headwords(soup) - return headwords - - -class Daijirin2PhraseEntry(_BaseDaijirin2Entry): - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - headwords = {} - expressions = self._find_expressions(soup) - readings = self._find_readings() - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - self._delete_unused_nodes(soup) - text = soup.find("句表記").text - text = self._clean_expression(text) - alternatives = Expressions.expand_daijirin_alternatives(text) - expressions = [] - for alt in alternatives: - for exp in Expressions.expand_abbreviation(alt): - expressions.append(exp) - return expressions - - def _find_readings(self): - phrase_readings = load_phrase_readings(self.target) - text = phrase_readings[self.entry_id] - alternatives = Expressions.expand_daijirin_alternatives(text) - readings = [] - for alt in alternatives: - for reading in Expressions.expand_abbreviation(alt): - readings.append(reading) - return readings diff --git a/bot/entries/daijirin2/base_entry.py b/bot/entries/daijirin2/base_entry.py new file mode 100644 index 0000000..1113404 --- /dev/null +++ b/bot/entries/daijirin2/base_entry.py @@ -0,0 +1,88 @@ +import bot.soup as Soup +from bot.data import load_daijirin2_kana_abbreviations +from bot.entries.base.sanseido_entry import SanseidoEntry +import bot.entries.base.expressions as Expressions + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self._kana_abbreviations = load_daijirin2_kana_abbreviations() + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + for pos_group in soup.find_all("品詞G"): + if pos_group.parent.name == "大語義": + self._set_part_of_speech_tags(pos_group) + return self._part_of_speech_tags + + def _set_part_of_speech_tags(self, el): + pos_names = ["品詞", "品詞活用", "品詞行", "用法"] + for child in el.children: + if child.name is not None: + self._set_part_of_speech_tags(child) + continue + pos = str(child) + if el.name not in pos_names: + continue + elif pos in ["[", "]"]: + continue + elif pos in self._part_of_speech_tags: + continue + else: + self._part_of_speech_tags.append(pos) + + def _get_regular_headwords(self, soup): + self._fill_alts(soup) + reading = soup.find("見出仮名").text + expressions = [] + for el in soup.find_all("標準表記"): + expression = self._clean_expression(el.text) + if "—" in expression: + kana_abbrs = self._kana_abbreviations[self.entry_id] + for abbr in kana_abbrs: + expression = expression.replace("—", abbr, 1) + expressions.append(expression) + expressions = Expressions.expand_abbreviation_list(expressions) + if len(expressions) == 0: + expressions.append(reading) + headwords = {reading: expressions} + return headwords + + def _get_subentry_parameters(self): + from bot.entries.daijirin2.child_entry import ChildEntry + from bot.entries.daijirin2.phrase_entry import PhraseEntry + subentry_parameters = [ + [ChildEntry, ["子項目"], self.children], + [PhraseEntry, ["句項目"], self.phrases], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "漢字音logo", "活用分節", "連語句活用分節", "語構成", + "表外字マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _clean_expression(expression): + for x in ["〈", "〉", "《", "》", " "]: + expression = expression.replace(x, "") + return expression + + @staticmethod + def _fill_alts(soup): + for gaiji in soup.find_all(class_="gaiji"): + if gaiji.name == "img" and gaiji.has_attr("alt"): + gaiji.name = "span" + gaiji.string = gaiji.attrs["alt"] diff --git a/bot/entries/daijirin2/child_entry.py b/bot/entries/daijirin2/child_entry.py new file mode 100644 index 0000000..42685a0 --- /dev/null +++ b/bot/entries/daijirin2/child_entry.py @@ -0,0 +1,9 @@ +from bot.entries.daijirin2.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + headwords = self._get_regular_headwords(soup) + return headwords diff --git a/bot/entries/daijirin2/entry.py b/bot/entries/daijirin2/entry.py new file mode 100644 index 0000000..0b6970f --- /dev/null +++ b/bot/entries/daijirin2/entry.py @@ -0,0 +1,50 @@ +import bot.entries.base.expressions as Expressions +from bot.entries.daijirin2.base_entry import BaseEntry +from bot.entries.daijirin2.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + if soup.find("漢字見出") is not None: + headwords = self._get_kanji_headwords(soup) + elif soup.find("略語G") is not None: + headwords = self._get_acronym_headwords(soup) + else: + headwords = self._get_regular_headwords(soup) + return headwords + + def _get_kanji_headwords(self, soup): + readings = [] + for el in soup.find_all("漢字音"): + hira = Expressions.kata_to_hira(el.text) + readings.append(hira) + if soup.find("漢字音") is None: + readings.append("") + expressions = [] + for el in soup.find_all("漢字見出"): + expressions.append(el.text) + headwords = {} + for reading in readings: + headwords[reading] = expressions + return headwords + + def _get_acronym_headwords(self, soup): + expressions = [] + for el in soup.find_all("略語"): + expression_parts = [] + for part in el.find_all(["欧字", "和字"]): + expression_parts.append(part.text) + expression = "".join(expression_parts) + expressions.append(expression) + headwords = {"": expressions} + return headwords diff --git a/bot/entries/daijirin2/phrase_entry.py b/bot/entries/daijirin2/phrase_entry.py new file mode 100644 index 0000000..0470d7d --- /dev/null +++ b/bot/entries/daijirin2/phrase_entry.py @@ -0,0 +1,67 @@ +import re + +import bot.entries.base.expressions as Expressions +from bot.data import load_phrase_readings +from bot.entries.daijirin2.base_entry import BaseEntry + + +class PhraseEntry(BaseEntry): + def get_part_of_speech_tags(self): + # phrases do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + headwords = {} + expressions = self._find_expressions(soup) + readings = self._find_readings() + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + self._delete_unused_nodes(soup) + text = soup.find("句表記").text + text = self._clean_expression(text) + alternatives = parse_phrase(text) + expressions = [] + for alt in alternatives: + for exp in Expressions.expand_abbreviation(alt): + expressions.append(exp) + return expressions + + def _find_readings(self): + phrase_readings = load_phrase_readings(self.target) + text = phrase_readings[self.entry_id] + alternatives = parse_phrase(text) + readings = [] + for alt in alternatives: + for reading in Expressions.expand_abbreviation(alt): + readings.append(reading) + return readings + + +def parse_phrase(text): + """Return a list of strings described by = notation.""" + group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?" + groups = re.findall(group_pattern, text) + expressions = [""] + for group in groups: + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[0]) + expressions = new_exps.copy() + if group[1] == "": + continue + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[2]) + for expression in expressions: + for alt in group[3].split("・"): + new_exps.append(expression + alt) + expressions = new_exps.copy() + return expressions diff --git a/bot/entries/daijirin2_preprocess.py b/bot/entries/daijirin2/preprocess.py similarity index 100% rename from bot/entries/daijirin2_preprocess.py rename to bot/entries/daijirin2/preprocess.py diff --git a/bot/entries/factory.py b/bot/entries/factory.py index 162c102..594762f 100644 --- a/bot/entries/factory.py +++ b/bot/entries/factory.py @@ -1,20 +1,7 @@ -from bot.targets import Targets - -from bot.entries.jitenon import JitenonKokugoEntry -from bot.entries.jitenon import JitenonYojiEntry -from bot.entries.jitenon import JitenonKotowazaEntry -from bot.entries.smk8 import Smk8Entry -from bot.entries.daijirin2 import Daijirin2Entry -from bot.entries.sankoku8 import Sankoku8Entry +import importlib def new_entry(target, page_id): - entry_map = { - Targets.JITENON_KOKUGO: JitenonKokugoEntry, - Targets.JITENON_YOJI: JitenonYojiEntry, - Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry, - Targets.SMK8: Smk8Entry, - Targets.DAIJIRIN2: Daijirin2Entry, - Targets.SANKOKU8: Sankoku8Entry, - } - return entry_map[target](target, page_id) + module_path = f"bot.entries.{target.name.lower()}.entry" + module = importlib.import_module(module_path) + return module.Entry(target, page_id) diff --git a/bot/entries/jitenon_kokugo/entry.py b/bot/entries/jitenon_kokugo/entry.py new file mode 100644 index 0000000..523ac63 --- /dev/null +++ b/bot/entries/jitenon_kokugo/entry.py @@ -0,0 +1,45 @@ +from bot.entries.base.jitenon_entry import JitenonEntry +import bot.entries.base.expressions as Expressions + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.example = "" + self.alt_expression = "" + self.antonym = "" + self.attachments = "" + self.compounds = "" + self.related_words = "" + + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "例文": "example", + "別表記": "alt_expression", + "対義語": "antonym", + "活用": "attachments", + "用例": "compounds", + "類語": "related_words", + } + + def _get_headwords(self): + headwords = {} + for reading in self.yomikata.split("・"): + if reading not in headwords: + headwords[reading] = [] + for expression in self.expression.split("・"): + headwords[reading].append(expression) + if self.alt_expression.strip() != "": + for expression in self.alt_expression.split("・"): + headwords[reading].append(expression) + return headwords + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) + Expressions.remove_iteration_mark(expressions) + Expressions.add_iteration_mark(expressions) diff --git a/bot/entries/jitenon_kotowaza/entry.py b/bot/entries/jitenon_kotowaza/entry.py new file mode 100644 index 0000000..71dc35f --- /dev/null +++ b/bot/entries/jitenon_kotowaza/entry.py @@ -0,0 +1,35 @@ +from bot.entries.base.jitenon_entry import JitenonEntry +import bot.entries.base.expressions as Expressions + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.example = "" + self.related_expressions = [] + + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "例文": "example", + "類句": "related_expressions", + } + + def _get_headwords(self): + if self.expression == "金棒引き・鉄棒引き": + headwords = { + "かなぼうひき": ["金棒引き", "鉄棒引き"] + } + else: + headwords = super()._get_headwords() + return headwords + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) diff --git a/bot/entries/jitenon_yoji/entry.py b/bot/entries/jitenon_yoji/entry.py new file mode 100644 index 0000000..e0e8b13 --- /dev/null +++ b/bot/entries/jitenon_yoji/entry.py @@ -0,0 +1,27 @@ +import bot.entries.base.expressions as Expressions +from bot.entries.base.jitenon_entry import JitenonEntry + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.kanken_level = "" + self.category = "" + self.related_expressions = [] + + def _get_column_map(self): + return { + "四字熟語": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "漢検級": "kanken_level", + "場面用途": "category", + "類義語": "related_expressions", + } + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) diff --git a/bot/entries/sankoku8.py b/bot/entries/sankoku8.py deleted file mode 100644 index 9653f68..0000000 --- a/bot/entries/sankoku8.py +++ /dev/null @@ -1,260 +0,0 @@ -from bs4 import BeautifulSoup -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.entries.entry import Entry -from bot.data import load_phrase_readings -from bot.entries.sankoku8_preprocess import preprocess_page - - -class _BaseSankoku8Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self._hyouki_name = "表記" - self._midashi_name = None - self._midashi_kana_name = None - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - readings = self._find_readings(soup) - expressions = self._find_expressions(soup) - headwords = {} - for reading in readings: - headwords[reading] = [] - if len(readings) == 1: - reading = readings[0] - if soup.find(self._midashi_name).find(self._hyouki_name) is None: - headwords[reading].append(reading) - for exp in expressions: - if exp not in headwords[reading]: - headwords[reading].append(exp) - elif len(readings) > 1 and len(expressions) == 0: - for reading in readings: - headwords[reading].append(reading) - elif len(readings) > 1 and len(expressions) == 1: - if soup.find(self._midashi_name).find(self._hyouki_name) is None: - for reading in readings: - headwords[reading].append(reading) - expression = expressions[0] - for reading in readings: - if expression not in headwords[reading]: - headwords[reading].append(expression) - elif len(readings) > 1 and len(expressions) == len(readings): - if soup.find(self._midashi_name).find(self._hyouki_name) is None: - for reading in readings: - headwords[reading].append(reading) - for idx, reading in enumerate(readings): - exp = expressions[idx] - if exp not in headwords[reading]: - headwords[reading].append(exp) - else: - raise Exception() # shouldn't happen - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - for midashi in soup.find_all([self._midashi_name, "見出部要素"]): - pos_group = midashi.find("品詞G") - if pos_group is None: - continue - for tag in pos_group.find_all("a"): - if tag.text not in self._part_of_speech_tags: - self._part_of_speech_tags.append(tag.text) - return self._part_of_speech_tags - - def _find_expressions(self, soup): - expressions = [] - for hyouki in soup.find_all(self._hyouki_name): - for expression in parse_hyouki_soup(hyouki, [""]): - expressions.append(expression) - return expressions - - def _find_readings(self, soup): - midasi_kana = soup.find(self._midashi_kana_name) - readings = parse_hyouki_soup(midasi_kana, [""]) - return readings - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Sankoku8ChildEntry, ["子項目"], self.children], - [Sankoku8PhraseEntry, ["句項目"], self.phrases], - ] - for x in subentry_parameters: - subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク", - "アクセント分節", "活用分節", "ルビG", "分書" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - -class Sankoku8Entry(_BaseSankoku8Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - self._midashi_name = "見出部" - self._midashi_kana_name = "見出仮名" - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - -class Sankoku8ChildEntry(_BaseSankoku8Entry): - def __init__(self, target, page_id): - super().__init__(target, page_id) - self._midashi_name = "子見出部" - self._midashi_kana_name = "子見出仮名" - - -class Sankoku8PhraseEntry(_BaseSankoku8Entry): - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - expressions = self._find_expressions(soup) - readings = self._find_readings(soup) - headwords = {} - if len(expressions) != len(readings): - raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}") - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - phrase_soup = soup.find("句表記") - expressions = parse_hyouki_soup(phrase_soup, [""]) - return expressions - - def _find_readings(self, soup): - reading_patterns = load_phrase_readings(self.target) - reading_pattern = reading_patterns[self.entry_id] - readings = parse_hyouki_pattern(reading_pattern) - return readings - - -def parse_hyouki_soup(soup, base_exps): - omitted_characters = [ - "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…" - ] - exps = base_exps.copy() - for child in soup.children: - new_exps = [] - if child.name == "言換G": - for alt in child.find_all("言換"): - parts = parse_hyouki_soup(alt, [""]) - for exp in exps: - for part in parts: - new_exps.append(exp + part) - elif child.name == "補足表記": - alt1 = child.find("表記対象") - alt2 = child.find("表記内容G") - parts1 = parse_hyouki_soup(alt1, [""]) - parts2 = parse_hyouki_soup(alt2, [""]) - for exp in exps: - for part in parts1: - new_exps.append(exp + part) - for part in parts2: - new_exps.append(exp + part) - elif child.name == "省略": - parts = parse_hyouki_soup(child, [""]) - for exp in exps: - new_exps.append(exp) - for part in parts: - new_exps.append(exp + part) - elif child.name is not None: - new_exps = parse_hyouki_soup(child, exps) - else: - text = child.text - for char in omitted_characters: - text = text.replace(char, "") - for exp in exps: - new_exps.append(exp + text) - exps = new_exps.copy() - return exps - - -def parse_hyouki_pattern(pattern): - replacements = { - "(": "<省略>(", - ")": ")", - "{": "<補足表記><表記対象>", - "・": "<表記内容G>(<表記内容>", - "}": "", - "〈": "<言換G>〈<言換>", - "/": "/<言換>", - "〉": "", - "⦅": "<補足表記><表記対象>", - "\": "<表記内容G>⦅<表記内容>", - "⦆": "", - } - markup = f"{pattern}" - for key, val in replacements.items(): - markup = markup.replace(key, val) - soup = BeautifulSoup(markup, "xml") - hyouki_soup = soup.find("span") - exps = parse_hyouki_soup(hyouki_soup, [""]) - return exps diff --git a/bot/entries/sankoku8/base_entry.py b/bot/entries/sankoku8/base_entry.py new file mode 100644 index 0000000..93c0515 --- /dev/null +++ b/bot/entries/sankoku8/base_entry.py @@ -0,0 +1,97 @@ +import bot.soup as Soup +from bot.entries.base.sanseido_entry import SanseidoEntry +from bot.entries.sankoku8.parse import parse_hyouki_soup + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self._hyouki_name = "表記" + self._midashi_name = None + self._midashi_kana_name = None + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + readings = self._find_readings(soup) + expressions = self._find_expressions(soup) + headwords = {} + for reading in readings: + headwords[reading] = [] + if len(readings) == 1: + reading = readings[0] + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + headwords[reading].append(reading) + for exp in expressions: + if exp not in headwords[reading]: + headwords[reading].append(exp) + elif len(readings) > 1 and len(expressions) == 0: + for reading in readings: + headwords[reading].append(reading) + elif len(readings) > 1 and len(expressions) == 1: + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + for reading in readings: + headwords[reading].append(reading) + expression = expressions[0] + for reading in readings: + if expression not in headwords[reading]: + headwords[reading].append(expression) + elif len(readings) > 1 and len(expressions) == len(readings): + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + for reading in readings: + headwords[reading].append(reading) + for idx, reading in enumerate(readings): + exp = expressions[idx] + if exp not in headwords[reading]: + headwords[reading].append(exp) + else: + raise Exception() # shouldn't happen + return headwords + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + for midashi in soup.find_all([self._midashi_name, "見出部要素"]): + pos_group = midashi.find("品詞G") + if pos_group is None: + continue + for tag in pos_group.find_all("a"): + if tag.text not in self._part_of_speech_tags: + self._part_of_speech_tags.append(tag.text) + return self._part_of_speech_tags + + def _find_expressions(self, soup): + expressions = [] + for hyouki in soup.find_all(self._hyouki_name): + for expression in parse_hyouki_soup(hyouki, [""]): + expressions.append(expression) + return expressions + + def _find_readings(self, soup): + midasi_kana = soup.find(self._midashi_kana_name) + readings = parse_hyouki_soup(midasi_kana, [""]) + return readings + + def _get_subentry_parameters(self): + from bot.entries.sankoku8.child_entry import ChildEntry + from bot.entries.sankoku8.phrase_entry import PhraseEntry + subentry_parameters = [ + [ChildEntry, ["子項目"], self.children], + [PhraseEntry, ["句項目"], self.phrases], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク", + "アクセント分節", "活用分節", "ルビG", "分書" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) diff --git a/bot/entries/sankoku8/child_entry.py b/bot/entries/sankoku8/child_entry.py new file mode 100644 index 0000000..9f6b1c1 --- /dev/null +++ b/bot/entries/sankoku8/child_entry.py @@ -0,0 +1,8 @@ +from bot.entries.sankoku8.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def __init__(self, target, page_id): + super().__init__(target, page_id) + self._midashi_name = "子見出部" + self._midashi_kana_name = "子見出仮名" diff --git a/bot/entries/sankoku8/entry.py b/bot/entries/sankoku8/entry.py new file mode 100644 index 0000000..533ac66 --- /dev/null +++ b/bot/entries/sankoku8/entry.py @@ -0,0 +1,14 @@ +from bot.entries.sankoku8.base_entry import BaseEntry +from bot.entries.sankoku8.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + self._midashi_name = "見出部" + self._midashi_kana_name = "見出仮名" + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) diff --git a/bot/entries/sankoku8/parse.py b/bot/entries/sankoku8/parse.py new file mode 100644 index 0000000..a57574b --- /dev/null +++ b/bot/entries/sankoku8/parse.py @@ -0,0 +1,65 @@ +from bs4 import BeautifulSoup + + +def parse_hyouki_soup(soup, base_exps): + omitted_characters = [ + "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…" + ] + exps = base_exps.copy() + for child in soup.children: + new_exps = [] + if child.name == "言換G": + for alt in child.find_all("言換"): + parts = parse_hyouki_soup(alt, [""]) + for exp in exps: + for part in parts: + new_exps.append(exp + part) + elif child.name == "補足表記": + alt1 = child.find("表記対象") + alt2 = child.find("表記内容G") + parts1 = parse_hyouki_soup(alt1, [""]) + parts2 = parse_hyouki_soup(alt2, [""]) + for exp in exps: + for part in parts1: + new_exps.append(exp + part) + for part in parts2: + new_exps.append(exp + part) + elif child.name == "省略": + parts = parse_hyouki_soup(child, [""]) + for exp in exps: + new_exps.append(exp) + for part in parts: + new_exps.append(exp + part) + elif child.name is not None: + new_exps = parse_hyouki_soup(child, exps) + else: + text = child.text + for char in omitted_characters: + text = text.replace(char, "") + for exp in exps: + new_exps.append(exp + text) + exps = new_exps.copy() + return exps + + +def parse_hyouki_pattern(pattern): + replacements = { + "(": "<省略>(", + ")": ")", + "{": "<補足表記><表記対象>", + "・": "<表記内容G>(<表記内容>", + "}": "", + "〈": "<言換G>〈<言換>", + "/": "/<言換>", + "〉": "", + "⦅": "<補足表記><表記対象>", + "\": "<表記内容G>⦅<表記内容>", + "⦆": "", + } + markup = f"{pattern}" + for key, val in replacements.items(): + markup = markup.replace(key, val) + soup = BeautifulSoup(markup, "xml") + hyouki_soup = soup.find("span") + exps = parse_hyouki_soup(hyouki_soup, [""]) + return exps diff --git a/bot/entries/sankoku8/phrase_entry.py b/bot/entries/sankoku8/phrase_entry.py new file mode 100644 index 0000000..e5da208 --- /dev/null +++ b/bot/entries/sankoku8/phrase_entry.py @@ -0,0 +1,37 @@ +from bot.data import load_phrase_readings +from bot.entries.sankoku8.base_entry import BaseEntry +from bot.entries.sankoku8.parse import parse_hyouki_soup +from bot.entries.sankoku8.parse import parse_hyouki_pattern + + +class PhraseEntry(BaseEntry): + def get_part_of_speech_tags(self): + # phrases do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + expressions = self._find_expressions(soup) + readings = self._find_readings(soup) + headwords = {} + if len(expressions) != len(readings): + raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}") + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + phrase_soup = soup.find("句表記") + expressions = parse_hyouki_soup(phrase_soup, [""]) + return expressions + + def _find_readings(self, soup): + reading_patterns = load_phrase_readings(self.target) + reading_pattern = reading_patterns[self.entry_id] + readings = parse_hyouki_pattern(reading_pattern) + return readings diff --git a/bot/entries/sankoku8_preprocess.py b/bot/entries/sankoku8/preprocess.py similarity index 100% rename from bot/entries/sankoku8_preprocess.py rename to bot/entries/sankoku8/preprocess.py diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py deleted file mode 100644 index 2d43e4a..0000000 --- a/bot/entries/smk8.py +++ /dev/null @@ -1,221 +0,0 @@ -from bs4 import BeautifulSoup - -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.data import load_phrase_readings -from bot.entries.entry import Entry -from bot.entries.smk8_preprocess import preprocess_page - - -class _BaseSmk8Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self.kanjis = [] - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - headword_info = soup.find("見出要素") - if headword_info is None: - return self._part_of_speech_tags - for tag in headword_info.find_all("品詞M"): - if tag.text not in self._part_of_speech_tags: - self._part_of_speech_tags.append(tag.text) - return self._part_of_speech_tags - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def _find_reading(self, soup): - midasi_kana = soup.find("見出仮名") - reading = midasi_kana.text - for x in [" ", "・"]: - reading = reading.replace(x, "") - return reading - - def _find_expressions(self, soup): - clean_expressions = [] - for expression in soup.find_all("標準表記"): - clean_expression = self._clean_expression(expression.text) - clean_expressions.append(clean_expression) - expressions = Expressions.expand_abbreviation_list(clean_expressions) - return expressions - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Smk8ChildEntry, ["子項目F", "子項目"], self.children], - [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases], - [Smk8KanjiEntry, ["造語成分項目"], self.kanjis], - ] - for x in subentry_parameters: - subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - @staticmethod - def _clean_expression(expression): - for x in ["〈", "〉", "{", "}", "…", " "]: - expression = expression.replace(x, "") - return expression - - @staticmethod - def _fill_alts(soup): - for el in soup.find_all(["親見出仮名", "親見出表記"]): - el.string = el.attrs["alt"] - for gaiji in soup.find_all("外字"): - gaiji.string = gaiji.img.attrs["alt"] - - -class Smk8Entry(_BaseSmk8Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = self._find_reading(soup) - expressions = [] - if soup.find("見出部").find("標準表記") is None: - expressions.append(reading) - for expression in self._find_expressions(soup): - if expression not in expressions: - expressions.append(expression) - headwords = {reading: expressions} - return headwords - - -class Smk8ChildEntry(_BaseSmk8Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = self._find_reading(soup) - expressions = [] - if soup.find("子見出部").find("標準表記") is None: - expressions.append(reading) - for expression in self._find_expressions(soup): - if expression not in expressions: - expressions.append(expression) - headwords = {reading: expressions} - return headwords - - -class Smk8PhraseEntry(_BaseSmk8Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.__phrase_readings = load_phrase_readings(self.target) - - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - headwords = {} - expressions = self._find_expressions(soup) - readings = self._find_readings() - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - self._delete_unused_nodes(soup) - self._fill_alts(soup) - text = soup.find("標準表記").text - text = self._clean_expression(text) - alternatives = Expressions.expand_smk_alternatives(text) - expressions = [] - for alt in alternatives: - for exp in Expressions.expand_abbreviation(alt): - expressions.append(exp) - return expressions - - def _find_readings(self): - text = self.__phrase_readings[self.entry_id] - alternatives = Expressions.expand_smk_alternatives(text) - readings = [] - for alt in alternatives: - for reading in Expressions.expand_abbreviation(alt): - readings.append(reading) - return readings - - -class Smk8KanjiEntry(_BaseSmk8Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = self.__get_parent_reading() - expressions = self._find_expressions(soup) - headwords = {reading: expressions} - return headwords - - def __get_parent_reading(self): - parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] - parent = self.ID_TO_ENTRY[parent_id] - reading = parent.get_first_reading() - return reading diff --git a/bot/entries/smk8/base_entry.py b/bot/entries/smk8/base_entry.py new file mode 100644 index 0000000..7bf32c2 --- /dev/null +++ b/bot/entries/smk8/base_entry.py @@ -0,0 +1,73 @@ +import bot.soup as Soup +import bot.entries.base.expressions as Expressions +from bot.entries.base.sanseido_entry import SanseidoEntry + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self.kanjis = [] + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + headword_info = soup.find("見出要素") + if headword_info is None: + return self._part_of_speech_tags + for tag in headword_info.find_all("品詞M"): + if tag.text not in self._part_of_speech_tags: + self._part_of_speech_tags.append(tag.text) + return self._part_of_speech_tags + + def _find_reading(self, soup): + midasi_kana = soup.find("見出仮名") + reading = midasi_kana.text + for x in [" ", "・"]: + reading = reading.replace(x, "") + return reading + + def _find_expressions(self, soup): + clean_expressions = [] + for expression in soup.find_all("標準表記"): + clean_expression = self._clean_expression(expression.text) + clean_expressions.append(clean_expression) + expressions = Expressions.expand_abbreviation_list(clean_expressions) + return expressions + + def _get_subentry_parameters(self): + from bot.entries.smk8.child_entry import ChildEntry + from bot.entries.smk8.phrase_entry import PhraseEntry + from bot.entries.smk8.kanji_entry import KanjiEntry + subentry_parameters = [ + [ChildEntry, ["子項目F", "子項目"], self.children], + [PhraseEntry, ["句項目F", "句項目"], self.phrases], + [KanjiEntry, ["造語成分項目"], self.kanjis], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _clean_expression(expression): + for x in ["〈", "〉", "{", "}", "…", " "]: + expression = expression.replace(x, "") + return expression + + @staticmethod + def _fill_alts(soup): + for elm in soup.find_all(["親見出仮名", "親見出表記"]): + elm.string = elm.attrs["alt"] + for gaiji in soup.find_all("外字"): + gaiji.string = gaiji.img.attrs["alt"] diff --git a/bot/entries/smk8/child_entry.py b/bot/entries/smk8/child_entry.py new file mode 100644 index 0000000..0dbe375 --- /dev/null +++ b/bot/entries/smk8/child_entry.py @@ -0,0 +1,17 @@ +from bot.entries.smk8.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self._find_reading(soup) + expressions = [] + if soup.find("子見出部").find("標準表記") is None: + expressions.append(reading) + for expression in self._find_expressions(soup): + if expression not in expressions: + expressions.append(expression) + headwords = {reading: expressions} + return headwords diff --git a/bot/entries/smk8/entry.py b/bot/entries/smk8/entry.py new file mode 100644 index 0000000..4baed42 --- /dev/null +++ b/bot/entries/smk8/entry.py @@ -0,0 +1,26 @@ +from bot.entries.smk8.base_entry import BaseEntry +from bot.entries.smk8.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self._find_reading(soup) + expressions = [] + if soup.find("見出部").find("標準表記") is None: + expressions.append(reading) + for expression in self._find_expressions(soup): + if expression not in expressions: + expressions.append(expression) + headwords = {reading: expressions} + return headwords diff --git a/bot/entries/smk8/kanji_entry.py b/bot/entries/smk8/kanji_entry.py new file mode 100644 index 0000000..3e77faf --- /dev/null +++ b/bot/entries/smk8/kanji_entry.py @@ -0,0 +1,22 @@ +from bot.entries.smk8.base_entry import BaseEntry + + +class KanjiEntry(BaseEntry): + def get_part_of_speech_tags(self): + # kanji entries do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self.__get_parent_reading() + expressions = self._find_expressions(soup) + headwords = {reading: expressions} + return headwords + + def __get_parent_reading(self): + parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] + parent = self.ID_TO_ENTRY[parent_id] + reading = parent.get_first_reading() + return reading diff --git a/bot/entries/smk8/phrase_entry.py b/bot/entries/smk8/phrase_entry.py new file mode 100644 index 0000000..aac9b84 --- /dev/null +++ b/bot/entries/smk8/phrase_entry.py @@ -0,0 +1,64 @@ +import re + +import bot.entries.base.expressions as Expressions +from bot.data import load_phrase_readings +from bot.entries.smk8.base_entry import BaseEntry + + +class PhraseEntry(BaseEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.__phrase_readings = load_phrase_readings(self.target) + + def get_part_of_speech_tags(self): + # phrase entries do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + headwords = {} + expressions = self._find_expressions(soup) + readings = self._find_readings() + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + self._delete_unused_nodes(soup) + self._fill_alts(soup) + text = soup.find("標準表記").text + text = self._clean_expression(text) + alternatives = parse_phrase(text) + expressions = [] + for alt in alternatives: + for exp in Expressions.expand_abbreviation(alt): + expressions.append(exp) + return expressions + + def _find_readings(self): + text = self.__phrase_readings[self.entry_id] + alternatives = parse_phrase(text) + readings = [] + for alt in alternatives: + for reading in Expressions.expand_abbreviation(alt): + readings.append(reading) + return readings + + +def parse_phrase(text): + """Return a list of strings described by △ notation.""" + match = re.search(r"△([^(]+)(([^(]+))", text) + if match is None: + return [text] + alt_parts = [match.group(1)] + for alt_part in match.group(2).split("・"): + alt_parts.append(alt_part) + alts = [] + for alt_part in alt_parts: + alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text) + alts.append(alt_exp) + return alts diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8/preprocess.py similarity index 100% rename from bot/entries/smk8_preprocess.py rename to bot/entries/smk8/preprocess.py diff --git a/bot/yomichan/terms/daijirin2.py b/bot/yomichan/terms/daijirin2.py index 10aaa76..281fac4 100644 --- a/bot/yomichan/terms/daijirin2.py +++ b/bot/yomichan/terms/daijirin2.py @@ -1,4 +1,4 @@ -from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry +from bot.entries.daijirin2.phrase_entry import PhraseEntry from bot.yomichan.terms.terminator import Terminator from bot.yomichan.glossary.daijirin2 import make_glossary @@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules class Daijirin2Terminator(Terminator): - def __init__(self, target): - super().__init__(target) - def _definition_tags(self, entry): return "" diff --git a/bot/yomichan/terms/sankoku8.py b/bot/yomichan/terms/sankoku8.py index 613f3bb..cff264f 100644 --- a/bot/yomichan/terms/sankoku8.py +++ b/bot/yomichan/terms/sankoku8.py @@ -1,4 +1,4 @@ -from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry +from bot.entries.sankoku8.phrase_entry import PhraseEntry from bot.yomichan.terms.terminator import Terminator from bot.yomichan.glossary.sankoku8 import make_glossary @@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules class Sankoku8Terminator(Terminator): - def __init__(self, target): - super().__init__(target) - def _definition_tags(self, entry): return "" diff --git a/bot/yomichan/terms/smk8.py b/bot/yomichan/terms/smk8.py index d1e3ca7..766f4a0 100644 --- a/bot/yomichan/terms/smk8.py +++ b/bot/yomichan/terms/smk8.py @@ -1,5 +1,5 @@ -from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry -from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry +from bot.entries.smk8.kanji_entry import KanjiEntry +from bot.entries.smk8.phrase_entry import PhraseEntry from bot.yomichan.terms.terminator import Terminator from bot.yomichan.glossary.smk8 import make_glossary diff --git a/tests/test_daijirin_phrases.py b/tests/test_daijirin_phrases.py new file mode 100644 index 0000000..3ab02dd --- /dev/null +++ b/tests/test_daijirin_phrases.py @@ -0,0 +1,21 @@ +import unittest +from bot.entries.daijirin2.phrase_entry import parse_phrase + + +class TestDaijirin2PhraseParse(unittest.TestCase): + def test1(self): + text = "同じ穴の=狢(=狐・狸)" + exps = parse_phrase(text) + self.assertEqual(len(exps), 3) + self.assertIn("同じ穴の狢", exps) + self.assertIn("同じ穴の狐", exps) + self.assertIn("同じ穴の狸", exps) + + def test2(self): + text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" + exps = parse_phrase(text) + self.assertEqual(len(exps), 4) + self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps) diff --git a/tests/test_expressions.py b/tests/test_expressions.py index b2ebc26..5d90ce1 100644 --- a/tests/test_expressions.py +++ b/tests/test_expressions.py @@ -1,5 +1,5 @@ import unittest -import bot.entries.expressions as Expressions +import bot.entries.base.expressions as Expressions class TestExpressions(unittest.TestCase): @@ -69,28 +69,3 @@ class TestExpressions(unittest.TestCase): self.assertIn("有合わせ", abbrs) self.assertIn("有り合せ", abbrs) self.assertIn("有合せ", abbrs) - - def test_smk_expand_alternatives(self): - text = "△金(時間・暇)に飽かして" - exps = Expressions.expand_smk_alternatives(text) - self.assertEqual(len(exps), 3) - self.assertIn("金に飽かして", exps) - self.assertIn("時間に飽かして", exps) - self.assertIn("暇に飽かして", exps) - - def test_daijirin_expand_alternatives(self): - text = "同じ穴の=狢(=狐・狸)" - exps = Expressions.expand_daijirin_alternatives(text) - self.assertEqual(len(exps), 3) - self.assertIn("同じ穴の狢", exps) - self.assertIn("同じ穴の狐", exps) - self.assertIn("同じ穴の狸", exps) - - def test_daijirin_expand_alternatives2(self): - text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" - exps = Expressions.expand_daijirin_alternatives(text) - self.assertEqual(len(exps), 4) - self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps) - self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps) - self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps) - self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps) diff --git a/tests/test_sankoku_phrases.py b/tests/test_sankoku_phrases.py index 7faf289..c3894e9 100644 --- a/tests/test_sankoku_phrases.py +++ b/tests/test_sankoku_phrases.py @@ -1,16 +1,16 @@ import unittest -from bot.entries.sankoku8 import parse_hyouki_pattern +from bot.entries.sankoku8.parse import parse_hyouki_pattern -class TestSankokuPhrases(unittest.TestCase): - def test_sankoku_phrases1(self): +class TestSankoku8PhraseParse(unittest.TestCase): + def test1(self): pattern = '耳にたこ(ができる)' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 2) self.assertIn("耳にたこ", exps) self.assertIn("耳にたこができる", exps) - def test_sankoku_phrases2(self): + def test2(self): pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 4) @@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase): self.assertIn("一斑をもって全豹を卜す", exps) self.assertIn("一斑をもって全豹を推す", exps) - def test_sankoku_phrases3(self): + def test3(self): pattern = '{かじ・舵}を切る' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 2) self.assertIn("かじを切る", exps) self.assertIn("舵を切る", exps) - def test_sankoku_phrases4(self): + def test4(self): pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 6) @@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase): self.assertIn("重箱の隅をようじでほじくる", exps) self.assertIn("重箱の隅を楊枝でほじくる", exps) - def test_sankoku_phrases5(self): + def test5(self): pattern = '群盲象を〈{な・撫}でる/評する〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 3) diff --git a/tests/test_smk_phrases.py b/tests/test_smk_phrases.py new file mode 100644 index 0000000..e5ce231 --- /dev/null +++ b/tests/test_smk_phrases.py @@ -0,0 +1,19 @@ +import unittest +from bot.entries.smk8.phrase_entry import parse_phrase + + +class TestSmk8PhraseParse(unittest.TestCase): + def test1(self): + text = "目と鼻の△先(間)" + exps = parse_phrase(text) + self.assertEqual(len(exps), 2) + self.assertIn("目と鼻の先", exps) + self.assertIn("目と鼻の間", exps) + + def test2(self): + text = "△金(時間・暇)に飽かして" + exps = parse_phrase(text) + self.assertEqual(len(exps), 3) + self.assertIn("金に飽かして", exps) + self.assertIn("時間に飽かして", exps) + self.assertIn("暇に飽かして", exps)