from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page


class _BaseSankoku8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def get_global_identifier(self):
        # Parent IDs are zero-padded decimal; subentry IDs are
        # zero-padded uppercase hexadecimal.
        parent_part = format(self.entry_id[0], '06')
        child_part = format(self.entry_id[1], '04X')
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            # Readings and expressions pair up one-to-one.
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def __decompose_subentries(self, page):
        # Split child entries (子項目) and phrase entries (句項目) out
        # into their own Entry objects and remove them from the page.
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Sankoku8ChildEntry, ["子項目"], self.children],
            [Sankoku8PhraseEntry, ["句項目"], self.phrases],
        ]
        for subentry_class, tags, subentry_list in subentry_parameters:
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
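
# Illustrative note (an assumption, not from the original source): entry
# IDs are (parent, child) tuples. Assuming target.value is "sankoku8", an
# entry_id of (4022, 3) would yield the global identifier
# "@sankoku8-004022-0003" via get_global_identifier() above.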
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク",
            "表外音訓マーク", "アクセント分節", "活用分節",
            "ルビG", "分書",
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)


class Sankoku8Entry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)


class Sankoku8ChildEntry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"


class Sankoku8PhraseEntry(_BaseSankoku8Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings


def parse_hyouki_soup(soup, base_exps):
    # Characters that appear in the headword markup for display
    # purposes only and are not part of any expression.
    omitted_characters = [
        "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            # Alternation group: fork the expression list once per
            # alternative spelling.
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            # Supplementary notation: both the target form and the
            # supplementary form count as valid spellings.
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            # Omittable segment: keep both the bare expression and
            # the expression with the segment appended.
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps
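
# A minimal sketch (not in the original module) of what parse_hyouki_soup
# produces; the sample markup is invented for illustration. Given a node
# along the lines of
#   <表記>米<言換G>〈<言換>国</言換>/<言換>國</言換>〉</言換G></表記>
# calling parse_hyouki_soup(node, [""]) would return ["米国", "米國"]:
# the plain text is shared by every expression, the 言換G group forks one
# expression per alternative, and display-only characters such as 〈, /,
# and 〉 are dropped.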

def parse_hyouki_pattern(pattern):
    # Convert a bracketed reading pattern into the same XML markup that
    # parse_hyouki_soup understands, then parse it. The closing tags
    # below pair with the opening tags emitted for the matching
    # opening-bracket characters.
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容></表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換></言換G>",
        "⦅": "<補足表記><表記対象>",
        "＼": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容></表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps
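
# Hedged usage sketch (not part of the original file; the sample pattern
# is invented, and running it assumes the bot package is importable).
# A 補足表記 pattern such as "あい{だ・間}" expands to both spellings of
# the trailing segment:
if __name__ == "__main__":
    print(parse_hyouki_pattern("あい{だ・間}"))  # ['あいだ', 'あい間']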