2023-05-01 22:31:28 +00:00
|
|
|
|
import re
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
2023-05-07 01:07:07 +00:00
|
|
|
|
import bot.entries.expressions as Expressions
|
2023-05-01 22:31:28 +00:00
|
|
|
|
import bot.soup as Soup
|
|
|
|
|
from bot.data import load_smk8_phrase_readings
|
|
|
|
|
from bot.entries.entry import Entry
|
|
|
|
|
from bot.entries.smk8_preprocess import preprocess_page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _BaseSmk8Entry(Entry):
|
|
|
|
|
ID_TO_ENTRY = {}
|
|
|
|
|
SUBENTRY_ID_TO_ENTRY_ID = {}
|
|
|
|
|
|
|
|
|
|
def __init__(self, entry_id):
|
|
|
|
|
super().__init__(entry_id)
|
|
|
|
|
if entry_id not in self.ID_TO_ENTRY:
|
|
|
|
|
self.ID_TO_ENTRY[entry_id] = self
|
|
|
|
|
else:
|
|
|
|
|
raise Exception(f"Duplicate entry ID: {entry_id}")
|
|
|
|
|
self.children = []
|
|
|
|
|
self.phrases = []
|
|
|
|
|
self.kanjis = []
|
|
|
|
|
|
|
|
|
|
def set_page(self, page):
|
|
|
|
|
page = self.__decompose_subentries(page)
|
|
|
|
|
self._page = page
|
|
|
|
|
|
|
|
|
|
def get_page_soup(self):
|
|
|
|
|
soup = BeautifulSoup(self._page, "xml")
|
|
|
|
|
return soup
|
|
|
|
|
|
|
|
|
|
def get_headwords(self):
|
|
|
|
|
if self._headwords is not None:
|
|
|
|
|
return self._headwords
|
|
|
|
|
self._set_headwords()
|
|
|
|
|
self._set_variant_headwords()
|
|
|
|
|
return self._headwords
|
|
|
|
|
|
|
|
|
|
def get_part_of_speech_tags(self):
|
|
|
|
|
if self._part_of_speech_tags is not None:
|
|
|
|
|
return self._part_of_speech_tags
|
|
|
|
|
self._part_of_speech_tags = []
|
|
|
|
|
soup = self.get_page_soup()
|
|
|
|
|
headword_info = soup.find("見出要素")
|
|
|
|
|
if headword_info is None:
|
|
|
|
|
return self._part_of_speech_tags
|
|
|
|
|
for tag in headword_info.find_all("品詞M"):
|
|
|
|
|
if tag.text not in self._part_of_speech_tags:
|
|
|
|
|
self._part_of_speech_tags.append(tag.text)
|
|
|
|
|
return self._part_of_speech_tags
|
|
|
|
|
|
|
|
|
|
def _set_variant_headwords(self):
|
|
|
|
|
for expressions in self._headwords.values():
|
2023-05-07 01:07:07 +00:00
|
|
|
|
Expressions.add_variant_kanji(expressions)
|
2023-05-01 22:31:28 +00:00
|
|
|
|
Expressions.add_fullwidth(expressions)
|
|
|
|
|
Expressions.remove_iteration_mark(expressions)
|
|
|
|
|
Expressions.add_iteration_mark(expressions)
|
|
|
|
|
|
|
|
|
|
def _find_reading(self, soup):
|
|
|
|
|
midasi_kana = soup.find("見出仮名")
|
|
|
|
|
reading = midasi_kana.text
|
|
|
|
|
for x in [" ", "・"]:
|
|
|
|
|
reading = reading.replace(x, "")
|
|
|
|
|
return reading
|
|
|
|
|
|
|
|
|
|
def _find_expressions(self, soup):
|
|
|
|
|
clean_expressions = []
|
|
|
|
|
for expression in soup.find_all("標準表記"):
|
|
|
|
|
clean_expression = self._clean_expression(expression.text)
|
|
|
|
|
clean_expressions.append(clean_expression)
|
|
|
|
|
expressions = Expressions.expand_abbreviation_list(clean_expressions)
|
|
|
|
|
return expressions
|
|
|
|
|
|
|
|
|
|
def __decompose_subentries(self, page):
|
|
|
|
|
soup = BeautifulSoup(page, features="xml")
|
|
|
|
|
subentry_parameters = [
|
|
|
|
|
[Smk8ChildEntry, ["子項目F", "子項目"], self.children],
|
|
|
|
|
[Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
|
|
|
|
|
[Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
|
|
|
|
|
]
|
|
|
|
|
for x in subentry_parameters:
|
|
|
|
|
subentry_class, tags, subentry_list = x
|
|
|
|
|
for tag in tags:
|
|
|
|
|
tag_soup = soup.find(tag)
|
|
|
|
|
while tag_soup is not None:
|
|
|
|
|
tag_soup.name = "項目"
|
|
|
|
|
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
|
|
|
|
|
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
|
|
|
|
|
subentry = subentry_class(subentry_id)
|
|
|
|
|
page = tag_soup.decode()
|
|
|
|
|
subentry.set_page(page)
|
|
|
|
|
subentry_list.append(subentry)
|
|
|
|
|
tag_soup.decompose()
|
|
|
|
|
tag_soup = soup.find(tag)
|
|
|
|
|
return soup.decode()
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def id_string_to_entry_id(id_string):
|
|
|
|
|
parts = id_string.split("-")
|
|
|
|
|
if len(parts) == 1:
|
|
|
|
|
return (int(parts[0]), 0)
|
|
|
|
|
elif len(parts) == 2:
|
|
|
|
|
# subentries have a hexadecimal part
|
|
|
|
|
return (int(parts[0]), int(parts[1], 16))
|
|
|
|
|
else:
|
|
|
|
|
raise Exception(f"Invalid entry ID: {id_string}")
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _clean_expression(expression):
|
|
|
|
|
for x in ["〈", "〉", "{", "}", "…", " "]:
|
|
|
|
|
expression = expression.replace(x, "")
|
|
|
|
|
return expression
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _fill_alts(soup):
|
|
|
|
|
for e in soup.find_all(["親見出仮名", "親見出表記"]):
|
|
|
|
|
e.string = e.attrs["alt"]
|
|
|
|
|
for gaiji in soup.find_all("外字"):
|
|
|
|
|
gaiji.string = gaiji.img.attrs["alt"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Smk8Entry(_BaseSmk8Entry):
|
|
|
|
|
def __init__(self, page_id):
|
|
|
|
|
entry_id = (page_id, 0)
|
|
|
|
|
super().__init__(entry_id)
|
|
|
|
|
|
|
|
|
|
def set_page(self, page):
|
|
|
|
|
page = preprocess_page(page)
|
|
|
|
|
super().set_page(page)
|
|
|
|
|
|
|
|
|
|
def _set_headwords(self):
|
|
|
|
|
soup = self.get_page_soup()
|
|
|
|
|
Soup.delete_soup_nodes(soup, "表音表記")
|
|
|
|
|
self._fill_alts(soup)
|
|
|
|
|
reading = self._find_reading(soup)
|
|
|
|
|
expressions = []
|
|
|
|
|
if soup.find("見出部").find("標準表記") is None:
|
|
|
|
|
expressions.append(reading)
|
|
|
|
|
for expression in self._find_expressions(soup):
|
|
|
|
|
if expression not in expressions:
|
|
|
|
|
expressions.append(expression)
|
|
|
|
|
self._headwords = {reading: expressions}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Smk8ChildEntry(_BaseSmk8Entry):
|
|
|
|
|
def __init__(self, entry_id):
|
|
|
|
|
super().__init__(entry_id)
|
|
|
|
|
|
|
|
|
|
def _set_headwords(self):
|
|
|
|
|
soup = self.get_page_soup()
|
|
|
|
|
Soup.delete_soup_nodes(soup, "表音表記")
|
|
|
|
|
self._fill_alts(soup)
|
|
|
|
|
reading = self._find_reading(soup)
|
|
|
|
|
expressions = []
|
|
|
|
|
if soup.find("子見出部").find("標準表記") is None:
|
|
|
|
|
expressions.append(reading)
|
|
|
|
|
for expression in self._find_expressions(soup):
|
|
|
|
|
if expression not in expressions:
|
|
|
|
|
expressions.append(expression)
|
|
|
|
|
self._headwords = {reading: expressions}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Smk8PhraseEntry(_BaseSmk8Entry):
|
|
|
|
|
def __init__(self, entry_id):
|
|
|
|
|
super().__init__(entry_id)
|
|
|
|
|
self.__phrase_readings = load_smk8_phrase_readings()
|
|
|
|
|
|
|
|
|
|
def get_part_of_speech_tags(self):
|
|
|
|
|
# phrases do not contain these tags
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
def _set_headwords(self):
|
|
|
|
|
soup = self.get_page_soup()
|
|
|
|
|
headwords = {}
|
|
|
|
|
expressions = self._find_expressions(soup)
|
|
|
|
|
readings = self._find_readings()
|
|
|
|
|
for idx, expression in enumerate(expressions):
|
|
|
|
|
reading = readings[idx]
|
|
|
|
|
if reading in headwords:
|
|
|
|
|
headwords[reading].append(expression)
|
|
|
|
|
else:
|
|
|
|
|
headwords[reading] = [expression]
|
|
|
|
|
self._headwords = headwords
|
|
|
|
|
|
|
|
|
|
def _find_expressions(self, soup):
|
|
|
|
|
Soup.delete_soup_nodes(soup, "ルビG")
|
|
|
|
|
self._fill_alts(soup)
|
|
|
|
|
text = soup.find("標準表記").text
|
|
|
|
|
text = self._clean_expression(text)
|
2023-05-07 01:07:07 +00:00
|
|
|
|
alternatives = Expressions.expand_smk_alternatives(text)
|
2023-05-01 22:31:28 +00:00
|
|
|
|
expressions = []
|
|
|
|
|
for alt in alternatives:
|
|
|
|
|
for exp in Expressions.expand_abbreviation(alt):
|
|
|
|
|
expressions.append(exp)
|
|
|
|
|
return expressions
|
|
|
|
|
|
|
|
|
|
def _find_readings(self):
|
|
|
|
|
text = self.__phrase_readings[self.entry_id]
|
2023-05-07 01:07:07 +00:00
|
|
|
|
alternatives = Expressions.expand_smk_alternatives(text)
|
2023-05-01 22:31:28 +00:00
|
|
|
|
readings = []
|
|
|
|
|
for alt in alternatives:
|
|
|
|
|
for reading in Expressions.expand_abbreviation(alt):
|
|
|
|
|
readings.append(reading)
|
|
|
|
|
return readings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Smk8KanjiEntry(_BaseSmk8Entry):
|
|
|
|
|
def __init__(self, entry_id):
|
|
|
|
|
super().__init__(entry_id)
|
|
|
|
|
|
|
|
|
|
def _set_headwords(self):
|
|
|
|
|
soup = self.get_page_soup()
|
|
|
|
|
self._fill_alts(soup)
|
|
|
|
|
reading = self.__get_parent_reading()
|
|
|
|
|
expressions = self._find_expressions(soup)
|
|
|
|
|
self._headwords = {reading: expressions}
|
|
|
|
|
|
|
|
|
|
def __get_parent_reading(self):
|
|
|
|
|
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
|
|
|
|
|
parent = self.ID_TO_ENTRY[parent_id]
|
|
|
|
|
reading = parent.get_first_reading()
|
|
|
|
|
return reading
|