74 lines
2.7 KiB
Python
74 lines
2.7 KiB
Python
|
import bot.soup as Soup
|
|||
|
import bot.entries.base.expressions as Expressions
|
|||
|
from bot.entries.base.sanseido_entry import SanseidoEntry
|
|||
|
|
|||
|
|
|||
|
class BaseEntry(SanseidoEntry):
    """Common base for entries parsed from this dictionary's markup.

    Holds the lists of sub-entries (children, phrases, kanjis) and the
    helpers used to pull readings, expressions and part-of-speech tags
    out of an entry's soup.
    """

    def __init__(self, target, entry_id):
        """Initialise the parent entry and start with empty sub-entry lists."""
        super().__init__(target, entry_id)
        self.children, self.phrases, self.kanjis = [], [], []

    def get_part_of_speech_tags(self):
        """Return the de-duplicated part-of-speech tags of this entry.

        The result is cached on ``self._part_of_speech_tags``; later calls
        return the cached list without touching the soup again. Tags are
        read from the "品詞M" elements inside the first "見出要素" element,
        in document order. An entry without "見出要素" yields an empty list.
        """
        cached = self._part_of_speech_tags
        if cached is not None:
            return cached

        # Assign the cache before parsing, exactly as the lookup below
        # expects: the same list object is filled in place afterwards.
        self._part_of_speech_tags = []
        collected = self._part_of_speech_tags

        headword_info = self.get_page_soup().find("見出要素")
        if headword_info is not None:
            for pos_node in headword_info.find_all("品詞M"):
                if pos_node.text not in collected:
                    collected.append(pos_node.text)
        return collected

    def _find_reading(self, soup):
        """Return the text of the first "見出仮名" element, stripped of
        the separator characters " " and "・".

        Raises AttributeError if the soup has no such element.
        """
        kana_text = soup.find("見出仮名").text
        return kana_text.replace(" ", "").replace("・", "")

    def _find_expressions(self, soup):
        """Collect the cleaned text of every "標準表記" element and return
        the list produced by ``Expressions.expand_abbreviation_list``."""
        cleaned = [
            self._clean_expression(node.text)
            for node in soup.find_all("標準表記")
        ]
        return Expressions.expand_abbreviation_list(cleaned)

    def _get_subentry_parameters(self):
        """Return ``[entry class, tag names, destination list]`` triples,
        one per kind of sub-entry."""
        # Imported here rather than at module level (presumably to avoid
        # a circular import with the sub-entry modules -- TODO confirm).
        from bot.entries.smk8.child_entry import ChildEntry
        from bot.entries.smk8.phrase_entry import PhraseEntry
        from bot.entries.smk8.kanji_entry import KanjiEntry

        return [
            [ChildEntry, ["子項目F", "子項目"], self.children],
            [PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [KanjiEntry, ["造語成分項目"], self.kanjis],
        ]

    @staticmethod
    def _delete_unused_nodes(soup):
        """Strip markup elements that show up on the entry's headword
        line but do not belong to the headword itself."""
        for tag_name in ("表音表記", "表外音訓マーク", "表外字マーク", "ルビG"):
            Soup.delete_soup_nodes(soup, tag_name)

    @staticmethod
    def _clean_expression(expression):
        """Return ``expression`` with bracket, ellipsis and space
        characters removed."""
        cleaned = expression
        for unwanted in ("〈", "〉", "{", "}", "…", " "):
            cleaned = cleaned.replace(unwanted, "")
        return cleaned

    @staticmethod
    def _fill_alts(soup):
        """Replace the content of certain elements with their ``alt`` text.

        "親見出仮名" / "親見出表記" elements take their own ``alt``
        attribute; "外字" elements take the ``alt`` of their ``img`` child.
        """
        for parent_ref in soup.find_all(["親見出仮名", "親見出表記"]):
            parent_ref.string = parent_ref.attrs["alt"]
        for gaiji_node in soup.find_all("外字"):
            gaiji_node.string = gaiji_node.img.attrs["alt"]
|