# jitenbot/bot/entries/daijirin2/entry.py
#
# (51 lines, 1.7 KiB, Python)

import bot.entries.base.expressions as Expressions
from bot.entries.daijirin2.base_entry import BaseEntry
from bot.entries.daijirin2.preprocess import preprocess_page
class Entry(BaseEntry):
    """Top-level entry built from a single daijirin2 page.

    Headwords are returned as a dict that maps a reading string to the list
    of expressions associated with that reading. The empty string is used
    as the key when no reading is available.

    NOTE(review): ``get_page_soup``, ``_delete_unused_nodes`` and
    ``_get_regular_headwords`` are not defined in this class; presumably
    they are inherited from ``BaseEntry`` -- confirm.
    """

    def __init__(self, target, page_id):
        """Initialise the entry for ``page_id``.

        The entry ID handed to the base class is the tuple ``(page_id, 0)``.
        """
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        """Pass ``page`` through ``preprocess_page`` before storing it."""
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        """Return the headword dict, choosing the parser by page content.

        A page containing a ``漢字見出`` element is parsed with
        ``_get_kanji_headwords``; otherwise a page containing a ``略語G``
        element is parsed with ``_get_acronym_headwords``; every other page
        goes through ``_get_regular_headwords``.
        """
        soup = self.get_page_soup()
        # Must run before the lookups below: it is given the same soup
        # object that is subsequently searched.
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            return self._get_kanji_headwords(soup)
        if soup.find("略語G") is not None:
            return self._get_acronym_headwords(soup)
        return self._get_regular_headwords(soup)

    def _get_kanji_headwords(self, soup):
        """Return ``{reading: [expression, ...]}`` for a ``漢字見出`` page.

        Readings are the texts of all ``漢字音`` elements, each passed
        through ``Expressions.kata_to_hira`` (presumably katakana ->
        hiragana; verify against that helper). When the page has no
        ``漢字音`` element, the single reading ``""`` is used. Every reading
        is paired with the texts of all ``漢字見出`` elements.
        """
        readings = [
            Expressions.kata_to_hira(el.text)
            for el in soup.find_all("漢字音")
        ]
        if not readings:
            # No reading elements on the page: fall back to an empty key.
            readings.append("")
        expressions = [el.text for el in soup.find_all("漢字見出")]
        # Each reading gets its own copy of the list so that a later
        # in-place change to one reading's expressions cannot leak into
        # the lists stored under the other readings.
        return {reading: list(expressions) for reading in readings}

    def _get_acronym_headwords(self, soup):
        """Return ``{"": [expression, ...]}`` for a ``略語G`` page.

        One expression is produced per ``略語`` element by concatenating,
        in the order ``find_all`` yields them, the texts of its ``欧字``
        and ``和字`` descendants. Acronym entries carry no reading, so all
        expressions are stored under the empty-string key.
        """
        expressions = [
            "".join(part.text for part in el.find_all(["欧字", "和字"]))
            for el in soup.find_all("略語")
        ]
        return {"": expressions}