jitenbot/bot/entries/smk8/phrase_entry.py
2023-07-26 19:28:50 -05:00

65 lines
2.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry
class PhraseEntry(BaseEntry):
    """Entry whose headwords are built from a phrase expression and readings.

    Expressions are read from the ``標準表記`` node of the entry's page;
    readings are looked up by entry id in the table returned by
    ``load_phrase_readings``.
    """

    def __init__(self, target, entry_id):
        """Initialise the base entry and load the phrase-reading table."""
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        """Return an empty list of part-of-speech tags."""
        # phrase entries do not contain these tags
        return []

    def _get_headwords(self):
        """Return a dict mapping each reading to its list of expressions.

        Expressions and readings are paired by position. An ``IndexError``
        is raised if there are fewer readings than expressions.
        """
        page = self.get_page_soup()
        found_expressions = self._find_expressions(page)
        found_readings = self._find_readings()
        grouped = {}
        for position, expression in enumerate(found_expressions):
            key = found_readings[position]
            grouped.setdefault(key, []).append(expression)
        return grouped

    def _find_expressions(self, soup):
        """Return every expression spelled out by the ``標準表記`` node.

        The soup is modified in place by ``_delete_unused_nodes`` and
        ``_fill_alts`` (helpers not defined in this class; presumably
        inherited from ``BaseEntry``) before the node text is read.
        """
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        raw_text = soup.find("標準表記").text
        cleaned = self._clean_expression(raw_text)
        return [
            expanded
            for variant in parse_phrase(cleaned)
            for expanded in Expressions.expand_abbreviation(variant)
        ]

    def _find_readings(self):
        """Return every reading recorded for this entry's id.

        A ``KeyError`` propagates if the reading table has no item for
        ``self.entry_id`` (assuming the table is a plain mapping).
        """
        reading_text = self.__phrase_readings[self.entry_id]
        return [
            expanded
            for variant in parse_phrase(reading_text)
            for expanded in Expressions.expand_abbreviation(variant)
        ]
def parse_phrase(text):
    """Return a list of strings described by △ notation.

    In this notation a ``△`` marks the start of a replaceable segment,
    which is followed by a fullwidth-parenthesised list of substitutes
    separated by ``・``.  For example ``目を△掛ける（配る・付ける）``
    describes ``目を掛ける``, ``目を配る`` and ``目を付ける``.

    Args:
        text: The phrase, with or without △ notation.

    Returns:
        One string per alternative, the marked segment first and then the
        substitutes in their written order.  If ``text`` contains no △
        group, the result is ``[text]``.
    """
    # NOTE(review): the delimiter characters were missing from the reviewed
    # copy of this file, which left the pattern with a single capture group
    # (so ``group(2)`` raised IndexError) and an empty ``split`` separator
    # (ValueError).  They are restored here as fullwidth parentheses and the
    # katakana middle dot -- confirm against the dictionary data.
    pattern = re.compile(r"△([^（]+)（([^）]+)）")
    match = pattern.search(text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    alt_parts.extend(match.group(2).split("・"))
    # A callable replacement inserts each part literally; a plain string
    # replacement would have backslashes and group references interpreted.
    # As before, every △ group in the text receives the same substitution.
    return [
        pattern.sub(lambda _match, part=alt_part: part, text)
        for alt_part in alt_parts
    ]