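"""Phrase entries for the smk8 target.

Expressions are parsed from the entry page markup and readings are looked up
from a separate phrase-readings table; both may use △ notation to describe
alternative forms.
"""
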
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
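    """A single phrase entry whose readings come from the phrase-readings
    table loaded for the target dictionary, keyed by entry ID."""
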
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrase entries do not contain these tags
        return []

    def _get_headwords(self):
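        """Map each reading to the expressions that share it, e.g.
        {"<reading>": ["<expression>", ...], ...}."""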
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
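        """Collect every surface form of the phrase by expanding the
        標準表記 (standard orthography) text's △ alternatives and abbreviations."""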
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
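        """Expand the recorded reading for this entry ID in the same way as
        the expressions, so the two lists stay index-aligned."""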
        text = self.__phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by △ notation."""
    match = re.search(r"△([^（]+)（([^（]+)）", text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    for alt_part in match.group(2).split("・"):
        alt_parts.append(alt_part)
    alts = []
    for alt_part in alt_parts:
        alt_exp = re.sub(r"△[^（]+（[^（]+）", alt_part, text)
        alts.append(alt_exp)
    return alts
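
# Illustrative only (hypothetical input, assuming the full-width （）・
# punctuation used in the patterns above):
#   parse_phrase("△A（B・C）X") -> ["AX", "BX", "CX"]
#   parse_phrase("X")           -> ["X"]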