jitenbot/bot/entries/smk8/phrase_entry.py

import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrase entries do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by △ notation."""
    match = re.search(r"△([^（]+)（([^（]+)）", text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    for alt_part in match.group(2).split("・"):
        alt_parts.append(alt_part)
    alts = []
    for alt_part in alt_parts:
        alt_exp = re.sub(r"△[^（]+（[^（]+）", alt_part, text)
        alts.append(alt_exp)
    return alts