diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py index cf93c93..1463442 100644 --- a/bot/entries/daijirin2.py +++ b/bot/entries/daijirin2.py @@ -1,7 +1,7 @@ import re from bs4 import BeautifulSoup -import bot.expressions as Expressions +import bot.entries.expressions as Expressions import bot.soup as Soup from bot.data import load_daijirin2_phrase_readings from bot.data import load_daijirin2_kana_abbreviations @@ -82,7 +82,7 @@ class _BaseDaijirin2Entry(Entry): def _set_variant_headwords(self): for expressions in self._headwords.values(): - Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) Expressions.add_iteration_mark(expressions) @@ -223,7 +223,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): self._delete_unused_nodes(soup) text = soup.find("句表記").text text = self._clean_expression(text) - alternatives = self.__expand_alternatives(text) + alternatives = Expressions.expand_daijirin_alternatives(text) expressions = [] for alt in alternatives: for exp in Expressions.expand_abbreviation(alt): @@ -232,41 +232,9 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry): def _find_readings(self): text = self.__phrase_readings[self.entry_id] - alternatives = self.__expand_alternatives(text) + alternatives = Expressions.expand_daijirin_alternatives(text) readings = [] for alt in alternatives: for reading in Expressions.expand_abbreviation(alt): readings.append(reading) return readings - - @staticmethod - def __expand_alternatives(expression): - """Return a list of strings described by = notation. - eg. "同じ穴の=狢(=狐・狸)" -> [ - "同じ穴の狢", "同じ穴の狐", "同じ穴の狸" - ] - eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [ - "聞くは一時の恥、聞かぬは末代の恥", - "聞くは一時の恥、聞かぬは一生の恥", - "聞くは一旦の恥、聞かぬは末代の恥", - "聞くは一旦の恥、聞かぬは一生の恥" - ] - """ - group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?" - groups = re.findall(group_pattern, expression) - expressions = [""] - for group in groups: - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[0]) - expressions = new_exps.copy() - if group[1] == "": - continue - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[2]) - for expression in expressions: - for alt in group[3].split("・"): - new_exps.append(expression + alt) - expressions = new_exps.copy() - return expressions diff --git a/bot/entries/entry.py b/bot/entries/entry.py index b6b0fbb..57316f6 100644 --- a/bot/entries/entry.py +++ b/bot/entries/entry.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from bot.data import load_variant_kanji class Entry(ABC): @@ -8,7 +7,6 @@ class Entry(ABC): self._page = None self._headwords = None self._part_of_speech_tags = None - self._variant_kanji = load_variant_kanji() @abstractmethod def set_page(self, page): diff --git a/bot/expressions.py b/bot/entries/expressions.py similarity index 61% rename from bot/expressions.py rename to bot/entries/expressions.py index 7e434cb..687a325 100644 --- a/bot/expressions.py +++ b/bot/entries/expressions.py @@ -1,9 +1,11 @@ import re +from bot.data import load_variant_kanji + __KATA_TO_HIRA_MAP = { i: i - 96 for i in [ - *range(0x30A1, 0x30F6), - *range(0x30FD, 0x30FE), + *range(0x30A1, 0x30F7), + *range(0x30FD, 0x30FF), ] } @@ -27,7 +29,8 @@ def add_fullwidth(expressions): expressions.append(new_exp) -def add_variant_kanji(expressions, variant_kanji): +def add_variant_kanji(expressions): + variant_kanji = load_variant_kanji() for old_kanji, new_kanji in variant_kanji.items(): new_exps = [] for expression in expressions: @@ -58,11 +61,7 @@ def add_iteration_mark(expressions): def expand_abbreviation(abbreviated_expression): - """Return a list of words described by a 省略 notation. - eg. "有(り)合(わ)せ" -> [ - "有り合わせ", "有合わせ", "有り合せ", "有合せ" - ] - """ + """Return a list of words described by a 省略 notation.""" groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression) expressions = [""] for group in groups: @@ -86,3 +85,40 @@ def expand_abbreviation_list(expressions): if new_exp not in new_exps: new_exps.append(new_exp) return new_exps + + +def expand_smk_alternatives(text): + """Return a list of strings described by △ notation.""" + m = re.search(r"△([^(]+)(([^(]+))", text) + if m is None: + return [text] + alt_parts = [m.group(1)] + for alt_part in m.group(2).split("・"): + alt_parts.append(alt_part) + alts = [] + for alt_part in alt_parts: + alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text) + alts.append(alt_exp) + return alts + + +def expand_daijirin_alternatives(text): + """Return a list of strings described by = notation.""" + group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?" + groups = re.findall(group_pattern, text) + expressions = [""] + for group in groups: + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[0]) + expressions = new_exps.copy() + if group[1] == "": + continue + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[2]) + for expression in expressions: + for alt in group[3].split("・"): + new_exps.append(expression + alt) + expressions = new_exps.copy() + return expressions diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py index e1e17b4..d97a41b 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/jitenon.py @@ -3,7 +3,7 @@ from datetime import datetime, date from bs4 import BeautifulSoup from bot.entries.entry import Entry -import bot.expressions as Expressions +import bot.entries.expressions as Expressions class _JitenonEntry(Entry): @@ -151,7 +151,7 @@ class JitenonYojiEntry(_JitenonEntry): def _set_variant_headwords(self): for expressions in self._headwords.values(): - Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_variant_kanji(expressions) class JitenonKotowazaEntry(_JitenonEntry): @@ -178,7 +178,7 @@ class JitenonKotowazaEntry(_JitenonEntry): def _set_variant_headwords(self): for expressions in self._headwords.values(): - Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) @@ -212,7 +212,7 @@ class JitenonKokugoEntry(_JitenonEntry): def _set_variant_headwords(self): for expressions in self._headwords.values(): - Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) Expressions.add_iteration_mark(expressions) diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py index 0880ada..11ef7e6 100644 --- a/bot/entries/smk8.py +++ b/bot/entries/smk8.py @@ -1,7 +1,7 @@ import re from bs4 import BeautifulSoup -import bot.expressions as Expressions +import bot.entries.expressions as Expressions import bot.soup as Soup from bot.data import load_smk8_phrase_readings from bot.entries.entry import Entry @@ -52,7 +52,7 @@ class _BaseSmk8Entry(Entry): def _set_variant_headwords(self): for expressions in self._headwords.values(): - Expressions.add_variant_kanji(expressions, self._variant_kanji) + Expressions.add_variant_kanji(expressions) Expressions.add_fullwidth(expressions) Expressions.remove_iteration_mark(expressions) Expressions.add_iteration_mark(expressions) @@ -188,7 +188,7 @@ class Smk8PhraseEntry(_BaseSmk8Entry): self._fill_alts(soup) text = soup.find("標準表記").text text = self._clean_expression(text) - alternatives = self.__expand_alternatives(text) + alternatives = Expressions.expand_smk_alternatives(text) expressions = [] for alt in alternatives: for exp in Expressions.expand_abbreviation(alt): @@ -197,32 +197,13 @@ class Smk8PhraseEntry(_BaseSmk8Entry): def _find_readings(self): text = self.__phrase_readings[self.entry_id] - alternatives = self.__expand_alternatives(text) + alternatives = Expressions.expand_smk_alternatives(text) readings = [] for alt in alternatives: for reading in Expressions.expand_abbreviation(alt): readings.append(reading) return readings - @staticmethod - def __expand_alternatives(expression): - """Return a list of strings described by △ notation - eg. "△金(時間・暇)に飽かして" -> [ - "金に飽かして", "時間に飽かして", "暇に飽かして" - ] - """ - m = re.search(r"△([^(]+)(([^(]+))", expression) - if not m: - return [expression] - alt_parts = [m.group(1)] - for alt_part in m.group(2).split("・"): - alt_parts.append(alt_part) - alts = [] - for alt_part in alt_parts: - alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, expression) - alts.append(alt_exp) - return alts - class Smk8KanjiEntry(_BaseSmk8Entry): def __init__(self, entry_id): diff --git a/tests/test_expressions.py b/tests/test_expressions.py new file mode 100644 index 0000000..b2ebc26 --- /dev/null +++ b/tests/test_expressions.py @@ -0,0 +1,96 @@ +import unittest +import bot.entries.expressions as Expressions + + +class TestExpressions(unittest.TestCase): + def test_kata_to_hira(self): + hira = "Abc5゠ぁゖずほヷヸヹヺ・ーゝゞヿ" + kata = "Abc5゠ァヶズホヷヸヹヺ・ーヽヾヿ" + transformed = Expressions.kata_to_hira(kata) + self.assertEqual(transformed, hira) + + def test_add_fullwidth(self): + exps = ["Abc059!~{}あア日本語Abc059!~{}"] + Expressions.add_fullwidth(exps) + self.assertEqual(len(exps), 2) + self.assertIn("Abc059!~{}あア日本語Abc059!~{}", exps) + self.assertIn("Abc059!~{}あア日本語Abc059!~{}", exps) + + def test_add_iteration_mark(self): + exps = ["禍禍しい", "凶々しい", "凶凶しい"] + Expressions.add_iteration_mark(exps) + self.assertEqual(len(exps), 4) + self.assertIn("禍々しい", exps) + self.assertIn("禍禍しい", exps) + self.assertIn("凶々しい", exps) + self.assertIn("凶凶しい", exps) + + def test_remove_iteration_mark(self): + exps = ["禍々しい", "凶々しい", "凶凶しい"] + Expressions.remove_iteration_mark(exps) + self.assertEqual(len(exps), 4) + self.assertIn("禍々しい", exps) + self.assertIn("禍禍しい", exps) + self.assertIn("凶々しい", exps) + self.assertIn("凶凶しい", exps) + + def test_add_variant_kanji(self): + exps = ["剝く", "掴む", "摑む"] + Expressions.add_variant_kanji(exps) + self.assertEqual(len(exps), 4) + self.assertIn("剥く", exps) + self.assertIn("剝く", exps) + self.assertIn("掴む", exps) + self.assertIn("摑む", exps) + + def test_add_variant_kanji2(self): + exps = ["剝摑"] + Expressions.add_variant_kanji(exps) + self.assertEqual(len(exps), 4) + self.assertIn("剝摑", exps) + self.assertIn("剝掴", exps) + self.assertIn("剥掴", exps) + self.assertIn("剥摑", exps) + + def test_expand_abbreviation(self): + text = "有(り)合(わ)せ" + abbrs = Expressions.expand_abbreviation(text) + self.assertEqual(len(abbrs), 4) + self.assertIn("有り合わせ", abbrs) + self.assertIn("有合わせ", abbrs) + self.assertIn("有り合せ", abbrs) + self.assertIn("有合せ", abbrs) + + def test_expand_abbreviation_list(self): + texts = ["有(り)合わせ", "有り合(わ)せ", "有合せ"] + abbrs = Expressions.expand_abbreviation_list(texts) + self.assertEqual(len(abbrs), 4) + self.assertIn("有り合わせ", abbrs) + self.assertIn("有合わせ", abbrs) + self.assertIn("有り合せ", abbrs) + self.assertIn("有合せ", abbrs) + + def test_smk_expand_alternatives(self): + text = "△金(時間・暇)に飽かして" + exps = Expressions.expand_smk_alternatives(text) + self.assertEqual(len(exps), 3) + self.assertIn("金に飽かして", exps) + self.assertIn("時間に飽かして", exps) + self.assertIn("暇に飽かして", exps) + + def test_daijirin_expand_alternatives(self): + text = "同じ穴の=狢(=狐・狸)" + exps = Expressions.expand_daijirin_alternatives(text) + self.assertEqual(len(exps), 3) + self.assertIn("同じ穴の狢", exps) + self.assertIn("同じ穴の狐", exps) + self.assertIn("同じ穴の狸", exps) + + def test_daijirin_expand_alternatives2(self): + text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" + exps = Expressions.expand_daijirin_alternatives(text) + self.assertEqual(len(exps), 4) + self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)