Add tests for Expressions functions

This commit is contained in:
stephenmk 2023-05-06 20:07:07 -05:00
parent 6dbc8b90ce
commit c737f10885
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
6 changed files with 152 additions and 73 deletions

View file

@ -1,7 +1,7 @@
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.expressions as Expressions import bot.entries.expressions as Expressions
import bot.soup as Soup import bot.soup as Soup
from bot.data import load_daijirin2_phrase_readings from bot.data import load_daijirin2_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations from bot.data import load_daijirin2_kana_abbreviations
@ -82,7 +82,7 @@ class _BaseDaijirin2Entry(Entry):
def _set_variant_headwords(self): def _set_variant_headwords(self):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions) Expressions.add_iteration_mark(expressions)
@ -223,7 +223,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
self._delete_unused_nodes(soup) self._delete_unused_nodes(soup)
text = soup.find("句表記").text text = soup.find("句表記").text
text = self._clean_expression(text) text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text) alternatives = Expressions.expand_daijirin_alternatives(text)
expressions = [] expressions = []
for alt in alternatives: for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt): for exp in Expressions.expand_abbreviation(alt):
@ -232,41 +232,9 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def _find_readings(self): def _find_readings(self):
text = self.__phrase_readings[self.entry_id] text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text) alternatives = Expressions.expand_daijirin_alternatives(text)
readings = [] readings = []
for alt in alternatives: for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt): for reading in Expressions.expand_abbreviation(alt):
readings.append(reading) readings.append(reading)
return readings return readings
@staticmethod
def __expand_alternatives(expression):
    """Return a list of strings described by the "=" alternative notation.

    A segment written as ``=BASE(=ALT1・ALT2)`` stands for BASE or any of
    the "・"-separated alternatives. Several such segments in one
    expression are combined with each other.

    eg. "同じ穴の=狢(=狐・狸)" -> [
        "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
    ]
    eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
        "聞くは一時の恥、聞かぬは末代の恥",
        "聞くは一旦の恥、聞かぬは末代の恥",
        "聞くは一時の恥、聞かぬは一生の恥",
        "聞くは一旦の恥、聞かぬは一生の恥"
    ]

    An expression without the notation is returned unchanged as a
    one-element list.
    """
    # Each match is: literal prefix, then an optional "=BASE(=ALTS)" segment.
    # Both half-width and full-width forms of "=", "(" and ")" are accepted.
    group_pattern = r"([^=＝]+)([=＝]([^(（]+)[(（][=＝]([^)）]+)[)）])?"
    expanded = [""]
    for prefix, alt_group, base, alts in re.findall(group_pattern, expression):
        expanded = [exp + prefix for exp in expanded]
        if alt_group == "":
            # Plain text only; nothing to branch on.
            continue
        # Base spellings come first, followed by every alternative spelling.
        expanded = (
            [exp + base for exp in expanded]
            + [exp + alt for exp in expanded for alt in alts.split("・")]
        )
    return expanded

View file

@ -1,5 +1,4 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from bot.data import load_variant_kanji
class Entry(ABC): class Entry(ABC):
@ -8,7 +7,6 @@ class Entry(ABC):
self._page = None self._page = None
self._headwords = None self._headwords = None
self._part_of_speech_tags = None self._part_of_speech_tags = None
self._variant_kanji = load_variant_kanji()
@abstractmethod @abstractmethod
def set_page(self, page): def set_page(self, page):

View file

@ -1,9 +1,11 @@
import re import re
from bot.data import load_variant_kanji
__KATA_TO_HIRA_MAP = { __KATA_TO_HIRA_MAP = {
i: i - 96 for i in [ i: i - 96 for i in [
*range(0x30A1, 0x30F6), *range(0x30A1, 0x30F7),
*range(0x30FD, 0x30FE), *range(0x30FD, 0x30FF),
] ]
} }
@ -27,7 +29,8 @@ def add_fullwidth(expressions):
expressions.append(new_exp) expressions.append(new_exp)
def add_variant_kanji(expressions, variant_kanji): def add_variant_kanji(expressions):
variant_kanji = load_variant_kanji()
for old_kanji, new_kanji in variant_kanji.items(): for old_kanji, new_kanji in variant_kanji.items():
new_exps = [] new_exps = []
for expression in expressions: for expression in expressions:
@ -58,11 +61,7 @@ def add_iteration_mark(expressions):
def expand_abbreviation(abbreviated_expression): def expand_abbreviation(abbreviated_expression):
"""Return a list of words described by a 省略 notation. """Return a list of words described by a 省略 notation."""
eg. "有(り)合(わ)せ" -> [
"有り合わせ", "有合わせ", "有り合せ", "有合せ"
]
"""
groups = re.findall(r"([^]*)(([^]+))?", abbreviated_expression) groups = re.findall(r"([^]*)(([^]+))?", abbreviated_expression)
expressions = [""] expressions = [""]
for group in groups: for group in groups:
@ -86,3 +85,40 @@ def expand_abbreviation_list(expressions):
if new_exp not in new_exps: if new_exp not in new_exps:
new_exps.append(new_exp) new_exps.append(new_exp)
return new_exps return new_exps
def expand_smk_alternatives(text):
    """Return a list of strings described by △ notation.

    A segment written as ``△BASE(ALT1・ALT2)`` stands for BASE or any of
    the "・"-separated alternatives.

    eg. "△金(時間・暇)に飽かして" -> [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]

    Text without the notation is returned unchanged as a one-element list.
    Only the first △ segment supplies the alternatives; every △ segment in
    the text is replaced by the chosen alternative.
    """
    # Both half-width and full-width parentheses are accepted.
    pattern = r"△([^(（]+)[(（]([^)）]+)[)）]"
    m = re.search(pattern, text)
    if m is None:
        return [text]
    alt_parts = [m.group(1), *m.group(2).split("・")]
    # A callable replacement inserts the alternative literally, so a
    # backslash in the text is never interpreted as a regex escape.
    return [
        re.sub(pattern, lambda _match, part=alt_part: part, text)
        for alt_part in alt_parts
    ]
def expand_daijirin_alternatives(text):
    """Return a list of strings described by the "=" alternative notation.

    A segment written as ``=BASE(=ALT1・ALT2)`` stands for BASE or any of
    the "・"-separated alternatives. Several such segments in one text
    are combined with each other.

    eg. "同じ穴の=狢(=狐・狸)" -> [
        "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
    ]

    Text without the notation is returned unchanged as a one-element list.
    """
    # Each match is: literal prefix, then an optional "=BASE(=ALTS)" segment.
    # Both half-width and full-width forms of "=", "(" and ")" are accepted.
    group_pattern = r"([^=＝]+)([=＝]([^(（]+)[(（][=＝]([^)）]+)[)）])?"
    expressions = [""]
    for prefix, alt_group, base, alts in re.findall(group_pattern, text):
        expressions = [exp + prefix for exp in expressions]
        if alt_group == "":
            # Plain text only; nothing to branch on.
            continue
        # Base spellings come first, followed by every alternative spelling.
        expressions = (
            [exp + base for exp in expressions]
            + [exp + alt for exp in expressions for alt in alts.split("・")]
        )
    return expressions

View file

@ -3,7 +3,7 @@ from datetime import datetime, date
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bot.entries.entry import Entry from bot.entries.entry import Entry
import bot.expressions as Expressions import bot.entries.expressions as Expressions
class _JitenonEntry(Entry): class _JitenonEntry(Entry):
@ -151,7 +151,7 @@ class JitenonYojiEntry(_JitenonEntry):
def _set_variant_headwords(self): def _set_variant_headwords(self):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry): class JitenonKotowazaEntry(_JitenonEntry):
@ -178,7 +178,7 @@ class JitenonKotowazaEntry(_JitenonEntry):
def _set_variant_headwords(self): def _set_variant_headwords(self):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
@ -212,7 +212,7 @@ class JitenonKokugoEntry(_JitenonEntry):
def _set_variant_headwords(self): def _set_variant_headwords(self):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions) Expressions.add_iteration_mark(expressions)

View file

@ -1,7 +1,7 @@
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.expressions as Expressions import bot.entries.expressions as Expressions
import bot.soup as Soup import bot.soup as Soup
from bot.data import load_smk8_phrase_readings from bot.data import load_smk8_phrase_readings
from bot.entries.entry import Entry from bot.entries.entry import Entry
@ -52,7 +52,7 @@ class _BaseSmk8Entry(Entry):
def _set_variant_headwords(self): def _set_variant_headwords(self):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions) Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions) Expressions.add_iteration_mark(expressions)
@ -188,7 +188,7 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
self._fill_alts(soup) self._fill_alts(soup)
text = soup.find("標準表記").text text = soup.find("標準表記").text
text = self._clean_expression(text) text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text) alternatives = Expressions.expand_smk_alternatives(text)
expressions = [] expressions = []
for alt in alternatives: for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt): for exp in Expressions.expand_abbreviation(alt):
@ -197,32 +197,13 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
def _find_readings(self): def _find_readings(self):
text = self.__phrase_readings[self.entry_id] text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text) alternatives = Expressions.expand_smk_alternatives(text)
readings = [] readings = []
for alt in alternatives: for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt): for reading in Expressions.expand_abbreviation(alt):
readings.append(reading) readings.append(reading)
return readings return readings
@staticmethod
def __expand_alternatives(expression):
    """Return a list of strings described by △ notation.

    A segment written as ``△BASE(ALT1・ALT2)`` stands for BASE or any of
    the "・"-separated alternatives.

    eg. "△金(時間・暇)に飽かして" -> [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]

    An expression without the notation is returned unchanged as a
    one-element list. Only the first △ segment supplies the alternatives;
    every △ segment in the expression is replaced by the chosen one.
    """
    # Both half-width and full-width parentheses are accepted.
    pattern = r"△([^(（]+)[(（]([^)）]+)[)）]"
    m = re.search(pattern, expression)
    if not m:
        return [expression]
    alt_parts = [m.group(1), *m.group(2).split("・")]
    # A callable replacement inserts the alternative literally, so a
    # backslash in the text is never interpreted as a regex escape.
    return [
        re.sub(pattern, lambda _match, part=alt_part: part, expression)
        for alt_part in alt_parts
    ]
class Smk8KanjiEntry(_BaseSmk8Entry): class Smk8KanjiEntry(_BaseSmk8Entry):
def __init__(self, entry_id): def __init__(self, entry_id):

96
tests/test_expressions.py Normal file
View file

@ -0,0 +1,96 @@
import unittest
import bot.entries.expressions as Expressions
class TestExpressions(unittest.TestCase):
    """Unit tests for the helper functions of the Expressions module."""

    def _check_members(self, actual, count, members):
        """Assert that *actual* has *count* items and contains each member."""
        self.assertEqual(len(actual), count)
        for member in members:
            self.assertIn(member, actual)

    def test_kata_to_hira(self):
        """kata_to_hira output must equal the expected hiragana string."""
        kata = "Abc5ァヶズホヷヸヹヺ・ーヽヾヿ"
        hira = "Abc5ぁゖずほヷヸヹヺ・ーゝゞヿ"
        self.assertEqual(Expressions.kata_to_hira(kata), hira)

    def test_add_fullwidth(self):
        """add_fullwidth is expected to grow the list in place to 2 items."""
        exps = ["Abc059!~{}あア日本語Abc059!~{}"]
        Expressions.add_fullwidth(exps)
        # NOTE(review): both expected members are the same literal here;
        # presumably one was meant to be the full-width form -- confirm.
        self._check_members(exps, 2, (
            "Abc059!~{}あア日本語Abc059!~{}",
            "Abc059!~{}あア日本語Abc059!~{}",
        ))

    def test_add_iteration_mark(self):
        """add_iteration_mark is expected to add the 々 spelling in place."""
        exps = ["禍禍しい", "凶々しい", "凶凶しい"]
        Expressions.add_iteration_mark(exps)
        self._check_members(exps, 4, (
            "禍々しい", "禍禍しい", "凶々しい", "凶凶しい",
        ))

    def test_remove_iteration_mark(self):
        """remove_iteration_mark is expected to add the repeated-kanji form."""
        exps = ["禍々しい", "凶々しい", "凶凶しい"]
        Expressions.remove_iteration_mark(exps)
        self._check_members(exps, 4, (
            "禍々しい", "禍禍しい", "凶々しい", "凶凶しい",
        ))

    def test_add_variant_kanji(self):
        """Variant spellings are added without duplicating existing ones."""
        exps = ["剝く", "掴む", "摑む"]
        Expressions.add_variant_kanji(exps)
        self._check_members(exps, 4, ("剥く", "剝く", "掴む", "摑む"))

    def test_add_variant_kanji2(self):
        """Two variant kanji in one word yield all four combinations."""
        exps = ["剝摑"]
        Expressions.add_variant_kanji(exps)
        self._check_members(exps, 4, ("剝摑", "剝掴", "剥掴", "剥摑"))

    def test_expand_abbreviation(self):
        """Two optional parts in one word expand to four spellings."""
        abbrs = Expressions.expand_abbreviation("有(り)合(わ)せ")
        self._check_members(abbrs, 4, (
            "有り合わせ", "有合わせ", "有り合せ", "有合せ",
        ))

    def test_expand_abbreviation_list(self):
        """Expansions of several inputs are merged into four unique items."""
        texts = ["有(り)合わせ", "有り合(わ)せ", "有合せ"]
        abbrs = Expressions.expand_abbreviation_list(texts)
        self._check_members(abbrs, 4, (
            "有り合わせ", "有合わせ", "有り合せ", "有合せ",
        ))

    def test_smk_expand_alternatives(self):
        """A △ segment with two alternatives expands to three strings."""
        exps = Expressions.expand_smk_alternatives("△金(時間・暇)に飽かして")
        self._check_members(exps, 3, (
            "金に飽かして", "時間に飽かして", "暇に飽かして",
        ))

    def test_daijirin_expand_alternatives(self):
        """One alternative segment with two options gives three strings."""
        exps = Expressions.expand_daijirin_alternatives("同じ穴の=狢(=狐・狸)")
        self._check_members(exps, 3, (
            "同じ穴の狢", "同じ穴の狐", "同じ穴の狸",
        ))

    def test_daijirin_expand_alternatives2(self):
        """Two alternative segments are combined into four strings."""
        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
        exps = Expressions.expand_daijirin_alternatives(text)
        self._check_members(exps, 4, (
            "聞くは一時の恥、聞かぬは末代の恥",
            "聞くは一時の恥、聞かぬは一生の恥",
            "聞くは一旦の恥、聞かぬは末代の恥",
            "聞くは一旦の恥、聞かぬは一生の恥",
        ))