Add tests for Expressions functions

This commit is contained in:
stephenmk 2023-05-06 20:07:07 -05:00
parent 6dbc8b90ce
commit c737f10885
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
6 changed files with 152 additions and 73 deletions

View file

@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
import bot.expressions as Expressions
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_daijirin2_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
@ -82,7 +82,7 @@ class _BaseDaijirin2Entry(Entry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
@ -223,7 +223,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
self._delete_unused_nodes(soup)
text = soup.find("句表記").text
text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text)
alternatives = Expressions.expand_daijirin_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
@ -232,41 +232,9 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text)
alternatives = Expressions.expand_daijirin_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
@staticmethod
def __expand_alternatives(expression):
"""Return a list of strings described by = notation.

A segment written as "=base(=alt1・alt2)" stands for "base", "alt1" or
"alt2". When several such segments occur in one expression, every
combination of choices is returned.

eg. "同じ穴の=狢(=狐・狸)" -> [
"同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
]
eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
"聞くは一時の恥、聞かぬは末代の恥",
"聞くは一時の恥、聞かぬは一生の恥",
"聞くは一旦の恥、聞かぬは末代の恥",
"聞くは一旦の恥、聞かぬは一生の恥"
]
"""
# NOTE(review): the delimiter characters in this pattern and in the split()
# call further down appear to be missing in this view (presumably "=",
# "(", ")" and "・", judging by the docstring examples) — confirm against
# the repository before relying on these literals.
# Each match yields (plain text, whole alternative segment, base, alternatives).
group_pattern = r"([^]+)(([^]+)([^]+))?"
groups = re.findall(group_pattern, expression)
# Start from a single empty prefix and grow it one matched group at a time.
expressions = [""]
for group in groups:
# Every partial result receives the plain text preceding the segment.
new_exps = []
# The loop variable reuses the parameter name; the parameter is not read again.
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
# No alternative segment followed this text, so there is nothing to branch on.
if group[1] == "":
continue
# Branch: first the base word, then each listed alternative.
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
for expression in expressions:
for alt in group[3].split(""):
new_exps.append(expression + alt)
expressions = new_exps.copy()
return expressions

View file

@ -1,5 +1,4 @@
from abc import ABC, abstractmethod
from bot.data import load_variant_kanji
class Entry(ABC):
@ -8,7 +7,6 @@ class Entry(ABC):
self._page = None
self._headwords = None
self._part_of_speech_tags = None
self._variant_kanji = load_variant_kanji()
@abstractmethod
def set_page(self, page):

View file

@ -1,9 +1,11 @@
import re
from bot.data import load_variant_kanji
__KATA_TO_HIRA_MAP = {
i: i - 96 for i in [
*range(0x30A1, 0x30F6),
*range(0x30FD, 0x30FE),
*range(0x30A1, 0x30F7),
*range(0x30FD, 0x30FF),
]
}
@ -27,7 +29,8 @@ def add_fullwidth(expressions):
expressions.append(new_exp)
def add_variant_kanji(expressions, variant_kanji):
def add_variant_kanji(expressions):
variant_kanji = load_variant_kanji()
for old_kanji, new_kanji in variant_kanji.items():
new_exps = []
for expression in expressions:
@ -58,11 +61,7 @@ def add_iteration_mark(expressions):
def expand_abbreviation(abbreviated_expression):
"""Return a list of words described by a 省略 notation.
eg. "有(り)合(わ)せ" -> [
"有り合わせ", "有合わせ", "有り合せ", "有合せ"
]
"""
"""Return a list of words described by a 省略 notation."""
groups = re.findall(r"([^]*)(([^]+))?", abbreviated_expression)
expressions = [""]
for group in groups:
@ -86,3 +85,40 @@ def expand_abbreviation_list(expressions):
if new_exp not in new_exps:
new_exps.append(new_exp)
return new_exps
def expand_smk_alternatives(text):
"""Return a list of strings described by △ notation."""
m = re.search(r"△([^]+)([^]+)", text)
if m is None:
return [text]
alt_parts = [m.group(1)]
for alt_part in m.group(2).split(""):
alt_parts.append(alt_part)
alts = []
for alt_part in alt_parts:
alt_exp = re.sub(r"△[^]+[^]+", alt_part, text)
alts.append(alt_exp)
return alts
def expand_daijirin_alternatives(text):
"""Return a list of strings described by notation."""
group_pattern = r"([^]+)(([^]+)([^]+))?"
groups = re.findall(group_pattern, text)
expressions = [""]
for group in groups:
new_exps = []
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
if group[1] == "":
continue
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
for expression in expressions:
for alt in group[3].split(""):
new_exps.append(expression + alt)
expressions = new_exps.copy()
return expressions

View file

@ -3,7 +3,7 @@ from datetime import datetime, date
from bs4 import BeautifulSoup
from bot.entries.entry import Entry
import bot.expressions as Expressions
import bot.entries.expressions as Expressions
class _JitenonEntry(Entry):
@ -151,7 +151,7 @@ class JitenonYojiEntry(_JitenonEntry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry):
@ -178,7 +178,7 @@ class JitenonKotowazaEntry(_JitenonEntry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
@ -212,7 +212,7 @@ class JitenonKokugoEntry(_JitenonEntry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)

View file

@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
import bot.expressions as Expressions
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_smk8_phrase_readings
from bot.entries.entry import Entry
@ -52,7 +52,7 @@ class _BaseSmk8Entry(Entry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
@ -188,7 +188,7 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text)
alternatives = Expressions.expand_smk_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
@ -197,32 +197,13 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text)
alternatives = Expressions.expand_smk_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
@staticmethod
def __expand_alternatives(expression):
"""Return a list of strings described by △ notation

A group written as "△base(alt1・alt2)" stands for "base", "alt1" or
"alt2"; one string is returned per option, base word first.

eg. "△金(時間・暇)に飽かして" -> [
"金に飽かして", "時間に飽かして", "暇に飽かして"
]
"""
# NOTE(review): the delimiter characters in the two patterns and in the
# split() call below appear to be missing in this view (presumably "(",
# ")" and "・", judging by the docstring example) — confirm against the
# repository before relying on these literals.
m = re.search(r"△([^]+)([^]+)", expression)
# Without a △ group the expression is returned unchanged as the only result.
if not m:
return [expression]
# The base word is the first option, followed by each listed alternative.
alt_parts = [m.group(1)]
for alt_part in m.group(2).split(""):
alt_parts.append(alt_part)
alts = []
for alt_part in alt_parts:
# re.sub has no count argument here, so every matching group in the
# expression is replaced by the current option.
alt_exp = re.sub(r"△[^]+[^]+", alt_part, expression)
alts.append(alt_exp)
return alts
class Smk8KanjiEntry(_BaseSmk8Entry):
def __init__(self, entry_id):

96
tests/test_expressions.py Normal file
View file

@ -0,0 +1,96 @@
import unittest
import bot.entries.expressions as Expressions
class TestExpressions(unittest.TestCase):
    """Unit tests for the helper functions in ``bot.entries.expressions``."""

    def _assert_exact_members(self, actual, expected):
        """Check that ``actual`` has ``len(expected)`` items and holds each one."""
        self.assertEqual(len(actual), len(expected))
        for item in expected:
            self.assertIn(item, actual)

    def test_kata_to_hira(self):
        # The sample mixes ASCII, convertible katakana and characters that
        # are expected to come back unchanged.
        katakana = "Abc5ァヶズホヷヸヹヺ・ーヽヾヿ"
        expected = "Abc5ぁゖずほヷヸヹヺ・ーゝゞヿ"
        self.assertEqual(Expressions.kata_to_hira(katakana), expected)

    def test_add_fullwidth(self):
        # NOTE(review): the input and both expected strings appear identical
        # in this view; one of them is presumably the full-width form —
        # confirm against the repository.
        exps = ["Abc059!~{}あア日本語Abc059!~{}"]
        Expressions.add_fullwidth(exps)
        self._assert_exact_members(exps, [
            "Abc059!~{}あア日本語Abc059!~{}",
            "Abc059!~{}あア日本語Abc059!~{}",
        ])

    def test_add_iteration_mark(self):
        # The list is extended in place with the 々 spelling.
        exps = ["禍禍しい", "凶々しい", "凶凶しい"]
        Expressions.add_iteration_mark(exps)
        self._assert_exact_members(exps, [
            "禍々しい",
            "禍禍しい",
            "凶々しい",
            "凶凶しい",
        ])

    def test_remove_iteration_mark(self):
        # The list is extended in place with the repeated-kanji spelling.
        exps = ["禍々しい", "凶々しい", "凶凶しい"]
        Expressions.remove_iteration_mark(exps)
        self._assert_exact_members(exps, [
            "禍々しい",
            "禍禍しい",
            "凶々しい",
            "凶凶しい",
        ])

    def test_add_variant_kanji(self):
        # Variants already present in the list must not be duplicated.
        exps = ["剝く", "掴む", "摑む"]
        Expressions.add_variant_kanji(exps)
        self._assert_exact_members(exps, [
            "剥く",
            "剝く",
            "掴む",
            "摑む",
        ])

    def test_add_variant_kanji2(self):
        # Two variant kanji in one word: all four combinations are expected.
        exps = ["剝摑"]
        Expressions.add_variant_kanji(exps)
        self._assert_exact_members(exps, [
            "剝摑",
            "剝掴",
            "剥掴",
            "剥摑",
        ])

    def test_expand_abbreviation(self):
        # Each parenthesised kana may be kept or dropped independently.
        abbrs = Expressions.expand_abbreviation("有(り)合(わ)せ")
        self._assert_exact_members(abbrs, [
            "有り合わせ",
            "有合わせ",
            "有り合せ",
            "有合せ",
        ])

    def test_expand_abbreviation_list(self):
        # Expansions shared between inputs must appear only once.
        texts = ["有(り)合わせ", "有り合(わ)せ", "有合せ"]
        abbrs = Expressions.expand_abbreviation_list(texts)
        self._assert_exact_members(abbrs, [
            "有り合わせ",
            "有合わせ",
            "有り合せ",
            "有合せ",
        ])

    def test_smk_expand_alternatives(self):
        exps = Expressions.expand_smk_alternatives("△金(時間・暇)に飽かして")
        self._assert_exact_members(exps, [
            "金に飽かして",
            "時間に飽かして",
            "暇に飽かして",
        ])

    def test_daijirin_expand_alternatives(self):
        exps = Expressions.expand_daijirin_alternatives("同じ穴の=狢(=狐・狸)")
        self._assert_exact_members(exps, [
            "同じ穴の狢",
            "同じ穴の狐",
            "同じ穴の狸",
        ])

    def test_daijirin_expand_alternatives2(self):
        # Two alternative segments in one phrase: 2 x 2 combinations.
        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
        exps = Expressions.expand_daijirin_alternatives(text)
        self._assert_exact_members(exps, [
            "聞くは一時の恥、聞かぬは末代の恥",
            "聞くは一時の恥、聞かぬは一生の恥",
            "聞くは一旦の恥、聞かぬは末代の恥",
            "聞くは一旦の恥、聞かぬは一生の恥",
        ])