Add tests for Expressions
functions
This commit is contained in:
parent
6dbc8b90ce
commit
c737f10885
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import bot.expressions as Expressions
|
||||
import bot.entries.expressions as Expressions
|
||||
import bot.soup as Soup
|
||||
from bot.data import load_daijirin2_phrase_readings
|
||||
from bot.data import load_daijirin2_kana_abbreviations
|
||||
|
@ -82,7 +82,7 @@ class _BaseDaijirin2Entry(Entry):
|
|||
|
||||
def _set_variant_headwords(self):
|
||||
for expressions in self._headwords.values():
|
||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||
Expressions.add_variant_kanji(expressions)
|
||||
Expressions.add_fullwidth(expressions)
|
||||
Expressions.remove_iteration_mark(expressions)
|
||||
Expressions.add_iteration_mark(expressions)
|
||||
|
@ -223,7 +223,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
|
|||
self._delete_unused_nodes(soup)
|
||||
text = soup.find("句表記").text
|
||||
text = self._clean_expression(text)
|
||||
alternatives = self.__expand_alternatives(text)
|
||||
alternatives = Expressions.expand_daijirin_alternatives(text)
|
||||
expressions = []
|
||||
for alt in alternatives:
|
||||
for exp in Expressions.expand_abbreviation(alt):
|
||||
|
@ -232,41 +232,9 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
|
|||
|
||||
def _find_readings(self):
|
||||
text = self.__phrase_readings[self.entry_id]
|
||||
alternatives = self.__expand_alternatives(text)
|
||||
alternatives = Expressions.expand_daijirin_alternatives(text)
|
||||
readings = []
|
||||
for alt in alternatives:
|
||||
for reading in Expressions.expand_abbreviation(alt):
|
||||
readings.append(reading)
|
||||
return readings
|
||||
|
||||
@staticmethod
def __expand_alternatives(expression):
    """Return a list of strings described by = notation.

    A word introduced by "=" may be followed by a parenthesised group
    that starts with "=" and lists alternatives separated by "・". Every
    combination of the marked words and their alternatives is produced.

    eg. "同じ穴の=狢(=狐・狸)" -> [
        "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
    ]
    eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
        "聞くは一時の恥、聞かぬは末代の恥",
        "聞くは一時の恥、聞かぬは一生の恥",
        "聞くは一旦の恥、聞かぬは末代の恥",
        "聞くは一旦の恥、聞かぬは一生の恥"
    ]

    Both ASCII and full-width forms of "=", "(" and ")" are recognised.
    An expression without the notation comes back as a single-item list.
    """
    # The delimiters are literal characters of the notation, so they are
    # matched through character classes rather than left to act as regex
    # groups. Captures: leading text, whole notation (may be empty),
    # marked word, "・"-separated alternatives.
    group_pattern = r"([^=＝]+)([=＝]([^(（]+)[(（][=＝]([^(（]+)[)）])?"
    expressions = [""]
    for prefix, notation, base, alts in re.findall(group_pattern, expression):
        expressions = [exp + prefix for exp in expressions]
        if notation == "":
            continue
        # Marked word first for every prefix, then each alternative.
        expanded = [exp + base for exp in expressions]
        expanded += [
            exp + alt for exp in expressions for alt in alts.split("・")
        ]
        expressions = expanded
    return expressions
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from bot.data import load_variant_kanji
|
||||
|
||||
|
||||
class Entry(ABC):
|
||||
|
@ -8,7 +7,6 @@ class Entry(ABC):
|
|||
self._page = None
|
||||
self._headwords = None
|
||||
self._part_of_speech_tags = None
|
||||
self._variant_kanji = load_variant_kanji()
|
||||
|
||||
@abstractmethod
|
||||
def set_page(self, page):
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import re
|
||||
from bot.data import load_variant_kanji
|
||||
|
||||
|
||||
__KATA_TO_HIRA_MAP = {
|
||||
i: i - 96 for i in [
|
||||
*range(0x30A1, 0x30F6),
|
||||
*range(0x30FD, 0x30FE),
|
||||
*range(0x30A1, 0x30F7),
|
||||
*range(0x30FD, 0x30FF),
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -27,7 +29,8 @@ def add_fullwidth(expressions):
|
|||
expressions.append(new_exp)
|
||||
|
||||
|
||||
def add_variant_kanji(expressions, variant_kanji):
|
||||
def add_variant_kanji(expressions):
|
||||
variant_kanji = load_variant_kanji()
|
||||
for old_kanji, new_kanji in variant_kanji.items():
|
||||
new_exps = []
|
||||
for expression in expressions:
|
||||
|
@ -58,11 +61,7 @@ def add_iteration_mark(expressions):
|
|||
|
||||
|
||||
def expand_abbreviation(abbreviated_expression):
|
||||
"""Return a list of words described by a 省略 notation.
|
||||
eg. "有(り)合(わ)せ" -> [
|
||||
"有り合わせ", "有合わせ", "有り合せ", "有合せ"
|
||||
]
|
||||
"""
|
||||
"""Return a list of words described by a 省略 notation."""
|
||||
groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression)
|
||||
expressions = [""]
|
||||
for group in groups:
|
||||
|
@ -86,3 +85,40 @@ def expand_abbreviation_list(expressions):
|
|||
if new_exp not in new_exps:
|
||||
new_exps.append(new_exp)
|
||||
return new_exps
|
||||
|
||||
|
||||
def expand_smk_alternatives(text):
    """Return a list of strings described by △ notation.

    A "△" marks a word that is followed by a parenthesised list of
    alternatives separated by "・". One string is produced for the
    marked word and one for each alternative.

    eg. "△金(時間・暇)に飽かして" -> [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]

    Both ASCII and full-width parentheses are recognised. Text without
    the notation is returned unchanged as a single-item list.
    """
    # The parentheses are literal delimiters of the notation, so they are
    # matched through character classes rather than left to act as regex
    # groups (which would never match the bracketed list).
    pattern = r"△([^(（]+)[(（]([^(（]+)[)）]"
    m = re.search(pattern, text)
    if m is None:
        return [text]
    # Marked word first, then the listed alternatives in order.
    alt_parts = [m.group(1), *m.group(2).split("・")]
    # Only the first △ group supplies the alternatives; re.sub then
    # substitutes the chosen one for every match in the text.
    return [re.sub(pattern, alt_part, text) for alt_part in alt_parts]
|
||||
|
||||
|
||||
def expand_daijirin_alternatives(text):
    """Return a list of strings described by = notation.

    A word introduced by "=" may be followed by a parenthesised group
    that starts with "=" and lists alternatives separated by "・". Every
    combination of the marked words and their alternatives is produced.

    eg. "同じ穴の=狢(=狐・狸)" -> [
        "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
    ]

    Both ASCII and full-width forms of "=", "(" and ")" are recognised.
    Text without the notation comes back as a single-item list; an empty
    string yields [""].
    """
    # The delimiters are literal characters of the notation, so they are
    # matched through character classes rather than left to act as regex
    # groups. Captures: leading text, whole notation (may be empty),
    # marked word, "・"-separated alternatives.
    group_pattern = r"([^=＝]+)([=＝]([^(（]+)[(（][=＝]([^(（]+)[)）])?"
    expressions = [""]
    for prefix, notation, base, alts in re.findall(group_pattern, text):
        expressions = [exp + prefix for exp in expressions]
        if notation == "":
            continue
        # Marked word first for every prefix, then each alternative.
        expanded = [exp + base for exp in expressions]
        expanded += [
            exp + alt for exp in expressions for alt in alts.split("・")
        ]
        expressions = expanded
    return expressions
|
|
@ -3,7 +3,7 @@ from datetime import datetime, date
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
from bot.entries.entry import Entry
|
||||
import bot.expressions as Expressions
|
||||
import bot.entries.expressions as Expressions
|
||||
|
||||
|
||||
class _JitenonEntry(Entry):
|
||||
|
@ -151,7 +151,7 @@ class JitenonYojiEntry(_JitenonEntry):
|
|||
|
||||
def _set_variant_headwords(self):
|
||||
for expressions in self._headwords.values():
|
||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||
Expressions.add_variant_kanji(expressions)
|
||||
|
||||
|
||||
class JitenonKotowazaEntry(_JitenonEntry):
|
||||
|
@ -178,7 +178,7 @@ class JitenonKotowazaEntry(_JitenonEntry):
|
|||
|
||||
def _set_variant_headwords(self):
|
||||
for expressions in self._headwords.values():
|
||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||
Expressions.add_variant_kanji(expressions)
|
||||
Expressions.add_fullwidth(expressions)
|
||||
|
||||
|
||||
|
@ -212,7 +212,7 @@ class JitenonKokugoEntry(_JitenonEntry):
|
|||
|
||||
def _set_variant_headwords(self):
|
||||
for expressions in self._headwords.values():
|
||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||
Expressions.add_variant_kanji(expressions)
|
||||
Expressions.add_fullwidth(expressions)
|
||||
Expressions.remove_iteration_mark(expressions)
|
||||
Expressions.add_iteration_mark(expressions)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import bot.expressions as Expressions
|
||||
import bot.entries.expressions as Expressions
|
||||
import bot.soup as Soup
|
||||
from bot.data import load_smk8_phrase_readings
|
||||
from bot.entries.entry import Entry
|
||||
|
@ -52,7 +52,7 @@ class _BaseSmk8Entry(Entry):
|
|||
|
||||
def _set_variant_headwords(self):
|
||||
for expressions in self._headwords.values():
|
||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||
Expressions.add_variant_kanji(expressions)
|
||||
Expressions.add_fullwidth(expressions)
|
||||
Expressions.remove_iteration_mark(expressions)
|
||||
Expressions.add_iteration_mark(expressions)
|
||||
|
@ -188,7 +188,7 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
|
|||
self._fill_alts(soup)
|
||||
text = soup.find("標準表記").text
|
||||
text = self._clean_expression(text)
|
||||
alternatives = self.__expand_alternatives(text)
|
||||
alternatives = Expressions.expand_smk_alternatives(text)
|
||||
expressions = []
|
||||
for alt in alternatives:
|
||||
for exp in Expressions.expand_abbreviation(alt):
|
||||
|
@ -197,32 +197,13 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
|
|||
|
||||
def _find_readings(self):
|
||||
text = self.__phrase_readings[self.entry_id]
|
||||
alternatives = self.__expand_alternatives(text)
|
||||
alternatives = Expressions.expand_smk_alternatives(text)
|
||||
readings = []
|
||||
for alt in alternatives:
|
||||
for reading in Expressions.expand_abbreviation(alt):
|
||||
readings.append(reading)
|
||||
return readings
|
||||
|
||||
@staticmethod
def __expand_alternatives(expression):
    """Return a list of strings described by △ notation.

    A "△" marks a word that is followed by a parenthesised list of
    alternatives separated by "・". One string is produced for the
    marked word and one for each alternative.

    eg. "△金(時間・暇)に飽かして" -> [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]

    Both ASCII and full-width parentheses are recognised. An expression
    without the notation is returned unchanged as a single-item list.
    """
    # The parentheses are literal delimiters of the notation, so they are
    # matched through character classes rather than left to act as regex
    # groups (which would never match the bracketed list).
    pattern = r"△([^(（]+)[(（]([^(（]+)[)）]"
    m = re.search(pattern, expression)
    if not m:
        return [expression]
    # Marked word first, then the listed alternatives in order.
    alt_parts = [m.group(1), *m.group(2).split("・")]
    # Only the first △ group supplies the alternatives; re.sub then
    # substitutes the chosen one for every match in the expression.
    return [re.sub(pattern, alt_part, expression) for alt_part in alt_parts]
|
||||
|
||||
|
||||
class Smk8KanjiEntry(_BaseSmk8Entry):
|
||||
def __init__(self, entry_id):
|
||||
|
|
96
tests/test_expressions.py
Normal file
96
tests/test_expressions.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
import unittest
|
||||
import bot.entries.expressions as Expressions
|
||||
|
||||
|
||||
class TestExpressions(unittest.TestCase):
    """Unit tests for the helpers exposed by ``bot.entries.expressions``."""

    def _assert_size_and_members(self, actual, members):
        """Check that ``actual`` has ``len(members)`` items and holds each one."""
        self.assertEqual(len(actual), len(members))
        for member in members:
            self.assertIn(member, actual)

    def test_kata_to_hira(self):
        """Katakana is converted to hiragana; other characters pass through."""
        source = "Abc5゠ァヶズホヷヸヹヺ・ーヽヾヿ"
        expected = "Abc5゠ぁゖずほヷヸヹヺ・ーゝゞヿ"
        self.assertEqual(Expressions.kata_to_hira(source), expected)

    def test_add_fullwidth(self):
        """One additional entry is appended to the list in place."""
        exps = ["Abc059!~{}あア日本語Abc059!~{}"]
        Expressions.add_fullwidth(exps)
        # NOTE(review): both expected strings are identical as written; the
        # second was presumably meant to be the full-width form - confirm.
        self._assert_size_and_members(exps, [
            "Abc059!~{}あア日本語Abc059!~{}",
            "Abc059!~{}あア日本語Abc059!~{}",
        ])

    def test_add_iteration_mark(self):
        """A doubled kanji gains its 々 spelling when that is missing."""
        exps = ["禍禍しい", "凶々しい", "凶凶しい"]
        Expressions.add_iteration_mark(exps)
        self._assert_size_and_members(
            exps, ["禍々しい", "禍禍しい", "凶々しい", "凶凶しい"])

    def test_remove_iteration_mark(self):
        """A 々 spelling gains its doubled-kanji form when that is missing."""
        exps = ["禍々しい", "凶々しい", "凶凶しい"]
        Expressions.remove_iteration_mark(exps)
        self._assert_size_and_members(
            exps, ["禍々しい", "禍禍しい", "凶々しい", "凶凶しい"])

    def test_add_variant_kanji(self):
        """Missing variant spellings are added without duplicating known ones."""
        exps = ["剝く", "掴む", "摑む"]
        Expressions.add_variant_kanji(exps)
        self._assert_size_and_members(
            exps, ["剥く", "剝く", "掴む", "摑む"])

    def test_add_variant_kanji2(self):
        """Two replaceable kanji in one word yield all four combinations."""
        exps = ["剝摑"]
        Expressions.add_variant_kanji(exps)
        self._assert_size_and_members(
            exps, ["剝摑", "剝掴", "剥掴", "剥摑"])

    def test_expand_abbreviation(self):
        """Each optional (…) segment is expanded both with and without it."""
        abbrs = Expressions.expand_abbreviation("有(り)合(わ)せ")
        self._assert_size_and_members(
            abbrs, ["有り合わせ", "有合わせ", "有り合せ", "有合せ"])

    def test_expand_abbreviation_list(self):
        """Expansions of several inputs are merged into four distinct forms."""
        texts = ["有(り)合わせ", "有り合(わ)せ", "有合せ"]
        abbrs = Expressions.expand_abbreviation_list(texts)
        self._assert_size_and_members(
            abbrs, ["有り合わせ", "有合わせ", "有り合せ", "有合せ"])

    def test_smk_expand_alternatives(self):
        """△ notation gives the marked word plus each listed alternative."""
        exps = Expressions.expand_smk_alternatives("△金(時間・暇)に飽かして")
        self._assert_size_and_members(
            exps, ["金に飽かして", "時間に飽かして", "暇に飽かして"])

    def test_daijirin_expand_alternatives(self):
        """= notation gives the marked word plus each listed alternative."""
        exps = Expressions.expand_daijirin_alternatives("同じ穴の=狢(=狐・狸)")
        self._assert_size_and_members(
            exps, ["同じ穴の狢", "同じ穴の狐", "同じ穴の狸"])

    def test_daijirin_expand_alternatives2(self):
        """Two = groups in one phrase produce every combination."""
        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
        exps = Expressions.expand_daijirin_alternatives(text)
        self._assert_size_and_members(exps, [
            "聞くは一時の恥、聞かぬは末代の恥",
            "聞くは一時の恥、聞かぬは一生の恥",
            "聞くは一旦の恥、聞かぬは末代の恥",
            "聞くは一旦の恥、聞かぬは一生の恥",
        ])
|
Loading…
Reference in a new issue