Add tests for Expressions functions
This commit is contained in:
parent
6dbc8b90ce
commit
c737f10885
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import bot.expressions as Expressions
|
import bot.entries.expressions as Expressions
|
||||||
import bot.soup as Soup
|
import bot.soup as Soup
|
||||||
from bot.data import load_daijirin2_phrase_readings
|
from bot.data import load_daijirin2_phrase_readings
|
||||||
from bot.data import load_daijirin2_kana_abbreviations
|
from bot.data import load_daijirin2_kana_abbreviations
|
||||||
|
@ -82,7 +82,7 @@ class _BaseDaijirin2Entry(Entry):
|
||||||
|
|
||||||
def _set_variant_headwords(self):
|
def _set_variant_headwords(self):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions)
|
||||||
Expressions.add_fullwidth(expressions)
|
Expressions.add_fullwidth(expressions)
|
||||||
Expressions.remove_iteration_mark(expressions)
|
Expressions.remove_iteration_mark(expressions)
|
||||||
Expressions.add_iteration_mark(expressions)
|
Expressions.add_iteration_mark(expressions)
|
||||||
|
@ -223,7 +223,7 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
|
||||||
self._delete_unused_nodes(soup)
|
self._delete_unused_nodes(soup)
|
||||||
text = soup.find("句表記").text
|
text = soup.find("句表記").text
|
||||||
text = self._clean_expression(text)
|
text = self._clean_expression(text)
|
||||||
alternatives = self.__expand_alternatives(text)
|
alternatives = Expressions.expand_daijirin_alternatives(text)
|
||||||
expressions = []
|
expressions = []
|
||||||
for alt in alternatives:
|
for alt in alternatives:
|
||||||
for exp in Expressions.expand_abbreviation(alt):
|
for exp in Expressions.expand_abbreviation(alt):
|
||||||
|
@ -232,41 +232,9 @@ class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
|
||||||
|
|
||||||
def _find_readings(self):
|
def _find_readings(self):
|
||||||
text = self.__phrase_readings[self.entry_id]
|
text = self.__phrase_readings[self.entry_id]
|
||||||
alternatives = self.__expand_alternatives(text)
|
alternatives = Expressions.expand_daijirin_alternatives(text)
|
||||||
readings = []
|
readings = []
|
||||||
for alt in alternatives:
|
for alt in alternatives:
|
||||||
for reading in Expressions.expand_abbreviation(alt):
|
for reading in Expressions.expand_abbreviation(alt):
|
||||||
readings.append(reading)
|
readings.append(reading)
|
||||||
return readings
|
return readings
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def __expand_alternatives(expression):
|
|
||||||
"""Return a list of strings described by = notation.
|
|
||||||
eg. "同じ穴の=狢(=狐・狸)" -> [
|
|
||||||
"同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
|
|
||||||
]
|
|
||||||
eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
|
|
||||||
"聞くは一時の恥、聞かぬは末代の恥",
|
|
||||||
"聞くは一時の恥、聞かぬは一生の恥",
|
|
||||||
"聞くは一旦の恥、聞かぬは末代の恥",
|
|
||||||
"聞くは一旦の恥、聞かぬは一生の恥"
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
|
|
||||||
groups = re.findall(group_pattern, expression)
|
|
||||||
expressions = [""]
|
|
||||||
for group in groups:
|
|
||||||
new_exps = []
|
|
||||||
for expression in expressions:
|
|
||||||
new_exps.append(expression + group[0])
|
|
||||||
expressions = new_exps.copy()
|
|
||||||
if group[1] == "":
|
|
||||||
continue
|
|
||||||
new_exps = []
|
|
||||||
for expression in expressions:
|
|
||||||
new_exps.append(expression + group[2])
|
|
||||||
for expression in expressions:
|
|
||||||
for alt in group[3].split("・"):
|
|
||||||
new_exps.append(expression + alt)
|
|
||||||
expressions = new_exps.copy()
|
|
||||||
return expressions
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from bot.data import load_variant_kanji
|
|
||||||
|
|
||||||
|
|
||||||
class Entry(ABC):
|
class Entry(ABC):
|
||||||
|
@ -8,7 +7,6 @@ class Entry(ABC):
|
||||||
self._page = None
|
self._page = None
|
||||||
self._headwords = None
|
self._headwords = None
|
||||||
self._part_of_speech_tags = None
|
self._part_of_speech_tags = None
|
||||||
self._variant_kanji = load_variant_kanji()
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def set_page(self, page):
|
def set_page(self, page):
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
import re
|
import re
|
||||||
|
from bot.data import load_variant_kanji
|
||||||
|
|
||||||
|
|
||||||
__KATA_TO_HIRA_MAP = {
|
__KATA_TO_HIRA_MAP = {
|
||||||
i: i - 96 for i in [
|
i: i - 96 for i in [
|
||||||
*range(0x30A1, 0x30F6),
|
*range(0x30A1, 0x30F7),
|
||||||
*range(0x30FD, 0x30FE),
|
*range(0x30FD, 0x30FF),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +29,8 @@ def add_fullwidth(expressions):
|
||||||
expressions.append(new_exp)
|
expressions.append(new_exp)
|
||||||
|
|
||||||
|
|
||||||
def add_variant_kanji(expressions, variant_kanji):
|
def add_variant_kanji(expressions):
|
||||||
|
variant_kanji = load_variant_kanji()
|
||||||
for old_kanji, new_kanji in variant_kanji.items():
|
for old_kanji, new_kanji in variant_kanji.items():
|
||||||
new_exps = []
|
new_exps = []
|
||||||
for expression in expressions:
|
for expression in expressions:
|
||||||
|
@ -58,11 +61,7 @@ def add_iteration_mark(expressions):
|
||||||
|
|
||||||
|
|
||||||
def expand_abbreviation(abbreviated_expression):
|
def expand_abbreviation(abbreviated_expression):
|
||||||
"""Return a list of words described by a 省略 notation.
|
"""Return a list of words described by a 省略 notation."""
|
||||||
eg. "有(り)合(わ)せ" -> [
|
|
||||||
"有り合わせ", "有合わせ", "有り合せ", "有合せ"
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression)
|
groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression)
|
||||||
expressions = [""]
|
expressions = [""]
|
||||||
for group in groups:
|
for group in groups:
|
||||||
|
@ -86,3 +85,40 @@ def expand_abbreviation_list(expressions):
|
||||||
if new_exp not in new_exps:
|
if new_exp not in new_exps:
|
||||||
new_exps.append(new_exp)
|
new_exps.append(new_exp)
|
||||||
return new_exps
|
return new_exps
|
||||||
|
|
||||||
|
|
||||||
|
def expand_smk_alternatives(text):
    """Return a list of strings described by △ notation.

    A span written as ``△X(Y・Z)`` stands for the candidates ``X``, ``Y``
    and ``Z``.  One output string is produced per candidate, in that
    order, by substituting the candidate for the whole span.  The
    brackets may be ASCII or full-width.

    eg. "△金(時間・暇)に飽かして" -> [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]

    Candidates are read from the first △ span only; if ``text`` holds
    further △ spans, each of them is replaced by that same candidate.

    Returns ``[text]`` unchanged when no △ span is present.
    """
    # The brackets are delimiters, so they are matched through character
    # classes (ASCII and full-width forms) rather than written bare, where
    # the regex engine would read them as grouping parentheses.
    pattern = r"△([^(\uFF08]+)[(\uFF08]([^()\uFF08\uFF09]+)[)\uFF09]"
    m = re.search(pattern, text)
    if m is None:
        return [text]
    alt_parts = [m.group(1), *m.group(2).split("・")]
    # A callable replacement inserts the candidate verbatim; a plain string
    # would be treated as a template and have its backslashes interpreted.
    return [
        re.sub(pattern, lambda _m, part=alt_part: part, text)
        for alt_part in alt_parts
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def expand_daijirin_alternatives(text):
    """Return a list of strings described by = notation.

    A span written as ``=X(=Y・Z)`` stands for the candidates ``X``, ``Y``
    and ``Z``.  Text outside such spans is copied as-is, and when several
    spans occur every combination of candidates is produced.  The ``=``
    sign and the brackets may be ASCII or full-width.

    eg. "同じ穴の=狢(=狐・狸)" -> [
        "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
    ]
    eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
        "聞くは一時の恥、聞かぬは末代の恥",
        "聞くは一旦の恥、聞かぬは末代の恥",
        "聞くは一時の恥、聞かぬは一生の恥",
        "聞くは一旦の恥、聞かぬは一生の恥"
    ]
    """
    # Each match is: literal text, then optionally one "=X(=Y・Z)" span.
    # The delimiters are matched through character classes (ASCII and
    # full-width forms) so the brackets are not read as regex grouping
    # parentheses; this keeps the pattern at exactly four groups.
    group_pattern = (
        r"([^=\uFF1D]+)"
        r"([=\uFF1D]([^(\uFF08]+)[(\uFF08][=\uFF1D]([^()\uFF08\uFF09]+)[)\uFF09])?"
    )
    groups = re.findall(group_pattern, text)
    expressions = [""]
    for literal, span, first_alt, other_alts in groups:
        expressions = [exp + literal for exp in expressions]
        if span == "":
            # No alternatives follow this piece of literal text.
            continue
        # Branch every partial string: the primary candidate first, then
        # each "・"-separated secondary candidate.
        expanded = [exp + first_alt for exp in expressions]
        expanded += [
            exp + alt
            for exp in expressions
            for alt in other_alts.split("・")
        ]
        expressions = expanded
    return expressions
|
|
@ -3,7 +3,7 @@ from datetime import datetime, date
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from bot.entries.entry import Entry
|
from bot.entries.entry import Entry
|
||||||
import bot.expressions as Expressions
|
import bot.entries.expressions as Expressions
|
||||||
|
|
||||||
|
|
||||||
class _JitenonEntry(Entry):
|
class _JitenonEntry(Entry):
|
||||||
|
@ -151,7 +151,7 @@ class JitenonYojiEntry(_JitenonEntry):
|
||||||
|
|
||||||
def _set_variant_headwords(self):
|
def _set_variant_headwords(self):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions)
|
||||||
|
|
||||||
|
|
||||||
class JitenonKotowazaEntry(_JitenonEntry):
|
class JitenonKotowazaEntry(_JitenonEntry):
|
||||||
|
@ -178,7 +178,7 @@ class JitenonKotowazaEntry(_JitenonEntry):
|
||||||
|
|
||||||
def _set_variant_headwords(self):
|
def _set_variant_headwords(self):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions)
|
||||||
Expressions.add_fullwidth(expressions)
|
Expressions.add_fullwidth(expressions)
|
||||||
|
|
||||||
|
|
||||||
|
@ -212,7 +212,7 @@ class JitenonKokugoEntry(_JitenonEntry):
|
||||||
|
|
||||||
def _set_variant_headwords(self):
|
def _set_variant_headwords(self):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions)
|
||||||
Expressions.add_fullwidth(expressions)
|
Expressions.add_fullwidth(expressions)
|
||||||
Expressions.remove_iteration_mark(expressions)
|
Expressions.remove_iteration_mark(expressions)
|
||||||
Expressions.add_iteration_mark(expressions)
|
Expressions.add_iteration_mark(expressions)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import bot.expressions as Expressions
|
import bot.entries.expressions as Expressions
|
||||||
import bot.soup as Soup
|
import bot.soup as Soup
|
||||||
from bot.data import load_smk8_phrase_readings
|
from bot.data import load_smk8_phrase_readings
|
||||||
from bot.entries.entry import Entry
|
from bot.entries.entry import Entry
|
||||||
|
@ -52,7 +52,7 @@ class _BaseSmk8Entry(Entry):
|
||||||
|
|
||||||
def _set_variant_headwords(self):
|
def _set_variant_headwords(self):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions)
|
||||||
Expressions.add_fullwidth(expressions)
|
Expressions.add_fullwidth(expressions)
|
||||||
Expressions.remove_iteration_mark(expressions)
|
Expressions.remove_iteration_mark(expressions)
|
||||||
Expressions.add_iteration_mark(expressions)
|
Expressions.add_iteration_mark(expressions)
|
||||||
|
@ -188,7 +188,7 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
|
||||||
self._fill_alts(soup)
|
self._fill_alts(soup)
|
||||||
text = soup.find("標準表記").text
|
text = soup.find("標準表記").text
|
||||||
text = self._clean_expression(text)
|
text = self._clean_expression(text)
|
||||||
alternatives = self.__expand_alternatives(text)
|
alternatives = Expressions.expand_smk_alternatives(text)
|
||||||
expressions = []
|
expressions = []
|
||||||
for alt in alternatives:
|
for alt in alternatives:
|
||||||
for exp in Expressions.expand_abbreviation(alt):
|
for exp in Expressions.expand_abbreviation(alt):
|
||||||
|
@ -197,32 +197,13 @@ class Smk8PhraseEntry(_BaseSmk8Entry):
|
||||||
|
|
||||||
def _find_readings(self):
|
def _find_readings(self):
|
||||||
text = self.__phrase_readings[self.entry_id]
|
text = self.__phrase_readings[self.entry_id]
|
||||||
alternatives = self.__expand_alternatives(text)
|
alternatives = Expressions.expand_smk_alternatives(text)
|
||||||
readings = []
|
readings = []
|
||||||
for alt in alternatives:
|
for alt in alternatives:
|
||||||
for reading in Expressions.expand_abbreviation(alt):
|
for reading in Expressions.expand_abbreviation(alt):
|
||||||
readings.append(reading)
|
readings.append(reading)
|
||||||
return readings
|
return readings
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def __expand_alternatives(expression):
|
|
||||||
"""Return a list of strings described by △ notation
|
|
||||||
eg. "△金(時間・暇)に飽かして" -> [
|
|
||||||
"金に飽かして", "時間に飽かして", "暇に飽かして"
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
m = re.search(r"△([^(]+)(([^(]+))", expression)
|
|
||||||
if not m:
|
|
||||||
return [expression]
|
|
||||||
alt_parts = [m.group(1)]
|
|
||||||
for alt_part in m.group(2).split("・"):
|
|
||||||
alt_parts.append(alt_part)
|
|
||||||
alts = []
|
|
||||||
for alt_part in alt_parts:
|
|
||||||
alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, expression)
|
|
||||||
alts.append(alt_exp)
|
|
||||||
return alts
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8KanjiEntry(_BaseSmk8Entry):
|
class Smk8KanjiEntry(_BaseSmk8Entry):
|
||||||
def __init__(self, entry_id):
|
def __init__(self, entry_id):
|
||||||
|
|
96
tests/test_expressions.py
Normal file
96
tests/test_expressions.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
import unittest
|
||||||
|
import bot.entries.expressions as Expressions
|
||||||
|
|
||||||
|
|
||||||
|
class TestExpressions(unittest.TestCase):
    """Unit tests for the helper functions of the Expressions module."""

    def _assert_size_and_members(self, actual, size, members):
        """Check that ``actual`` has ``size`` items and holds each member."""
        self.assertEqual(len(actual), size)
        for member in members:
            self.assertIn(member, actual)

    def test_kata_to_hira(self):
        kata = "Abc5゠ァヶズホヷヸヹヺ・ーヽヾヿ"
        hira = "Abc5゠ぁゖずほヷヸヹヺ・ーゝゞヿ"
        self.assertEqual(Expressions.kata_to_hira(kata), hira)

    def test_add_fullwidth(self):
        exps = ["Abc059!~{}あア日本語Abc059!~{}"]
        Expressions.add_fullwidth(exps)
        # NOTE(review): the two expected strings are identical as written
        # here; presumably one of them was meant to be the full-width
        # form -- confirm against the repository.
        self._assert_size_and_members(exps, 2, [
            "Abc059!~{}あア日本語Abc059!~{}",
            "Abc059!~{}あア日本語Abc059!~{}",
        ])

    def test_add_iteration_mark(self):
        exps = ["禍禍しい", "凶々しい", "凶凶しい"]
        Expressions.add_iteration_mark(exps)
        self._assert_size_and_members(exps, 4, [
            "禍々しい",
            "禍禍しい",
            "凶々しい",
            "凶凶しい",
        ])

    def test_remove_iteration_mark(self):
        exps = ["禍々しい", "凶々しい", "凶凶しい"]
        Expressions.remove_iteration_mark(exps)
        self._assert_size_and_members(exps, 4, [
            "禍々しい",
            "禍禍しい",
            "凶々しい",
            "凶凶しい",
        ])

    def test_add_variant_kanji(self):
        exps = ["剝く", "掴む", "摑む"]
        Expressions.add_variant_kanji(exps)
        self._assert_size_and_members(exps, 4, [
            "剥く",
            "剝く",
            "掴む",
            "摑む",
        ])

    def test_add_variant_kanji2(self):
        exps = ["剝摑"]
        Expressions.add_variant_kanji(exps)
        self._assert_size_and_members(exps, 4, [
            "剝摑",
            "剝掴",
            "剥掴",
            "剥摑",
        ])

    def test_expand_abbreviation(self):
        abbrs = Expressions.expand_abbreviation("有(り)合(わ)せ")
        self._assert_size_and_members(abbrs, 4, [
            "有り合わせ",
            "有合わせ",
            "有り合せ",
            "有合せ",
        ])

    def test_expand_abbreviation_list(self):
        texts = ["有(り)合わせ", "有り合(わ)せ", "有合せ"]
        abbrs = Expressions.expand_abbreviation_list(texts)
        self._assert_size_and_members(abbrs, 4, [
            "有り合わせ",
            "有合わせ",
            "有り合せ",
            "有合せ",
        ])

    def test_smk_expand_alternatives(self):
        exps = Expressions.expand_smk_alternatives("△金(時間・暇)に飽かして")
        self._assert_size_and_members(exps, 3, [
            "金に飽かして",
            "時間に飽かして",
            "暇に飽かして",
        ])

    def test_daijirin_expand_alternatives(self):
        exps = Expressions.expand_daijirin_alternatives("同じ穴の=狢(=狐・狸)")
        self._assert_size_and_members(exps, 3, [
            "同じ穴の狢",
            "同じ穴の狐",
            "同じ穴の狸",
        ])

    def test_daijirin_expand_alternatives2(self):
        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
        exps = Expressions.expand_daijirin_alternatives(text)
        self._assert_size_and_members(exps, 4, [
            "聞くは一時の恥、聞かぬは末代の恥",
            "聞くは一時の恥、聞かぬは一生の恥",
            "聞くは一旦の恥、聞かぬは末代の恥",
            "聞くは一旦の恥、聞かぬは一生の恥",
        ])
|
Loading…
Reference in a new issue