2023-04-08 03:05:36 +00:00
|
|
|
|
import re
|
2023-05-07 01:07:07 +00:00
|
|
|
|
from bot.data import load_variant_kanji
|
|
|
|
|
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
2023-05-01 22:31:28 +00:00
|
|
|
|
# Translation table (code point -> code point) for str.translate().
# Katakana U+30A1..U+30F6 and the iteration marks U+30FD..U+30FE are each
# mapped to the code point 0x60 (96) below them, i.e. into the hiragana block.
__KATA_TO_HIRA_MAP = {
    code_point: code_point - 0x60
    for code_point in (*range(0x30A1, 0x30F7), *range(0x30FD, 0x30FF))
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__HALFWIDTH_TO_FULLWIDTH_MAP = {
|
|
|
|
|
i: i + 0xFEE0 for i in [
|
|
|
|
|
*range(0x21, 0x7F),
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def kata_to_hira(text):
    """Return *text* with katakana replaced by the matching hiragana.

    The conversion is a single ``str.translate`` pass over the module's
    katakana-to-hiragana table; characters without an entry in that table
    are passed through unchanged.
    """
    return text.translate(__KATA_TO_HIRA_MAP)
|
2023-04-23 16:46:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_fullwidth(expressions):
    """Append the fullwidth spelling of each expression to *expressions*.

    Every expression is run through the module's halfwidth-to-fullwidth
    translation table, and a result that is not already present is
    appended. The list is modified in place; nothing is returned.
    """
    # Iterate over a snapshot rather than the list being appended to.
    # The appended strings need no second pass: the table's keys
    # (0x21..0x7E) and values (shifted by 0xFEE0) do not overlap, so
    # translating an already-translated string returns it unchanged.
    for expression in list(expressions):
        fullwidth = expression.translate(__HALFWIDTH_TO_FULLWIDTH_MAP)
        if fullwidth not in expressions:
            expressions.append(fullwidth)
|
2023-04-23 16:46:27 +00:00
|
|
|
|
|
|
|
|
|
|
2023-05-07 01:07:07 +00:00
|
|
|
|
def add_variant_kanji(expressions):
    """Append spellings of each expression with variant kanji substituted.

    ``load_variant_kanji()`` is expected to return a mapping; for each
    ``old -> new`` pair, every expression containing ``old`` yields a copy
    with all occurrences replaced by ``new``. Copies not already present
    are appended. The list is modified in place; nothing is returned.

    Pairs are applied one after another, so spellings produced by an
    earlier pair are visible to the pairs that follow.
    """
    for old_kanji, new_kanji in load_variant_kanji().items():
        # Collect the candidates first so the list is not extended while
        # it is being scanned for this pair.
        candidates = [
            expression.replace(old_kanji, new_kanji)
            for expression in expressions
            if old_kanji in expression
        ]
        for candidate in candidates:
            if candidate not in expressions:
                expressions.append(candidate)
|
|
|
|
|
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
2023-05-01 22:31:28 +00:00
|
|
|
|
def remove_iteration_mark(expressions):
    """Append spellings of each expression with the 々 mark written out.

    For every character (other than a newline) that is directly followed
    by 々, a copy of the expression is made in which that pair is replaced
    by the character doubled. Copies not already present are appended.
    The list is modified in place; nothing is returned.
    """
    iterated_kanji = r"(.)々"
    position = 0
    # Walk by index so that spellings appended below are examined as well;
    # that is how an expression holding several marks ends up with every
    # combination of expanded and unexpanded marks.
    while position < len(expressions):
        expression = expressions[position]
        position += 1
        for char in re.findall(iterated_kanji, expression):
            new_exp = expression.replace(f"{char}々", f"{char}{char}")
            if new_exp in expressions:
                continue
            expressions.append(new_exp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_iteration_mark(expressions):
    """Append spellings of each expression that use the 々 mark.

    A character qualifies when it appears twice in a row and lies outside
    the excluded ranges (``0``-``z``, fullwidth ``0``-``z`` and
    ``ぁ``-``ヿ``). For each such character a copy of the expression is
    made with the doubled pair replaced by the character plus 々. Copies
    not already present are appended. The list is modified in place;
    nothing is returned.
    """
    # NOTE(review): the excluded ranges start at "0", so doubled ASCII
    # punctuation below it (e.g. "!!") also matches -- confirm intended.
    repeat_kanji = r"([^0-z0-zぁ-ヿ])\1"
    position = 0
    # Walk by index so that spellings appended below are examined as well,
    # letting expressions with several doubled characters reach every
    # combination.
    while position < len(expressions):
        expression = expressions[position]
        position += 1
        for char in re.findall(repeat_kanji, expression):
            new_exp = expression.replace(f"{char}{char}", f"{char}々")
            if new_exp in expressions:
                continue
            expressions.append(new_exp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expand_abbreviation(abbreviated_expression):
|
2023-05-07 01:07:07 +00:00
|
|
|
|
"""Return a list of words described by a 省略 notation."""
|
2023-05-01 22:31:28 +00:00
|
|
|
|
groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression)
|
|
|
|
|
expressions = [""]
|
2023-04-08 03:05:36 +00:00
|
|
|
|
for group in groups:
|
2023-05-01 22:31:28 +00:00
|
|
|
|
new_exps = []
|
|
|
|
|
for expression in expressions:
|
|
|
|
|
new_exps.append(expression + group[0])
|
|
|
|
|
expressions = new_exps.copy()
|
2023-04-08 03:05:36 +00:00
|
|
|
|
if group[2] == '':
|
|
|
|
|
continue
|
2023-05-01 22:31:28 +00:00
|
|
|
|
new_exps = []
|
|
|
|
|
for expression in expressions:
|
|
|
|
|
new_exps.append(expression + group[2])
|
|
|
|
|
expressions = new_exps.copy() + expressions.copy()
|
|
|
|
|
return expressions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expand_abbreviation_list(expressions):
    """Return the expansions of every expression, without duplicates.

    Each expression is passed to ``expand_abbreviation`` and the results
    are gathered in the order they are produced; only the first
    occurrence of a repeated spelling is kept. The input is not modified.
    """
    # dict keys keep insertion order, so this removes duplicates in one
    # pass instead of rescanning a growing list for every candidate.
    unique = dict.fromkeys(
        new_exp
        for expression in expressions
        for new_exp in expand_abbreviation(expression)
    )
    return list(unique)
|