89 lines
2.6 KiB
Python
89 lines
2.6 KiB
Python
import re
|
||
|
||
# Translation table for str.translate(): katakana code points
# U+30A1..U+30F5 plus the katakana iteration mark U+30FD, each mapped to the
# code point 0x60 (96) below it, i.e. into the hiragana block.
__KATA_TO_HIRA_MAP = {
    codepoint: codepoint - 0x60
    for codepoint in (*range(0x30A1, 0x30F6), 0x30FD)
}
|
||
|
||
|
||
# Translation table for str.translate(): printable ASCII U+0021..U+007E
# (space excluded), each mapped to the code point 0xFEE0 above it, i.e. its
# fullwidth form U+FF01..U+FF5E.
__HALFWIDTH_TO_FULLWIDTH_MAP = {
    codepoint: codepoint + 0xFEE0 for codepoint in range(0x21, 0x7F)
}
|
||
|
||
|
||
def kata_to_hira(text):
    """Return *text* with katakana replaced by the matching hiragana.

    Characters that are not keys of the module's katakana-to-hiragana
    translation table are left unchanged.
    """
    return text.translate(__KATA_TO_HIRA_MAP)
|
||
|
||
|
||
def add_fullwidth(expressions):
    """Append the fullwidth form of every expression, in place.

    Each expression is passed through the module's halfwidth-to-fullwidth
    translation table; the result is appended to *expressions* unless an
    equal string is already in the list.  Nothing is returned.
    """
    # A snapshot is enough: a translated string contains no characters the
    # table still maps, so re-processing appended items would add nothing.
    for current in tuple(expressions):
        widened = current.translate(__HALFWIDTH_TO_FULLWIDTH_MAP)
        if widened in expressions:
            continue
        expressions.append(widened)
|
||
|
||
|
||
def add_variant_kanji(expressions, variant_kanji):
    """Append variant-kanji spellings of the expressions, in place.

    *variant_kanji* maps a substring to its replacement.  For each pair, in
    mapping order, every expression containing the substring gets a copy
    with all occurrences replaced; copies not yet in *expressions* are
    appended.  Expressions added for one pair are visible to later pairs.
    Nothing is returned.
    """
    for original, variant in variant_kanji.items():
        # Collect first, append afterwards, so expressions added for this
        # pair are not themselves scanned for the same pair.
        candidates = [
            entry.replace(original, variant)
            for entry in expressions
            if original in entry
        ]
        for candidate in candidates:
            if candidate in expressions:
                continue
            expressions.append(candidate)
|
||
|
||
|
||
def remove_iteration_mark(expressions):
    """Append spellings with the iteration mark 々 written out, in place.

    For every character directly followed by 々, a copy of the expression
    with that pair replaced by the doubled character is appended to
    *expressions* unless already present.  Nothing is returned.
    """
    marked = re.compile(r"(.)々")
    # The list is deliberately iterated while it grows: appended copies are
    # scanned too, so an expression with several marks ends up with every
    # combination expanded.
    for current in expressions:
        for repeated in marked.findall(current):
            expanded = current.replace(repeated + "々", repeated * 2)
            if expanded in expressions:
                continue
            expressions.append(expanded)
|
||
|
||
|
||
def add_iteration_mark(expressions):
    """Append spellings that use the iteration mark 々, in place.

    For every character that appears twice in a row, a copy of the
    expression with that pair rewritten as the character followed by 々 is
    appended to *expressions* unless already present.  Nothing is returned.

    Characters in the ASCII range "0".."z", their fullwidth counterparts
    (U+FF10..U+FF5A) and kana (ぁ..ヿ) are never rewritten.
    """
    # The class previously listed "0-z" twice, leaving the second range
    # dead; the fullwidth alphanumeric range is spelled with escapes so it
    # cannot be folded back to ASCII by text normalisation.
    repeat_kanji = r"([^0-z\uFF10-\uFF5Aぁ-ヿ])\1"
    # The list is deliberately iterated while it grows: appended copies are
    # scanned too, so several doubled characters yield every combination.
    for expression in expressions:
        for char in re.findall(repeat_kanji, expression):
            new_exp = expression.replace(f"{char}{char}", f"{char}々")
            if new_exp not in expressions:
                expressions.append(new_exp)
|
||
|
||
|
||
def expand_abbreviation(abbreviated_expression):
    """Return a list of words described by a 省略 (abbreviation) notation.

    Every parenthesised segment is optional, so each combination of keeping
    or dropping the segments is produced.  Variants that keep a segment are
    listed before the ones that drop it.  Both ASCII parentheses and the
    fullwidth ones (U+FF08 / U+FF09) are recognised.  A string without
    parentheses is returned as a single-element list.

    eg. "有(り)合(わ)せ" -> [
        "有り合わせ", "有合わせ", "有り合せ", "有合せ"
    ]
    """
    # Each match is (literal text, contents of an optional parenthesised
    # segment).  The parentheses are matched as literal characters inside
    # character classes, so they are not mistaken for regex grouping, and
    # they stay out of both captures.
    groups = re.findall(
        r"([^(\uFF08]*)(?:[(\uFF08]([^(\uFF08]+)[)\uFF09])?",
        abbreviated_expression,
    )
    expressions = [""]
    for literal, optional in groups:
        expressions = [expression + literal for expression in expressions]
        if optional == "":
            continue
        # Fork every partial word: one branch keeps the optional segment,
        # the other drops it.  The "kept" branch goes first.
        with_optional = [expression + optional for expression in expressions]
        expressions = with_optional + expressions
    return expressions
|
||
|
||
|
||
def expand_abbreviation_list(expressions):
    """Expand every expression and return the results without duplicates.

    Each item of *expressions* is passed to ``expand_abbreviation``; the
    expansions are concatenated in order and only the first occurrence of
    each string is kept.  The input list is not modified.
    """
    # dict.fromkeys keeps first-seen order, giving an ordered de-duplication.
    expanded = dict.fromkeys(
        variant
        for abbreviated in expressions
        for variant in expand_abbreviation(abbreviated)
    )
    return list(expanded)
|