Create extra forms for expressions with rare kanji variants

This commit is contained in:
stephenmk 2023-04-23 11:46:27 -05:00
parent fbaba0f499
commit 25fa5d107a
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
4 changed files with 78 additions and 0 deletions

View file

@ -1,6 +1,7 @@
import os
import sys
import json
import csv
from pathlib import Path
from platformdirs import user_config_dir
@ -33,6 +34,15 @@ def yomichan_metadata():
return data
def variant_kanji():
    """Load the kanji variant table from ``variant_kanji.csv``.

    Returns:
        A dict mapping the first column of each CSV row to its second
        column.
    """
    def store_pair(table, fields):
        # Column 0 is the key, column 1 the value stored under it.
        table[fields[0]] = fields[1]

    mapping = {}
    __load_csv("variant_kanji.csv", store_pair, mapping)
    return mapping
def __default_config():
file_name = "default_config.json"
data = __load_json(file_name)
@ -47,3 +57,15 @@ def __load_json(file_name):
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
return data
def __load_csv(file_name, loader, data, delim=',', quote='"'):
    """Feed every row of a CSV file under ``data/`` to a loader callback.

    Args:
        file_name: Name of the CSV file inside the "data" directory
            (resolved relative to the current working directory).
        loader: Callable invoked as ``loader(data, row)`` for each parsed
            row, where ``row`` is the list of string fields.
        data: Accumulator handed to ``loader`` on every call.
        delim: Field delimiter passed to ``csv.reader``.
        quote: Quote character passed to ``csv.reader``.

    Returns:
        The same ``data`` object that was passed in.

    Prints a message and exits the process with status 1 when the file
    does not exist.
    """
    file_path = os.path.join("data", file_name)
    if not Path(file_path).is_file():
        print(f"Missing data file: {file_path}")
        sys.exit(1)
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, universal-newline translation
    # rewrites line breaks embedded inside quoted fields.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f, delimiter=delim, quotechar=quote):
            loader(data, row)
    return data

View file

@ -2,11 +2,16 @@ import re
from datetime import datetime, date
from bs4 import BeautifulSoup
import bot.data as Data
import bot.expressions as Expressions
class JitenonEntry:
_VARIANT_KANJI = None
def __init__(self, entry_id):
if self._VARIANT_KANJI is None:
self._VARIANT_KANJI = Data.variant_kanji()
self.entry_id = entry_id
self.markup = ""
self.modified_date = date(1970, 1, 1)
@ -34,6 +39,7 @@ class JitenonEntry:
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_first_expression(self):
@ -148,6 +154,10 @@ class JitenonYojiEntry(JitenonEntry):
def __init__(self, sequence):
    """Create the entry by forwarding ``sequence`` to the parent initializer."""
    super().__init__(sequence)
def _set_variant_headwords(self):
    """Extend every headword expression list with kanji-variant spellings.

    Each value of ``self._headwords`` is passed to
    ``Expressions.add_variant_kanji`` together with the variant table held
    in ``self._VARIANT_KANJI``; the lists are modified in place.
    """
    variant_table = self._VARIANT_KANJI
    for expression_list in self._headwords.values():
        Expressions.add_variant_kanji(expression_list, variant_table)
class JitenonKotowazaEntry(JitenonEntry):
COLUMNS = {
@ -170,3 +180,8 @@ class JitenonKotowazaEntry(JitenonEntry):
}
else:
super()._set_headwords()
def _set_variant_headwords(self):
    """Extend every headword expression list with variant spellings.

    For each value of ``self._headwords`` this first adds kanji-variant
    forms (using the table in ``self._VARIANT_KANJI``) and then adds
    fullwidth forms; the lists are modified in place.
    """
    variant_table = self._VARIANT_KANJI
    for expression_list in self._headwords.values():
        # Order matters: fullwidth forms are derived after the kanji
        # variants have been appended, so variants get them as well.
        Expressions.add_variant_kanji(expression_list, variant_table)
        Expressions.add_fullwidth(expression_list)

View file

@ -1,5 +1,27 @@
import re
# Translation table shifting each code point in 0x21-0x7E up by 0xFEE0,
# which lands on the corresponding fullwidth form (e.g. "A" -> "Ａ").
__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}


def add_fullwidth(expressions):
    """Append fullwidth spellings of expressions containing ASCII alphanumerics.

    Any expression that contains at least one character in ``[A-Za-z0-9]``
    is translated through ``__WIDE_MAP`` and the result is appended to
    ``expressions`` unless it is already present.

    Args:
        expressions: List of expression strings; modified in place.
    """
    # Iterate over a snapshot so the appends below never feed back into
    # the loop that is producing them.
    for expression in list(expressions):
        # re.search rather than re.match: the alphanumeric character may
        # appear anywhere in the expression, not only at the start.
        if re.search(r"[A-Za-z0-9]", expression):
            new_exp = expression.translate(__WIDE_MAP)
            if new_exp not in expressions:
                expressions.append(new_exp)
def add_variant_kanji(expressions, variant_kanji):
    """Append spellings produced by swapping kanji for their variants.

    The mappings are applied one after another, and forms appended for an
    earlier mapping are themselves candidates for later mappings, so
    combinations of several substitutions are generated too.

    Args:
        expressions: List of expression strings; modified in place.
        variant_kanji: Dict mapping a character to the character that
            replaces it.
    """
    for source_char, target_char in variant_kanji.items():
        # Build all candidates first so the list is not grown while it is
        # being scanned for this mapping.
        candidates = [
            text.replace(source_char, target_char)
            for text in expressions
            if source_char in text
        ]
        for candidate in candidates:
            if candidate in expressions:
                continue
            expressions.append(candidate)
def expand_shouryaku(shouryaku):
"""Return a list of words described by a 省略 notation.

19
data/variant_kanji.csv Normal file
View file

@ -0,0 +1,19 @@
儘,侭
凜,凛
剝,剥
吞,呑
啞,唖
噓,嘘
嚙,噛
塡,填
姸,妍
搔,掻
摑,掴
潑,溌
竜,龍
簞,箪
籠,篭
藪,薮
蠟,蝋
醬,醤
鶯,鴬
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19