Create extra forms for expressions with rare kanji variants

This commit is contained in:
stephenmk 2023-04-23 11:46:27 -05:00
parent fbaba0f499
commit 25fa5d107a
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
4 changed files with 78 additions and 0 deletions

View file

@ -1,6 +1,7 @@
import os import os
import sys import sys
import json import json
import csv
from pathlib import Path from pathlib import Path
from platformdirs import user_config_dir from platformdirs import user_config_dir
@ -33,6 +34,15 @@ def yomichan_metadata():
return data return data
def variant_kanji():
    """Load the kanji variant table from ``variant_kanji.csv``.

    Each CSV row contributes one entry: the first column becomes the key
    and the second column becomes the value.

    Returns:
        dict: mapping of first-column strings to second-column strings.
    """
    def loader(data, row):
        # csv.reader yields an empty list for a blank line; skip those so
        # a stray blank line in the data file does not raise IndexError.
        # Non-empty rows with a missing column still fail loudly.
        if not row:
            return
        data[row[0]] = row[1]

    file_name = "variant_kanji.csv"
    data = {}
    __load_csv(file_name, loader, data)
    return data
def __default_config(): def __default_config():
file_name = "default_config.json" file_name = "default_config.json"
data = __load_json(file_name) data = __load_json(file_name)
@ -47,3 +57,15 @@ def __load_json(file_name):
with open(file_path, "r", encoding="utf-8") as f: with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f) data = json.load(f)
return data return data
def __load_csv(file_name, loader, data, delim=',', quote='"'):
    """Feed every row of a CSV file in the ``data`` directory to ``loader``.

    Args:
        file_name: Name of the CSV file, resolved relative to ``data``.
        loader: Callable invoked as ``loader(data, row)`` for each parsed
            row, where ``row`` is the list of fields.
        data: Container passed through to ``loader`` on every call.
        delim: Field delimiter handed to ``csv.reader``.
        quote: Quote character handed to ``csv.reader``.

    Returns:
        The same ``data`` object that was passed in.

    If the file does not exist, a message is printed and the process
    exits with status 1.
    """
    file_path = os.path.join("data", file_name)
    if not Path(file_path).is_file():
        print(f"Missing data file: {file_path}")
        sys.exit(1)
    # newline="" is required by the csv module so that it performs its own
    # newline handling; otherwise newlines embedded in quoted fields are
    # translated by the text layer before the reader sees them.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=delim, quotechar=quote)
        for row in reader:
            loader(data, row)
    return data

View file

@ -2,11 +2,16 @@ import re
from datetime import datetime, date from datetime import datetime, date
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import bot.data as Data
import bot.expressions as Expressions import bot.expressions as Expressions
class JitenonEntry: class JitenonEntry:
_VARIANT_KANJI = None
def __init__(self, entry_id): def __init__(self, entry_id):
if self._VARIANT_KANJI is None:
self._VARIANT_KANJI = Data.variant_kanji()
self.entry_id = entry_id self.entry_id = entry_id
self.markup = "" self.markup = ""
self.modified_date = date(1970, 1, 1) self.modified_date = date(1970, 1, 1)
@ -34,6 +39,7 @@ class JitenonEntry:
if self._headwords is not None: if self._headwords is not None:
return self._headwords return self._headwords
self._set_headwords() self._set_headwords()
self._set_variant_headwords()
return self._headwords return self._headwords
def get_first_expression(self): def get_first_expression(self):
@ -148,6 +154,10 @@ class JitenonYojiEntry(JitenonEntry):
def __init__(self, sequence): def __init__(self, sequence):
super().__init__(sequence) super().__init__(sequence)
def _set_variant_headwords(self):
    """Extend each headword's expression list with variant-kanji forms."""
    variant_table = self._VARIANT_KANJI
    for reading in self._headwords:
        # The expression list is extended in place by the helper.
        Expressions.add_variant_kanji(self._headwords[reading], variant_table)
class JitenonKotowazaEntry(JitenonEntry): class JitenonKotowazaEntry(JitenonEntry):
COLUMNS = { COLUMNS = {
@ -170,3 +180,8 @@ class JitenonKotowazaEntry(JitenonEntry):
} }
else: else:
super()._set_headwords() super()._set_headwords()
def _set_variant_headwords(self):
    """Extend each headword's expression list with extra written forms.

    Variant-kanji forms are added first, then fullwidth forms, so the
    fullwidth pass also sees the variant-kanji expressions.
    """
    for reading in self._headwords:
        forms = self._headwords[reading]
        # Both helpers extend the list in place.
        Expressions.add_variant_kanji(forms, self._VARIANT_KANJI)
        Expressions.add_fullwidth(forms)

View file

@ -1,5 +1,27 @@
import re import re
# Translation table from printable ASCII (U+0021..U+007E) to the code points
# 0xFEE0 higher, i.e. the fullwidth forms U+FF01..U+FF5E.
__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}


def add_fullwidth(expressions):
    """Append fullwidth forms of expressions containing ASCII alphanumerics.

    For every expression that contains an ASCII letter or digit anywhere
    in the string, all printable ASCII characters are translated to their
    fullwidth counterparts and the result is appended to ``expressions``
    unless it is already present.

    Args:
        expressions: List of expression strings; modified in place.
    """
    # Iterate over a snapshot so that appending to ``expressions`` does not
    # change the sequence being walked.
    for expression in list(expressions):
        # re.search rather than re.match: the ASCII character may occur at
        # any position, not only at the start of the expression.
        if re.search(r"[A-Za-z0-9]", expression):
            new_exp = expression.translate(__WIDE_MAP)
            if new_exp not in expressions:
                expressions.append(new_exp)
def add_variant_kanji(expressions, variant_kanji):
    """Append forms of each expression with kanji swapped for their variants.

    The ``variant_kanji`` mapping is processed one entry at a time. For each
    entry, every expression containing the key gets a copy with the key
    replaced by the value, and copies not already present are appended.
    Because the list grows between entries, expressions containing several
    mapped characters end up with every combination of replacements.

    Args:
        expressions: List of expression strings; modified in place.
        variant_kanji: Mapping of substring to replacement substring.
    """
    for source_char, target_char in variant_kanji.items():
        # Build all candidates first so the list is not appended to while
        # it is being scanned.
        candidates = [
            exp.replace(source_char, target_char)
            for exp in expressions
            if source_char in exp
        ]
        for candidate in candidates:
            if candidate in expressions:
                continue
            expressions.append(candidate)
def expand_shouryaku(shouryaku): def expand_shouryaku(shouryaku):
"""Return a list of words described by a 省略 notation. """Return a list of words described by a 省略 notation.

19
data/variant_kanji.csv Normal file
View file

@ -0,0 +1,19 @@
儘,侭
凜,凛
剝,剥
吞,呑
啞,唖
噓,嘘
嚙,噛
塡,填
姸,妍
搔,掻
摑,掴
潑,溌
竜,龍
簞,箪
籠,篭
藪,薮
蠟,蝋
醬,醤
鶯,鴬
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19