diff --git a/bot/data.py b/bot/data.py
index 1b7eeb1..9415eaa 100644
--- a/bot/data.py
+++ b/bot/data.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import json
+import csv
 from pathlib import Path
 
 from platformdirs import user_config_dir
@@ -33,6 +34,22 @@ def yomichan_metadata():
     return data
 
 
+def variant_kanji():
+    """Return a dict mapping each kanji to its variant form.
+
+    Pairs are read from data/variant_kanji.csv: column one is the key
+    and column two is the value.
+    """
+    def loader(data, row):
+        # csv.reader yields [] for blank lines; skip incomplete rows.
+        if len(row) >= 2:
+            data[row[0]] = row[1]
+    file_name = "variant_kanji.csv"
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
 def __default_config():
     file_name = "default_config.json"
     data = __load_json(file_name)
@@ -47,3 +64,21 @@ def __load_json(file_name):
     with open(file_path, "r", encoding="utf-8") as f:
         data = json.load(f)
     return data
+
+
+def __load_csv(file_name, loader, data, delim=',', quote='"'):
+    """Feed every row of a CSV file under data/ to loader(data, row).
+
+    Prints a message and exits with status 1 if the file is missing.
+    Returns the same data object that was passed in.
+    """
+    file_path = os.path.join("data", file_name)
+    if not Path(file_path).is_file():
+        print(f"Missing data file: {file_path}")
+        sys.exit(1)
+    # The csv module expects files to be opened with newline="".
+    with open(file_path, "r", encoding="utf-8", newline="") as f:
+        reader = csv.reader(f, delimiter=delim, quotechar=quote)
+        for row in reader:
+            loader(data, row)
+    return data
diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py
index 24f2ad2..9ba61ee 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@@ -2,11 +2,19 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
+import bot.data as Data
 import bot.expressions as Expressions
 
 
 class JitenonEntry:
+    # Variant-kanji table, loaded the first time an entry is created.
+    _VARIANT_KANJI = None
+
     def __init__(self, entry_id):
+        if self._VARIANT_KANJI is None:
+            # Assign on the class, not the instance, so the table is
+            # loaded once and shared by every entry.
+            JitenonEntry._VARIANT_KANJI = Data.variant_kanji()
         self.entry_id = entry_id
         self.markup = ""
         self.modified_date = date(1970, 1, 1)
@@ -34,6 +42,7 @@ class JitenonEntry:
         if self._headwords is not None:
             return self._headwords
         self._set_headwords()
+        self._set_variant_headwords()
         return self._headwords
 
     def get_first_expression(self):
@@ -148,6 +157,11 @@ class JitenonYojiEntry(JitenonEntry):
     def __init__(self, sequence):
         super().__init__(sequence)
 
+    def _set_variant_headwords(self):
+        """Add variant-kanji spellings to every headword's expressions."""
+        for expressions in self._headwords.values():
+            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+
 
 class JitenonKotowazaEntry(JitenonEntry):
     COLUMNS = {
@@ -170,3 +184,9 @@ class JitenonKotowazaEntry(JitenonEntry):
             }
         else:
             super()._set_headwords()
+
+    def _set_variant_headwords(self):
+        """Add variant-kanji, then fullwidth, spellings to every headword."""
+        for expressions in self._headwords.values():
+            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+            Expressions.add_fullwidth(expressions)
diff --git a/bot/expressions.py b/bot/expressions.py
index ba39db6..5168082 100644
--- a/bot/expressions.py
+++ b/bot/expressions.py
@@ -1,5 +1,39 @@
 import re
 
+__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}
+
+
+def add_fullwidth(expressions):
+    """Append fullwidth variants of expressions, modifying the list.
+
+    An expression gets a variant when it contains an ASCII letter or
+    digit anywhere; variants already in the list are not added again.
+    """
+    # Iterate over a snapshot: the list is appended to inside the loop.
+    for expression in list(expressions):
+        # re.search, not re.match: the ASCII may not be at the start.
+        if re.search(r"[A-Za-z0-9]", expression):
+            new_exp = expression.translate(__WIDE_MAP)
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
+
+def add_variant_kanji(expressions, variant_kanji):
+    """Append variant-kanji spellings of expressions, modifying the list.
+
+    variant_kanji maps a kanji to its replacement; every expression
+    containing a key gets a copy with that key replaced by the value.
+    """
+    for old_kanji, new_kanji in variant_kanji.items():
+        new_exps = []
+        for expression in expressions:
+            if old_kanji in expression:
+                new_exp = expression.replace(old_kanji, new_kanji)
+                new_exps.append(new_exp)
+        for new_exp in new_exps:
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
 
 def expand_shouryaku(shouryaku):
     """Return a list of words described by a 省略 notation.
diff --git a/data/variant_kanji.csv b/data/variant_kanji.csv
new file mode 100644
index 0000000..adc6bf3
--- /dev/null
+++ b/data/variant_kanji.csv
@@ -0,0 +1,19 @@
+儘,侭
+凜,凛
+剝,剥
+吞,呑
+啞,唖
+噓,嘘
+嚙,噛
+塡,填
+姸,妍
+搔,掻
+摑,掴
+潑,溌
+竜,龍
+簞,箪
+籠,篭
+藪,薮
+蠟,蝋
+醬,醤
+鶯,鴬