jitenbot/bot/data.py

import os
import sys
import json
import csv
from functools import cache
from pathlib import Path

from platformdirs import user_config_dir


@cache
def get_adobe_glyph(code):
    """Return the character registered for the glyph code ``code``.

    The override table is consulted first; when it has no entry for
    ``code`` the sequences table is used instead. Both tables are always
    loaded (each loader is itself cached). Results are memoized per code.

    Raises:
        Exception: the sequences table lists more than one character for
            ``code`` and no override exists.
        KeyError: ``code`` appears in neither table.
    """
    sequence_table = __load_adobe_glyphs()
    overrides = __load_override_adobe_glyphs()
    if code not in overrides:
        # Only an unambiguous (single-character) mapping may be returned.
        if len(sequence_table[code]) > 1:
            raise Exception(f"Multiple glyphs available for code {code}")
        return sequence_table[code][0]
    return overrides[code]


@cache
def load_config():
    """Load the jitenbot configuration, creating the file on first use.

    The configuration is read from ``config.json`` inside the directory
    returned by ``user_config_dir("jitenbot")``. When that file does not
    exist, the default configuration from ``__load_default_config()`` is
    written to that location and returned.

    Returns:
        The parsed JSON configuration. The result is cached, so every
        caller receives the same object.

    Raises:
        OSError: the configuration directory or file cannot be created
            or read.
    """
    config_dir = user_config_dir("jitenbot")
    # exist_ok=True removes the check-then-create race of a separate
    # is_dir() test: a directory created by another process in between
    # no longer makes makedirs() fail.
    os.makedirs(config_dir, exist_ok=True)
    config_file = os.path.join(config_dir, "config.json")
    if Path(config_file).is_file():
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
    else:
        config = __load_default_config()
        # Persist the defaults so the next run reads them from the file.
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=4)
    return config


@cache
def load_yomichan_inflection_categories():
    """Return the parsed contents of ``yomichan/inflection_categories.json``.

    The file is read through ``__load_json`` and the parsed value is
    memoized after the first call.
    """
    categories_path = os.path.join("yomichan", "inflection_categories.json")
    return __load_json(categories_path)


@cache
def load_yomichan_metadata():
    """Return the parsed contents of ``yomichan/index.json``.

    The file is read through ``__load_json`` and the parsed value is
    memoized after the first call.
    """
    index_path = os.path.join("yomichan", "index.json")
    return __load_json(index_path)


@cache
def load_variant_kanji():
    """Build a dict from the rows of ``entries/variant_kanji.csv``.

    Each row contributes one item whose key is the first column and
    whose value is the second column; a later row with the same key
    replaces the earlier one. The result is memoized.
    """
    def store_pair(table, columns):
        # Column 0 is the lookup key, column 1 the mapped value.
        table[columns[0]] = columns[1]

    variants = {}
    csv_path = os.path.join("entries", "variant_kanji.csv")
    __load_csv(csv_path, store_pair, variants)
    return variants


@cache
def load_phrase_readings(target):
    """Build a dict of readings from ``entries/<target.value>/phrase_readings.csv``.

    Args:
        target: object whose ``.value`` attribute names the subdirectory
            to read from (presumably an enum member -- confirm against
            callers). It is also the cache key, so it must be hashable.

    Returns:
        A dict keyed by ``(int(column 0), int(column 1))`` whose values
        are the third column of each row.
    """
    def store_reading(table, columns):
        # The first two columns are parsed before the reading is touched.
        key = (int(columns[0]), int(columns[1]))
        table[key] = columns[2]

    readings = {}
    csv_path = os.path.join("entries", target.value, "phrase_readings.csv")
    __load_csv(csv_path, store_reading, readings)
    return readings


@cache
def load_daijirin2_kana_abbreviations():
    """Build a dict from ``entries/daijirin2/kana_abbreviations.csv``.

    Returns:
        A dict keyed by ``(int(column 0), int(column 1))``. Each value is
        the list of remaining columns of that row, excluding columns that
        are empty or whitespace-only; kept entries are stored unstripped.
    """
    def store_abbreviations(table, columns):
        key = (int(columns[0]), int(columns[1]))
        # Blank cells are dropped; the kept text is stored as written.
        table[key] = [cell for cell in columns[2:] if cell.strip() != ""]

    abbreviations_by_entry = {}
    csv_path = os.path.join("entries", "daijirin2", "kana_abbreviations.csv")
    __load_csv(csv_path, store_abbreviations, abbreviations_by_entry)
    return abbreviations_by_entry


@cache
def load_yomichan_name_conversion(target):
    """Return the parsed ``yomichan/name_conversion/<target.value>.json``.

    Args:
        target: object whose ``.value`` attribute supplies the file stem
            (presumably an enum member -- confirm against callers). It is
            also the cache key, so it must be hashable.
    """
    conversion_path = os.path.join(
        "yomichan",
        "name_conversion",
        f"{target.value}.json",
    )
    return __load_json(conversion_path)


@cache
def load_yomichan_term_schema():
    """Return the parsed ``yomichan/dictionary-term-bank-v3-schema.json``.

    The file is read through ``__load_json`` and the parsed value is
    memoized after the first call.
    """
    schema_path = os.path.join(
        "yomichan",
        "dictionary-term-bank-v3-schema.json",
    )
    return __load_json(schema_path)


@cache
def load_mdict_name_conversion(target):
    """Return the parsed ``mdict/name_conversion/<target.value>.json``.

    Args:
        target: object whose ``.value`` attribute supplies the file stem
            (presumably an enum member -- confirm against callers). It is
            also the cache key, so it must be hashable.
    """
    conversion_path = os.path.join(
        "mdict",
        "name_conversion",
        f"{target.value}.json",
    )
    return __load_json(conversion_path)


@cache
def __load_default_config():
    """Return the parsed contents of ``default_config.json``.

    The file is read through ``__load_json``; the parsed value is
    memoized after the first call.
    """
    return __load_json("default_config.json")


@cache
def __load_adobe_glyphs():
    """Parse ``entries/adobe/Adobe-Japan1_sequences.txt`` into a lookup table.

    The file is read as ``;``-delimited rows. Rows whose first field
    starts with ``#`` are ignored. For every other row the first
    space-separated token of field 0 is read as a hexadecimal code point,
    and field 2 (after removing a leading ``" CID+"``) as the integer code.

    Returns:
        A dict mapping each integer code to the list of distinct
        characters seen for it, in order of first appearance.
    """
    def collect(table, fields):
        if fields[0].startswith("#"):
            return
        codepoint_hex = fields[0].split(" ")[0]
        glyph = chr(int(codepoint_hex, 16))
        cid = int(fields[2].removeprefix(" CID+"))
        # One list per code; a character is recorded only once.
        known = table.setdefault(cid, [])
        if glyph not in known:
            known.append(glyph)

    glyphs_by_code = {}
    sequences_path = os.path.join(
        "entries", "adobe", "Adobe-Japan1_sequences.txt")
    __load_csv(sequences_path, collect, glyphs_by_code, delim=';')
    return glyphs_by_code


@cache
def __load_override_adobe_glyphs():
    """Return ``entries/adobe/override_glyphs.json`` with integer keys.

    JSON object keys are always strings, so each key is converted with
    ``int()``; the values are passed through unchanged.
    """
    overrides_path = os.path.join("entries", "adobe", "override_glyphs.json")
    raw_overrides = __load_json(overrides_path)
    return {int(code): glyph for code, glyph in raw_overrides.items()}


def __load_json(file_name):
    """Parse a UTF-8 JSON file located under the ``data`` directory.

    Args:
        file_name: path of the file relative to ``data``; ``data`` itself
            is resolved against the current working directory.

    Returns:
        The decoded JSON value.

    If the file does not exist, a message is printed and the process
    exits with status 1 (``SystemExit``).
    """
    file_path = os.path.join("data", file_name)
    if Path(file_path).is_file():
        with open(file_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    # Missing data files are treated as fatal.
    print(f"Missing data file: {file_path}")
    sys.exit(1)


def __load_csv(file_name, loader, data, delim=',', quote='"'):
    """Feed every row of a UTF-8 CSV file under ``data`` to ``loader``.

    Args:
        file_name: path of the file relative to ``data``; ``data`` itself
            is resolved against the current working directory.
        loader: callable invoked as ``loader(data, row)`` for each
            non-empty row, where ``row`` is the list of string fields.
        data: container that ``loader`` fills in; it is passed through
            untouched by this function.
        delim: field delimiter handed to ``csv.reader``.
        quote: quote character handed to ``csv.reader``.

    Returns:
        The same ``data`` object that was passed in.

    If the file does not exist, a message is printed and the process
    exits with status 1 (``SystemExit``).
    """
    file_path = os.path.join("data", file_name)
    if not Path(file_path).is_file():
        print(f"Missing data file: {file_path}")
        sys.exit(1)
    # newline="" is what the csv module requires of file objects so that
    # it can interpret line endings (including those inside quoted
    # fields) itself.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=delim, quotechar=quote)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; loaders index
                # row[0] and would fail on it, so skip such lines.
                continue
            loader(data, row)
    return data
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`import os`
			`import sys`
			`import json`
Create extra forms for expressions with rare kanji variants 2023-04-23 16:46:27 +00:00			`import csv`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`from functools import cache`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`from pathlib import Path`

			`from platformdirs import user_config_dir`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
			`def get_adobe_glyph(code):`
			`adobe_glyphs = __load_adobe_glyphs()`
			`override_adobe_glyphs = __load_override_adobe_glyphs()`
			`if code in override_adobe_glyphs:`
			`return override_adobe_glyphs[code]`
			`if len(adobe_glyphs[code]) > 1:`
			`raise Exception(f"Multiple glyphs available for code {code}")`
			`return adobe_glyphs[code][0]`


			`@cache`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`def load_config():`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`config_dir = user_config_dir("jitenbot")`
			`if not Path(config_dir).is_dir():`
			`os.makedirs(config_dir)`
			`config_file = os.path.join(config_dir, "config.json")`
			`if Path(config_file).is_file():`
Specify UTF-8 encoding with file i/o 2023-05-02 01:03:03 +00:00			`with open(config_file, "r", encoding="utf-8") as f:`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`config = json.load(f)`
			`else:`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`config = __load_default_config()`
Specify UTF-8 encoding with file i/o 2023-05-02 01:03:03 +00:00			`with open(config_file, "w", encoding="utf-8") as f:`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`json.dump(config, f, indent=4)`
			`return config`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`def load_yomichan_inflection_categories():`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"yomichan", "inflection_categories.json")`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`data = __load_json(file_name)`
			`return data`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`def load_yomichan_metadata():`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"yomichan", "index.json")`
Move Yomichan index and tag metadata to data file 2023-04-22 19:14:28 +00:00			`data = __load_json(file_name)`
			`return data`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`def load_variant_kanji():`
Create extra forms for expressions with rare kanji variants 2023-04-23 16:46:27 +00:00			`def loader(data, row):`
			`data[row[0]] = row[1]`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"entries", "variant_kanji.csv")`
Create extra forms for expressions with rare kanji variants 2023-04-23 16:46:27 +00:00			`data = {}`
			`__load_csv(file_name, loader, data)`
			`return data`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`def load_phrase_readings(target):`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`def loader(data, row):`
			`entry_id = (int(row[0]), int(row[1]))`
			`reading = row[2]`
			`data[entry_id] = reading`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"entries", target.value, "phrase_readings.csv")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`data = {}`
			`__load_csv(file_name, loader, data)`
			`return data`


			`@cache`
			`def load_daijirin2_kana_abbreviations():`
			`def loader(data, row):`
			`entry_id = (int(row[0]), int(row[1]))`
			`abbreviations = []`
			`for abbr in row[2:]:`
			`if abbr.strip() != "":`
			`abbreviations.append(abbr)`
			`data[entry_id] = abbreviations`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"entries", "daijirin2", "kana_abbreviations.csv")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`data = {}`
			`__load_csv(file_name, loader, data)`
			`return data`


			`@cache`
Add export support for the MDict dictionary format 2023-07-08 21:49:03 +00:00			`def load_yomichan_name_conversion(target):`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"yomichan", "name_conversion", f"{target.value}.json")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`data = __load_json(file_name)`
			`return data`


Add support for sankoku8 2023-07-18 05:43:38 +00:00			`@cache`
			`def load_yomichan_term_schema():`
			`file_name = os.path.join(`
			`"yomichan", "dictionary-term-bank-v3-schema.json")`
			`schema = __load_json(file_name)`
			`return schema`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
Add export support for the MDict dictionary format 2023-07-08 21:49:03 +00:00			`def load_mdict_name_conversion(target):`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"mdict", "name_conversion", f"{target.value}.json")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`data = __load_json(file_name)`
			`return data`


			`@cache`
Import functions explicitly 2023-04-23 17:33:42 +00:00			`def __load_default_config():`
Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`file_name = "default_config.json"`
			`data = __load_json(file_name)`
			`return data`


Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`@cache`
			`def __load_adobe_glyphs():`
			`def loader(data, row):`
			`if row[0].startswith("#"):`
			`return`
			`character = chr(int(row[0].split(" ")[0], 16))`
			`code = int(row[2].removeprefix(" CID+"))`
			`if code in data:`
			`if character not in data[code]:`
			`data[code].append(character)`
			`else:`
			`data[code] = [character]`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"entries", "adobe", "Adobe-Japan1_sequences.txt")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`data = {}`
			`__load_csv(file_name, loader, data, delim=';')`
			`return data`


			`@cache`
			`def __load_override_adobe_glyphs():`
Add support for sankoku8 2023-07-18 05:43:38 +00:00			`file_name = os.path.join(`
			`"entries", "adobe", "override_glyphs.json")`
Add support for Shinmeikai 8th edition & Daijirin 4th edition 2023-05-01 22:31:28 +00:00			`json_data = __load_json(file_name)`
			`data = {}`
			`for key, val in json_data.items():`
			`data[int(key)] = val`
			`return data`


Use standard platform directories for cache, config, and output data 2023-04-22 18:37:34 +00:00			`def __load_json(file_name):`
			`file_path = os.path.join("data", file_name)`
			`if not Path(file_path).is_file():`
			`print(f"Missing data file: {file_path}")`
			`sys.exit(1)`
			`with open(file_path, "r", encoding="utf-8") as f:`
			`data = json.load(f)`
			`return data`
Create extra forms for expressions with rare kanji variants 2023-04-23 16:46:27 +00:00

			`def __load_csv(file_name, loader, data, delim=',', quote='"'):`
			`file_path = os.path.join("data", file_name)`
			`if not Path(file_path).is_file():`
			`print(f"Missing data file: {file_path}")`
			`sys.exit(1)`
			`with open(file_path, "r", encoding="utf-8") as f:`
			`reader = csv.reader(f, delimiter=delim, quotechar=quote)`
			`for row in reader:`
			`loader(data, row)`
			`return data`