/*
 *  gd-tools - a set of programs to enhance goldendict for immersion learning.
 *  Copyright (C) 2023 Ajatt-Tools
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include "kana_conv.h"
#include "precompiled.h"
#include "util.h"

/**
 * Classify one byte of a UTF-8 encoded string.
 *
 * @param ch  The byte to inspect.
 * @return    CharByteLen::SKIP for a continuation byte (10xxxxxx), otherwise
 *            ONE / TWO / THREE / FOUR according to the lead-byte pattern.
 * @throws gd::runtime_error  If the byte matches none of the patterns
 *                            (i.e. 0xF8..0xFF).
 */
auto unicode_char_byte_len(char const& ch) -> CharByteLen
{
  // `char` may be signed. Classify and report the raw byte value so that the
  // error message shows e.g. 'f8' rather than a sign-extended negative number.
  auto const byte = static_cast<unsigned char>(ch);

  // See `man charsets` -> Unicode for explanation of below
  if ((byte & 0xC0) == 0x80) {
    // intermediate unicode char
    return CharByteLen::SKIP;
  }
  if ((byte & 0x80) == 0x00) {
    // Start of ASCII character
    return CharByteLen::ONE;
  }
  if ((byte & 0xE0) == 0xC0) {
    // Cyrillic, etc.
    return CharByteLen::TWO;
  }
  if ((byte & 0xF0) == 0xE0) {
    // Start of 3 byte sequence (CJK)
    return CharByteLen::THREE;
  }
  if ((byte & 0xF8) == 0xF0) {
    // Other Unicode
    return CharByteLen::FOUR;
  }
  throw gd::runtime_error{ fmt::format("Can't recognize byte: '{:x}'.", static_cast<unsigned>(byte)) };
}

/**
 * Build a conversion table from two parallel strings.
 *
 * Every character enumerated in `from` becomes a key; its value is the slice
 * of `to` that starts at the same offset and has the same byte length.
 * The two strings are therefore expected to line up byte-for-byte.
 *
 * NOTE(review): enum_unicode_chars is declared elsewhere; from its use here it
 * presumably yields (byte offset, character view) pairs -- confirm.
 * NOTE(review): if KanaConvMap stores string views, the returned table borrows
 * from the storage behind `from` and `to` -- confirm against kana_conv.h.
 *
 * @param from  Characters to convert from.
 * @param to    Characters to convert to, aligned with `from`.
 * @return      The populated table.
 */
auto create_map(std::string_view from, std::string_view to) -> KanaConvMap
{
  KanaConvMap result{};
  for (auto const [idx, uni_char]: enum_unicode_chars(from)) {
    result.emplace(uni_char, to.substr(idx, uni_char.length()));
  }
  return result;
}

/**
 * Replace half-width katakana and punctuation in `str` with their full-width
 * counterparts, in place.
 *
 * Voiced / semi-voiced marks are mapped to the stand-alone full-width marks;
 * they are not merged with the preceding kana.
 *
 * @param str  UTF-8 text to convert; modified in place.
 * @return     Reference to `str`.
 */
auto half_to_full(std::string& str) -> std::string&
{
  // Keys are half-width forms (U+FF61..U+FF9F). They are spelled as universal
  // character names so that width-normalizing tools cannot silently fold them
  // into their full-width values, which would turn this table into a no-op.
  // Every key and every value is 3 bytes long in UTF-8.
  static KanaConvMap const conv_map = {
    { "\uFF71", "ア" }, { "\uFF72", "イ" }, { "\uFF73", "ウ" }, { "\uFF74", "エ" }, { "\uFF75", "オ" },
    { "\uFF76", "カ" }, { "\uFF77", "キ" }, { "\uFF78", "ク" }, { "\uFF79", "ケ" }, { "\uFF7A", "コ" },
    { "\uFF7B", "サ" }, { "\uFF7C", "シ" }, { "\uFF7D", "ス" }, { "\uFF7E", "セ" }, { "\uFF7F", "ソ" },
    { "\uFF80", "タ" }, { "\uFF81", "チ" }, { "\uFF82", "ツ" }, { "\uFF83", "テ" }, { "\uFF84", "ト" },
    { "\uFF85", "ナ" }, { "\uFF86", "ニ" }, { "\uFF87", "ヌ" }, { "\uFF88", "ネ" }, { "\uFF89", "ノ" },
    { "\uFF8A", "ハ" }, { "\uFF8B", "ヒ" }, { "\uFF8C", "フ" }, { "\uFF8D", "ヘ" }, { "\uFF8E", "ホ" },
    { "\uFF8F", "マ" }, { "\uFF90", "ミ" }, { "\uFF91", "ム" }, { "\uFF92", "メ" }, { "\uFF93", "モ" },
    { "\uFF94", "ヤ" }, { "\uFF95", "ユ" }, { "\uFF96", "ヨ" },
    { "\uFF97", "ラ" }, { "\uFF98", "リ" }, { "\uFF99", "ル" }, { "\uFF9A", "レ" }, { "\uFF9B", "ロ" },
    { "\uFF9C", "ワ" }, { "\uFF66", "ヲ" }, { "\uFF9D", "ン" },
    { "\uFF67", "ァ" }, { "\uFF68", "ィ" }, { "\uFF69", "ゥ" }, { "\uFF6A", "ェ" }, { "\uFF6B", "ォ" },
    { "\uFF6F", "ッ" }, { "\uFF6C", "ャ" }, { "\uFF6D", "ュ" }, { "\uFF6E", "ョ" },
    { "\uFF61", "。" }, { "\uFF64", "、" }, { "\uFF65", "・" },
    // Half-width voiced / semi-voiced sound marks -> stand-alone full-width marks.
    { "\uFF9E", "\u309B" }, { "\uFF9F", "\u309C" },
    { "\uFF62", "「" }, { "\uFF63", "」" }, { "\uFF70", "ー" },
  };

  // NOTE(review): `str` is modified while it is being enumerated. Each
  // replacement has the same byte length as the text it replaces, so offsets
  // of the characters that follow do not shift; confirm that
  // enum_unicode_chars tolerates same-size in-place edits.
  for (auto const [idx, uni_char]: enum_unicode_chars(str)) {
    // Single lookup instead of contains() followed by at().
    if (auto const found = conv_map.find(uni_char); found != conv_map.end()) {
      str.replace(idx, uni_char.length(), found->second);
    }
  }
  return str;
}