74 lines
3.2 KiB
C++
74 lines
3.2 KiB
C++
/*
 * gd-tools - a set of programs to enhance goldendict for immersion learning.
 * Copyright (C) 2023 Ajatt-Tools
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
||
|
||
#include "kana_conv.h"
|
||
#include "precompiled.h"
|
||
#include "util.h"
|
||
|
||
// Classify a single UTF-8 byte by the pattern of its high bits.
//
// Returns:
//   CharByteLen::SKIP  - `ch` is a continuation byte (10xxxxxx), not the start of a character.
//   CharByteLen::ONE   - `ch` starts a 1-byte ASCII character (0xxxxxxx).
//   CharByteLen::TWO   - `ch` starts a 2-byte sequence (110xxxxx), e.g. Cyrillic.
//   CharByteLen::THREE - `ch` starts a 3-byte sequence (1110xxxx), e.g. CJK.
//   CharByteLen::FOUR  - `ch` starts a 4-byte sequence (11110xxx).
//
// Throws gd::runtime_error when the byte matches none of the patterns above.
//
// See `man charsets` -> Unicode for an explanation of the bit patterns.
auto unicode_char_byte_len(char const& ch) -> CharByteLen
{
  // Work on the raw byte value. Plain `char` may be signed, so masking it
  // directly relies on sign-extension, and formatting it as hex in the error
  // message could depend on the platform and on how fmt treats `char`.
  auto const byte = static_cast<unsigned char>(ch);

  if ((byte & 0xC0U) == 0x80U) {
    // Intermediate (continuation) byte of a multi-byte character.
    return CharByteLen::SKIP;
  }
  if ((byte & 0x80U) == 0x00U) {
    // Start of an ASCII character.
    return CharByteLen::ONE;
  }
  if ((byte & 0xE0U) == 0xC0U) {
    // Start of a 2-byte sequence (Cyrillic, etc.).
    return CharByteLen::TWO;
  }
  if ((byte & 0xF0U) == 0xE0U) {
    // Start of a 3-byte sequence (CJK).
    return CharByteLen::THREE;
  }
  if ((byte & 0xF8U) == 0xF0U) {
    // Start of a 4-byte sequence (other Unicode).
    return CharByteLen::FOUR;
  }

  // None of the lead/continuation patterns matched. Widen to `unsigned` so
  // the byte is always printed as a non-negative hex number.
  throw gd::runtime_error{ fmt::format("Can't recognize byte: '{:x}'.", static_cast<unsigned>(byte)) };
}
|
||
|
||
// Build a conversion table from two parallel strings.
//
// For every (position, character) pair yielded by enum_unicode_chars(from),
// the character becomes a key and the slice of `to` that starts at the same
// position and has the same length becomes its value.
//
// NOTE(review): this presumably expects `from` and `to` to hold the same
// number of characters with identical byte widths, so that positions line
// up - confirm against callers. `to.substr()` throws std::out_of_range when
// the position lies past the end of `to`.
auto create_map(std::string_view from, std::string_view to) -> KanaConvMap
{
  KanaConvMap table{};

  for (auto const [offset, source_char]: enum_unicode_chars(from)) {
    // Slice of `to` that sits at the same position as `source_char` in `from`.
    auto const target_char = to.substr(offset, source_char.length());
    table.emplace(source_char, target_char);
  }

  return table;
}
|
||
|
||
auto half_to_full(std::string& str) -> std::string&
|
||
{
|
||
static KanaConvMap const conv_map = {
|
||
{ "ア", "ア" }, { "イ", "イ" }, { "ウ", "ウ" }, { "エ", "エ" }, { "オ", "オ" }, { "カ", "カ" }, { "キ", "キ" },
|
||
{ "ク", "ク" }, { "ケ", "ケ" }, { "コ", "コ" }, { "サ", "サ" }, { "シ", "シ" }, { "ス", "ス" }, { "セ", "セ" },
|
||
{ "ソ", "ソ" }, { "タ", "タ" }, { "チ", "チ" }, { "ツ", "ツ" }, { "テ", "テ" }, { "ト", "ト" }, { "ナ", "ナ" },
|
||
{ "ニ", "ニ" }, { "ヌ", "ヌ" }, { "ネ", "ネ" }, { "ノ", "ノ" }, { "ハ", "ハ" }, { "ヒ", "ヒ" }, { "フ", "フ" },
|
||
{ "ヘ", "ヘ" }, { "ホ", "ホ" }, { "マ", "マ" }, { "ミ", "ミ" }, { "ム", "ム" }, { "メ", "メ" }, { "モ", "モ" },
|
||
{ "ヤ", "ヤ" }, { "ユ", "ユ" }, { "ヨ", "ヨ" }, { "ラ", "ラ" }, { "リ", "リ" }, { "ル", "ル" }, { "レ", "レ" },
|
||
{ "ロ", "ロ" }, { "ワ", "ワ" }, { "ヲ", "ヲ" }, { "ン", "ン" }, { "ァ", "ァ" }, { "ィ", "ィ" }, { "ゥ", "ゥ" },
|
||
{ "ェ", "ェ" }, { "ォ", "ォ" }, { "ッ", "ッ" }, { "ャ", "ャ" }, { "ュ", "ュ" }, { "ョ", "ョ" }, { "。", "。" },
|
||
{ "、", "、" }, { "・", "・" }, { "゛", "゙" }, { "゜", "゚" }, { "「", "「" }, { "」", "」" }, { "ー", "ー" }
|
||
};
|
||
for (auto const [idx, uni_char]: enum_unicode_chars(str)) {
|
||
if (conv_map.contains(uni_char)) {
|
||
str.replace(idx, uni_char.length(), conv_map.at(uni_char));
|
||
}
|
||
}
|
||
return str;
|
||
}
|