gd-tools/src_bac/marisa.cpp
2024-02-04 14:24:04 -04:00

247 lines
7.4 KiB
C++

/*
* gd-tools - a set of programs to enhance goldendict for immersion learning.
* Copyright (C) 2023 Ajatt-Tools
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include "kana_conv.h"
#include "precompiled.h"
#include "util.h"
using namespace std::string_literals;
using namespace ajt::rdricpp;
// Usage message; marisa_split() prints it when gd::help_requested is caught.
static constexpr std::string_view help_text = R"EOF(usage: gd-marisa [OPTIONS]
Split sentence using MARISA and print links to each word.
OPTIONS
--word WORD required word
--sentence SENTENCE required sentence
--path-to-dic optional path to words.dic
EXAMPLES
gd-marisa --word %GDWORD% --sentence %GDSEARCH%
)EOF";
// Stylesheet for the markup produced by lookup_words(): the .gd-marisa container,
// its word links, the highlighted .gd-headword links and the .alternatives grid.
// lookup_words() prints it verbatim after the closing </div> of the container.
static constexpr std::string_view css_style = R"EOF(
<style>
.gd-marisa {
font-size: 2rem;
margin-bottom: 0.05em;
margin-top: -0.2em;
color: #1268c3;
font-weight: normal;
}
.gd-marisa a {
display: inline-block;
font-weight: normal;
color: royalblue;
text-decoration: none;
border-bottom: dashed max(1px, calc(1em / 16)) currentColor;
}
.gd-marisa a.gd-headword {
background-color: #ddeeff;
border-radius: 0.2rem;
font-weight: 500;
}
.gd-marisa > ul {
--size: 1rem;
font-size: var(--size);
padding-inline-start: var(--size);
margin-block: 2px;
}
.gd-marisa .alternatives {
--size: 1rem;
display: grid;
font-size: var(--size);
gap: calc( var(--size) / 4);
max-width: 100%;
margin: 0 auto;
grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
align-content: start;
justify-content: space-around;
text-align: left;
padding: 5px 0px;
}
.gd-marisa .alternatives > ul {
list-style-type: none;
margin: 0;
padding: calc( var(--size) / 4);
background-color: hsl(150deg 30% 60% / 10%);
}
</style>
)EOF";
// Maximum number of bytes of the sentence handed to the trie search from each
// starting position (see the substr() call in lookup_words()).
// NOTE(review): CharByteLen::THREE is presumably the byte length of a three-byte
// UTF-8 character, which would make this roughly 20 such characters — confirm.
static constexpr std::size_t max_forward_search_len_bytes{ CharByteLen::THREE * 20UL };
// Locate the word list in one of the default locations.
//
// Candidates are checked in this order:
//   1. /usr/share/gd-tools/marisa_words.dic
//   2. $HOME/.local/share/gd-tools/marisa_words.dic
// The second candidate is skipped when HOME is unset or empty. std::getenv()
// returns a null pointer for an unset variable, and building a path from a null
// pointer is undefined behavior.
//
// Returns the first candidate that is a regular file.
// Throws gd::runtime_error when no candidate is a regular file. A candidate whose
// status cannot be queried (e.g. permission denied) is treated as missing rather
// than letting std::filesystem::filesystem_error escape.
auto find_dic_file() -> std::filesystem::path
{
    // Built once; the environment is only consulted on the first call.
    static auto const locations = [] {
        std::vector<std::filesystem::path> candidates;
        candidates.emplace_back("/usr/share/gd-tools/marisa_words.dic");
        if (char const* const home = std::getenv("HOME"); home != nullptr and *home != '\0') {
            candidates.push_back(std::filesystem::path(home) / ".local/share/gd-tools/marisa_words.dic");
        }
        return candidates;
    }();

    for (auto const& location: locations) {
        // The error_code overload reports failures through `ec` and returns false,
        // which also covers the "does not exist" case.
        std::error_code ec;
        if (std::filesystem::is_regular_file(location, ec)) {
            return location;
        }
    }
    throw gd::runtime_error("Couldn't find the word list.");
}
// Command-line parameters of gd-marisa, one field per recognized option.
struct marisa_params
{
    // Value passed with --word.
    std::string gd_word{};
    // Value passed with --sentence.
    std::string gd_sentence{};
    // Value passed with --path-to-dic; defaults to the result of find_dic_file().
    // NOTE(review): find_dic_file() throws when no default word list exists. If the
    // struct is default-constructed before assign() runs (not visible here), that
    // happens even when --path-to-dic is supplied — confirm.
    std::string path_to_dic{ find_dic_file() };

    // Store `value` in the field selected by `key`. Keys other than the three
    // options above are ignored.
    auto assign(std::string_view const key, std::string_view const value) -> void
    {
        if (key == "--word") {
            gd_word = value;
            return;
        }
        if (key == "--sentence") {
            gd_sentence = value;
            return;
        }
        if (key == "--path-to-dic") {
            path_to_dic = value;
        }
    }
};
// Ordering predicate: true when `a` has fewer bytes than `b`.
// Strings of equal length compare as equivalent (false in both directions).
auto cmp_len(std::string_view a, std::string_view b) -> bool
{
    auto const len_a = a.size();
    auto const len_b = b.size();
    return len_b > len_a;
}
// A piece of the searched text together with the deinflection results for it.
// Member order matters: find_deinflections_starting_with() builds instances
// positionally with emplace_back(from, to).
struct Deinflected
{
// Non-owning view of the text that was deinflected.
std::string_view from;
// What deinflect() returned for `from`.
std::vector<Deinflection> to;
};
// Run deinflect() on every prefix of `search_str` that ends right after one of the
// characters yielded by enum_unicode_chars().
//
// Prefixes are visited from the longest (the whole string) down to the shortest
// (the first character), and one Deinflected entry is appended per prefix in that
// order. The `from` views in the result point into the buffer behind `search_str`.
auto find_deinflections_starting_with(std::string_view const search_str) -> std::vector<Deinflected>
{
    std::vector<Deinflected> found;
    for (Utf8CharView const last_char: enum_unicode_chars(search_str) | std::views::reverse) {
        // The prefix ends at the last byte of `last_char`.
        auto const prefix_len = last_char.idx + last_char.ch.size();
        std::string_view const prefix{ search_str.substr(0UL, prefix_len) };
        found.emplace_back(prefix, deinflect(prefix));
    }
    return found;
}
// Collect the trie keys that are a common prefix of any deinflection of `search_str`.
//
// Three spellings are tried in order: `search_str` itself, its
// hiragana_to_katakana() form and its katakana_to_hiragana() form. Each spelling
// is expanded with find_deinflections_starting_with(), and every resulting term is
// used as a query for Trie::common_prefix_search().
//
// agent: reused for all queries; its query and key are overwritten.
// trie:  the loaded word list.
// Returns the set of all keys reported by the searches.
auto find_keywords_starting_with(marisa::Agent& agent, marisa::Trie const& trie, std::string const& search_str) -> JpSet
{
    JpSet found{};
    for (std::string const& variant:
         { search_str, hiragana_to_katakana(search_str), katakana_to_hiragana(search_str) }) {
        for (Deinflected const& group: find_deinflections_starting_with(variant)) {
            for (auto const& candidate: group.to) {
                agent.set_query(candidate.term.c_str());
                // Each successful search step exposes one matching key through the agent.
                while (trie.common_prefix_search(agent)) {
                    auto const& key = agent.key();
                    found.emplace(key.ptr(), key.length());
                }
            }
        }
    }
    return found;
}
// Split the sentence into dictionary words and print the result as HTML on stdout.
//
// Steps:
//   1. Normalize the input: half_to_full() on the word, whitespace removed from
//      word and sentence. An empty sentence is replaced by the word; a non-empty
//      one goes through half_to_full() as well.
//   2. Load the trie from params.path_to_dic.
//   3. For every character of the sentence, print a link whose target is the
//      longest key found from that position (or the character itself when nothing
//      was found). Characters covered by the looked-up word get the "gd-headword"
//      class, all others "gd-word".
//   4. Print every non-empty set of keys found per position as a <ul> inside
//      div.alternatives, then close the container and print css_style.
//
// params: taken by value because the word and sentence are modified in place.
// Raises through raise_if() when the dictionary file cannot be opened.
void lookup_words(marisa_params params)
{
    half_to_full(params.gd_word);
    std::erase_if(params.gd_word, is_space);
    std::erase_if(params.gd_sentence, is_space);
    if (params.gd_sentence.empty()) {
        params.gd_sentence = params.gd_word;
    } else {
        half_to_full(params.gd_sentence);
    }

    marisa::Trie trie;
    marisa::Agent agent;
    std::ifstream file{ params.path_to_dic };
    raise_if(not file.good(), fmt::format(R"(Error. The dictionary file "{}" does not exist.)", params.path_to_dic));
    trie.load(params.path_to_dic.c_str());

    fmt::println(R"(<div class="gd-marisa">)");

    // Bytes of the looked-up word that are still to be printed, counted from the
    // current character. Positive while the current character is inside the word.
    std::ptrdiff_t headword_bytes_left{ 0 };
    std::vector<JpSet> alternatives{};
    alternatives.reserve(20);

    // Link longest words starting with each position in sentence.
    for (auto const [idx, uni_char]: enum_unicode_chars(params.gd_sentence)) {
        // Construct the result directly in `alternatives` so the set is not copied.
        // The reference stays valid for the rest of this iteration because nothing
        // else is appended before the next one.
        auto const& headwords = alternatives.emplace_back(find_keywords_starting_with(
            agent,
            trie, //
            params.gd_sentence.substr(idx, max_forward_search_len_bytes)
        ));

        // set bword to the longest found key in the trie.
        std::string const bword{ headwords.empty() ? std::string{ uni_char } : std::ranges::max(headwords, cmp_len) };

        // The looked-up word starts here: highlight the next bword.length() bytes.
        if (params.gd_word == bword) {
            headword_bytes_left = static_cast<std::ptrdiff_t>(bword.length());
        }
        bool const in_headword{ headword_bytes_left > 0 };

        fmt::print(
            R"(<a class="{}" href="bword:{}">{}</a>)",
            (in_headword ? "gd-headword" : "gd-word"),
            bword,
            uni_char
        );

        // Consume the bytes of the character just printed. Subtracting before the
        // check (and using the current character instead of the printed ones) would
        // highlight one character too many whenever the word's first character is
        // longer in UTF-8 than the character that follows the word.
        if (in_headword) {
            headword_bytes_left -= static_cast<std::ptrdiff_t>(uni_char.length());
        }
    }

    // Show available entries for other substrings.
    fmt::println(R"(<div class="alternatives">)");
    // A lambda is used as the predicate instead of &JpSet::size: taking the address
    // of a standard-library member function is not guaranteed to work.
    auto const has_entries = [](JpSet const& words) { return not words.empty(); };
    for (auto const& group: alternatives | std::views::filter(has_entries)) {
        fmt::println("<ul>");
        for (auto const& word: group) {
            fmt::println(
                R"(<li><a class="{}" href="bword:{}">{}</a></li>)",
                (word == params.gd_word ? "gd-headword" : ""),
                word,
                word
            );
        }
        fmt::println("</ul>"); // close ul
    }
    fmt::println("</div>"); // close div.alternatives
    fmt::println("</div>"); // close div.gd-marisa
    fmt::println("{}", css_style);
}
// Entry point of the gd-marisa command.
//
// Builds marisa_params from `args` with fill_args() and passes them to
// lookup_words(). A gd::help_requested exception prints the usage text; a
// gd::runtime_error prints its message. Other exception types are not handled here.
void marisa_split(std::span<std::string_view const> const args)
{
    try {
        lookup_words(fill_args<marisa_params>(args));
    } catch (gd::help_requested const&) {
        fmt::println(help_text);
    } catch (gd::runtime_error const& error) {
        fmt::println("{}", error.what());
    }
}