now hakurei can tokenize any japanese phrase

This commit is contained in:
千住柱間 2024-05-16 21:52:52 -04:00
parent 2fce3c4a48
commit c402bffc2a
Signed by: hashirama
GPG key ID: 53E62470A86BC185
3 changed files with 110 additions and 1 deletions

BIN
a.out Executable file

Binary file not shown.

View file

@ -6,7 +6,7 @@
#include <cctype>
#include <xcdat.hpp>
// As this is in the test phase, I recommend testing with the input "特別協力組合理事".
struct Entry {
std::string_view decoded_view;

109
src/main.cc~ Normal file
View file

@ -0,0 +1,109 @@
#include <iostream>
#include <vector>
#include <algorithm>
#include <limits>
#include <cstring>
#include <cctype>
#include <xcdat.hpp>
// As this is in the test phase, I recommend testing with the input "特別協力組合理事".
// One dictionary hit: the text of the matched key together with its id.
// `decoded_view` is a non-owning view; this struct does not keep the viewed
// characters alive, so whoever holds an Entry must make sure the underlying
// storage outlives it.
struct Entry {
    std::string_view decoded_view;  // text of the matched key (borrowed)
    uint64_t id;                    // id associated with that key

    // Stores the given view and id as-is; no copy of the characters is made.
    Entry(std::string_view view, uint64_t key_id) : decoded_view{view}, id{key_id} {}
};
// Entry point: repeatedly strips the shortest dictionary key that is a prefix
// of the remaining search string, printing each step.
//
// Command line: <search_string> [--debug] [--dict <path_to_dictionary>]
//   --debug        additionally collect every printed step and dump it at the end
//   --dict <path>  dictionary file to load (defaults to "dict.bin")
// Any other argument is taken as the search string (the last one wins).
//
// Returns 0 on success, 1 on a usage error.
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <search_string> [--debug] [--dict <path_to_dictionary>]" << std::endl;
        return 1;
    }

    std::string search_string;
    bool debug_mode = false;
    std::string dict_file = "dict.bin"; // Default dictionary file path

    // Parse command line arguments
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "--debug") == 0) {
            debug_mode = true;
        } else if (std::strcmp(argv[i], "--dict") == 0) {
            // A trailing "--dict" used to be silently treated as the search
            // string; report the missing path instead.
            if (i + 1 >= argc) {
                std::cerr << "--dict requires a path to a dictionary file." << std::endl;
                return 1;
            }
            dict_file = argv[++i]; // Consume the path that follows the flag
        } else {
            search_string = argv[i];
        }
    }

    if (search_string.empty()) {
        std::cerr << "Search string not provided." << std::endl;
        return 1;
    }

    std::string raw_output;
    const auto trie = xcdat::load<xcdat::trie_15_type>(dict_file);
    std::vector<std::string> substrings;

    while (!search_string.empty()) {
        // Find the shortest dictionary key that prefixes the current string.
        // The key is copied into an owned std::string right away: the view
        // handed out by the iterator must not be used after the iterator
        // advances or after search_string is modified below. Starting fresh
        // on every pass also keeps matches from earlier passes out of the
        // selection.
        std::string smallest_prefix;
        bool found = false;
        auto itr = trie.make_prefix_iterator(search_string);
        while (itr.next()) {
            const std::string_view candidate = itr.decoded_view();
            if (!found || candidate.size() < smallest_prefix.size()) {
                smallest_prefix.assign(candidate.data(), candidate.size());
                found = true;
            }
        }

        // A zero-length key can never make progress; stop without printing.
        if (found && smallest_prefix.empty()) {
            break;
        }

        const std::size_t pos = search_string.find(smallest_prefix);

        // Text preceding the match is remembered for the final listing.
        // NOTE(review): the iterator yields prefixes of search_string, so pos
        // is expected to be 0 and `substrings` normally stays empty — confirm
        // whether the matched tokens were meant to be collected here instead.
        const std::string substring = search_string.substr(0, pos);
        if (!substring.empty()) {
            substrings.push_back(substring);
        }

        std::cout << search_string << " - " << smallest_prefix << " = ";
        if (debug_mode) {
            raw_output += search_string + " - " + smallest_prefix + " = ";
        }

        if (pos == std::string::npos) {
            break;
        }
        search_string.erase(0, pos + smallest_prefix.length());

        std::cout << search_string << std::endl;
        // Recorded before the termination check so the debug log mirrors
        // exactly what was printed.
        if (debug_mode) {
            raw_output += search_string + '\n';
        }

        // No dictionary key matched: nothing was consumed, so stop.
        if (smallest_prefix.empty()) {
            break;
        }

        // Drop leading whitespace. The lambda takes unsigned char because
        // passing a negative char (any UTF-8 lead/continuation byte on
        // signed-char platforms) to std::isspace is undefined behaviour.
        const auto first_non_space = std::find_if(
            search_string.begin(), search_string.end(),
            [](unsigned char c) { return std::isspace(c) == 0; });
        search_string.erase(search_string.begin(), first_non_space);
    }

    if (debug_mode) {
        std::cout << "Stored raw output:" << std::endl << raw_output << std::endl;
    }

    for (const auto& sub : substrings) {
        std::cout << sub << std::endl;
    }
    return 0;
}