From 825411983d86625d41fa03031ec26ee9b07cfc79 Mon Sep 17 00:00:00 2001 From: hashirama Date: Thu, 16 May 2024 21:53:02 -0400 Subject: [PATCH] now hakurei can tokenize any japanese phrase 2/2 --- src/main.cc~ | 109 --------------------------------------------------- 1 file changed, 109 deletions(-) delete mode 100644 src/main.cc~ diff --git a/src/main.cc~ b/src/main.cc~ deleted file mode 100644 index febd5e0..0000000 --- a/src/main.cc~ +++ /dev/null @@ -1,109 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -// as this is in test phase, i recommend testing with the input being "特別協力組合理事" - -struct Entry { - std::string_view decoded_view; - uint64_t id; - Entry(std::string_view decoded_view, uint64_t id) : decoded_view(decoded_view), id(id) {} -}; - -int main(int argc, char* argv[]) { - if (argc < 2) { - std::cerr << "Usage: " << argv[0] << " [--debug] [--dict ]" << std::endl; - return 1; - } - - std::string search_string; - bool debug_mode = false; - std::string dict_file = "dict.bin"; // Default dictionary file path - - // Parse command line arguments - for (int i = 1; i < argc; ++i) { - if (std::strcmp(argv[i], "--debug") == 0) { - debug_mode = true; - } else if (std::strcmp(argv[i], "--dict") == 0 && i + 1 < argc) { - dict_file = argv[i + 1]; - i++; // Skip the next argument as it's the dictionary file path - } else { - search_string = argv[i]; - } - } - - if (search_string.empty()) { - std::cerr << "Search string not provided." << std::endl; - return 1; - } - - std::string raw_output; - const auto trie = xcdat::load(dict_file); - - std::vector substrings; - std::vector results; - while (!search_string.empty()) { - auto itr = trie.make_prefix_iterator(search_string); - - while (itr.next()) { - results.emplace_back(itr.decoded_view(), itr.id()); - } - - size_t min_length = std::numeric_limits::max(); - std::string_view smallest_prefix; - - for (const auto& entry : results) { - if (entry.decoded_view.size() < min_length) { - min_length = entry.decoded_view.size(); - smallest_prefix = entry.decoded_view; - } - } - - if (min_length > 0) { - std::string substring = search_string.substr(0, search_string.find(smallest_prefix)); - if (!substring.empty()) { - substrings.push_back(substring); - } - std::cout << search_string << " - " << smallest_prefix << " = "; - - if (debug_mode) { - raw_output += search_string + " - " + std::string(smallest_prefix) + " = "; - } - - size_t pos = search_string.find(smallest_prefix); - if (pos != std::string::npos) { - search_string.erase(0, pos + smallest_prefix.length()); - } - else break; - - std::cout << search_string << std::endl; - - if (smallest_prefix.length() == 0) - break; - - if (debug_mode) { - raw_output += search_string + '\n'; - } - - while (!search_string.empty() && std::isspace(search_string.front())) { - search_string.erase(0, 1); - } - } else { - break; - } - } - - if (debug_mode) { - std::cout << "Stored raw output:" << std::endl << raw_output << std::endl; - } - - for (const auto& sub : substrings) { - std::cout << sub << std::endl; - } - - return 0; -}