Scan the search string one UTF-8 character at a time.

main() no longer strips the shortest matching prefix. Each pass of the outer
loop copies the decoded view of every collected result into `substrings`, then
advances `search_string` by one UTF-8 character and skips leading whitespace.

remove_one_utf8_char() clamps at the end of the string, so input that ends in
the middle of a multi-byte sequence yields "" instead of making substr() throw
std::out_of_range. The whitespace skip converts the byte to unsigned char
before calling std::isspace(), which is required for bytes >= 0x80.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks, indentation and blank context lines are reconstructed from the hunk
counts, and the text that was inside angle brackets (#include targets, template
arguments, the --dict placeholder) appears to be missing from the context and
removed lines. The index line still carries the original blob ids. Regenerate
against the real tree before applying.

diff --git a/src/main.cc b/src/main.cc
index 21209c3..d436b34 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -6,14 +6,43 @@
 #include
 #include
 
-
-
 struct Entry {
     std::string_view decoded_view;
     uint64_t id;
     Entry(std::string_view decoded_view, uint64_t id) : decoded_view(decoded_view), id(id) {}
 };
 
+// Returns a copy of `str` with its first UTF-8 character removed.
+//
+// Bytes that cannot start a character (stray continuation bytes, 0xF8-0xFF) are
+// skipped, so they are dropped together with the character that follows them.
+// If the string ends before the sequence announced by the lead byte is
+// complete, the result is empty instead of an out-of-range substr() call.
+// Returns "" when `str` is empty or holds no lead byte at all.
+std::string remove_one_utf8_char(const std::string& str) {
+    const size_t len = str.size();
+    for (size_t i = 0; i < len; ++i) {
+        const unsigned char c = static_cast<unsigned char>(str[i]);
+        size_t width = 0;
+        if (c < 0x80) {  // 1-byte character
+            width = 1;
+        } else if ((c >> 5) == 0x6) {  // 2-byte character
+            width = 2;
+        } else if ((c >> 4) == 0xe) {  // 3-byte character
+            width = 3;
+        } else if ((c >> 3) == 0x1e) {  // 4-byte character
+            width = 4;
+        }
+        if (width != 0) {
+            // substr(pos) throws std::out_of_range when pos > size(); clamp so
+            // input cut off inside a multi-byte sequence cannot trigger that.
+            const size_t next = i + width;
+            return next < len ? str.substr(next) : std::string();
+        }
+    }
+    return "";
+}
+
 int main(int argc, char* argv[]) {
     if (argc < 2) {
         std::cerr << "Usage: " << argv[0] << " [--debug] [--dict ]" << std::endl;
@@ -46,6 +75,7 @@ int main(int argc, char* argv[]) {
     std::vector substrings;
     std::vector results;
 
+
     while (!search_string.empty()) {
         auto itr = trie.make_prefix_iterator(search_string);
 
@@ -53,54 +83,41 @@ int main(int argc, char* argv[]) {
             results.emplace_back(itr.decoded_view(), itr.id());
         }
 
-        size_t min_length = std::numeric_limits::max();
-        std::string_view smallest_prefix;
-
-        for (const auto& entry : results) {
-            if (entry.decoded_view.size() < min_length) {
-                min_length = entry.decoded_view.size();
-                smallest_prefix = entry.decoded_view;
+        // Copy the decoded view of every collected result into substrings
+        if (!results.empty()) {
+            for (const auto& entry : results) {
+                substrings.push_back(std::string(entry.decoded_view));
             }
         }
 
-        if (min_length > 0) {
-            std::string substring = search_string.substr(0, search_string.find(smallest_prefix));
-            if (!substring.empty()) {
-                substrings.push_back(substring);
-            }
-            std::cout << search_string << " - " << smallest_prefix << " = ";
+        // Print debug information if in debug mode
+        if (debug_mode) {
+            std::cout << "Original search string: " << search_string << std::endl;
+        }
 
-            if (debug_mode) {
-                raw_output += search_string + " - " + std::string(smallest_prefix) + " = ";
-            }
+        // Remove one UTF-8 character from the search string
+        search_string = remove_one_utf8_char(search_string);
 
-            size_t pos = search_string.find(smallest_prefix);
-            if (pos != std::string::npos) {
-                search_string.erase(0, pos + smallest_prefix.length());
-            }
-            else break;
+        // Print debug information if in debug mode
+        if (debug_mode) {
+            std::cout << "After removing one character: " << search_string << std::endl;
+            raw_output += search_string + '\n';
+        }
 
-            std::cout << search_string << std::endl;
+        results.clear();
 
-            if (smallest_prefix.length() == 0)
-                break;
-
-            if (debug_mode) {
-                raw_output += search_string + '\n';
-            }
-
-            while (!search_string.empty() && std::isspace(search_string.front())) {
-                search_string.erase(0, 1);
-            }
-        } else {
-            break;
+        // Remove leading whitespace (cast: isspace needs an unsigned char value)
+        while (!search_string.empty() && std::isspace(static_cast<unsigned char>(search_string.front()))) {
+            search_string.erase(0, 1);
         }
     }
 
+    // Print debug information if in debug mode
     if (debug_mode) {
         std::cout << "Stored raw output:" << std::endl << raw_output << std::endl;
     }
 
+    // Print the substrings
     for (const auto& sub : substrings) {
         std::cout << sub << std::endl;
     }