Update README.md

new appearance
fix padding
2024-11-11 02:11:50 +00:00 · 2024-11-10 22:10:35 -04:00 · 2024-11-10 22:05:12 -04:00 · 2024-06-16 02:52:01 +00:00 · 2024-05-31 19:49:16 -04:00 · 2024-05-31 19:46:45 -04:00
5 changed files with 384 additions and 109 deletions
--- a/README.md
+++ b/README.md
@ -1,5 +1,30 @@
-# Hakurei
+# 🌸 Hakurei 🌸

-<img src="https://ajattix.loophole.site/hashirama/Hakurei/raw/branch/main/res/img/hakurei_reimu.png" alt="cover" width="12%" height="12%">
+Welcome to **Hakurei**, the ultimate Japanese tokenizer powered by cutting-edge technology! Whether you're crafting your next-gen anime subtitle processor, building an amazing manga reader, or just need a super-fast tokenizer, Hakurei has got you covered with unparalleled speed and accuracy. 🎌✨

-A lightning fast japanese tokenizer with no dependencies !
+<img src="https://git.ajattix.org/hashirama/Hakurei/raw/branch/main/res/img/hakurei_reimu.png" alt="cover" width="10%" height="10%">
+<img src="https://git.ajattix.org/hashirama/Hakurei/raw/branch/main/misc/2024-11-10_22-10.png"  alt="usage" width="62%" height="62%">
+
+## Table of Contents
+
+- [Features](#features)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Usage](#usage)
+- [Benchmarks](#benchmarks)
+- [Contributing](#contributing)
+- [License](#license)
+
+## Features
+
+- 🚀 **Blazing Fast**: Harness the speed of the gods with our state-of-the-art algorithms.
+- 🎯 **Ultra Accurate**: Slice through Japanese text with ninja-like precision.
+- 🔧 **Easy Integration**: Plug and play with our CLI, perfect for all your projects.
+- 💪 **Super Robust**: Handle massive datasets like a mecha warrior!
+
+## Installation
+
+Installing Hakurei is a breeze:
+
+```bash
+$ g++ src/hakurei.cpp -o hakurei 
--- a/misc/.gitignore
+++ b/misc/.gitignore
--- a/misc/2024-11-10_22-10.png
+++ b/misc/2024-11-10_22-10.png
--- a/src/hakurei.cpp
+++ b/src/hakurei.cpp
@ -0,0 +1,356 @@
+// filename: hakurei.cpp
+// On FreeBSD you may need to set the runtime library path of the binary:
+// $  patchelf --set-rpath /usr/local/lib/gcc13 hakurei
+
+
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <cstring>
+#include <cctype>
+#include <xcdat.hpp>
+#include <utility>
+#include <unistd.h>
+#include <filesystem>
+#include <map>
+#include <set>
+#include <fstream>
+#include <ctime>
+
+
+bool debug_mode = false;
+
+// One dictionary hit: the matched text together with the id that accompanies
+// it. Both members are const, so an Entry cannot be changed after it is built.
+struct Entry {
+  const std::string_view decoded_view;  // matched text (non-owning view)
+  const uint64_t id;                    // identifier stored alongside the match
+
+  Entry(const std::string_view view, const uint64_t entry_id)
+    : decoded_view{view}, id{entry_id} {}
+};
+
+// Splits the first UTF-8 encoded character off the front of `str`.
+//
+// Returns {first_character, remainder}. Bytes that cannot start a character
+// (stray continuation bytes, invalid lead bytes) are skipped before the split
+// and appear in neither half. Returns {"", ""} when `str` is empty or holds
+// no valid lead byte.
+//
+// A multi-byte sequence cut short by the end of the string is returned as-is
+// with an empty remainder. Without the clamp below, `str.substr(i + n)` would
+// be called with an offset past the end and throw std::out_of_range.
+inline std::pair<std::string, std::string> remove_one_utf8_char(const std::string& str) {
+  const size_t len = str.size();
+  for (size_t i = 0; i < len; ++i) {
+    const unsigned char c = str[i];
+    size_t width = 0;
+    if (c < 0x80) {
+      width = 1;  // 0xxxxxxx: single-byte (ASCII)
+    } else if ((c >> 5) == 0x6) {
+      width = 2;  // 110xxxxx
+    } else if ((c >> 4) == 0xe) {
+      width = 3;  // 1110xxxx
+    } else if ((c >> 3) == 0x1e) {
+      width = 4;  // 11110xxx
+    } else {
+      continue;   // not a lead byte: skip it
+    }
+    // Clamp so that a truncated trailing sequence never indexes past the end.
+    const size_t next = (i + width < len) ? i + width : len;
+    return {str.substr(i, next - i), str.substr(next)};
+  }
+  return {"", ""};
+}
+
+
+// Returns `str` without leading and trailing whitespace (space, \t, \n, \r,
+// \f, \v). Yields "" when the input is empty or consists only of whitespace.
+std::string trim(const std::string& str) {
+    const char* const whitespace = " \t\n\r\f\v";
+
+    const size_t first = str.find_first_not_of(whitespace);
+    if (first == std::string::npos) {
+        return ""; // nothing but whitespace (or nothing at all)
+    }
+
+    // A non-whitespace character exists, so this search cannot fail.
+    const size_t last = str.find_last_not_of(whitespace);
+    return str.substr(first, last - first + 1);
+}
+
+
+
+// Obtains the text to tokenize and returns it with surrounding whitespace
+// removed.
+//
+// Source of the text, in priority order:
+//   1. stdin, when it is not a terminal (one line is read);
+//   2. in GoldenDict mode, the argument following "--sentence";
+//   3. otherwise the first argument that is neither "--debug" nor part of a
+//      "--dict <path>" pair.
+//
+// Prints a message to stderr and exits with status 1 when no arguments were
+// given (terminal, non-GoldenDict mode) or when no text was found. The
+// emptiness check runs after trimming so that whitespace-only input is
+// rejected instead of being handed back as an empty string.
+inline std::string get_input(const int argc, char* const argv[], const bool goldendict_mode) {
+  std::string search_string;
+  if (!isatty(fileno(stdin))) {
+    std::getline(std::cin, search_string);
+  } else if (goldendict_mode) {
+    for (int i = 1; i < argc; ++i) {
+      if (std::strcmp(argv[i], "--sentence") == 0 && i + 1 < argc) {
+        search_string = argv[i + 1];
+        break;
+      }
+    }
+  } else {
+    if (argc < 2) {
+      std::cerr << "Usage: " << argv[0] << " <search_string>" << std::endl;
+      exit(1);
+    }
+    for (int i = 1; i < argc; ++i) {
+      if (std::strcmp(argv[i], "--debug") == 0) {
+        continue;  // not part of the search text
+      }
+      if (std::strcmp(argv[i], "--dict") == 0 && i + 1 < argc) {
+        ++i;  // skip the dictionary path as well
+        continue;
+      }
+      search_string = argv[i];
+      break;
+    }
+  }
+
+  search_string = trim(search_string);
+  if (search_string.empty()) {
+    std::cerr << "Search string not provided." << std::endl;
+    exit(1);
+  }
+
+  return search_string;
+}
+
+// Locates the dictionary file "dict.bin".
+//
+// Directories are probed in this order and the first one that holds a regular
+// file named dict.bin wins:
+//   1. /usr/share/hakurei/
+//   2. $HOME/.local/share/hakurei/   (skipped when HOME is not set)
+//   3. the current working directory
+//
+// The directory list is built once, on the first call.
+// Throws std::runtime_error when none of the candidates exists.
+inline std::filesystem::path find_dic_file() {
+  static const std::vector<std::filesystem::path> locations = [] {
+    std::vector<std::filesystem::path> dirs;
+    dirs.push_back(std::filesystem::path("/usr/share/hakurei/"));
+    // std::getenv returns nullptr for an unset variable; constructing a path
+    // from a null pointer is undefined behaviour, so use HOME only if present.
+    if (const char* const home = std::getenv("HOME")) {
+      dirs.push_back(std::filesystem::path(home) / ".local/share/hakurei/");
+    }
+    dirs.push_back(std::filesystem::current_path());
+    return dirs;
+  }();
+  for (const auto& location : locations) {
+    const auto dict_path = location / "dict.bin";
+    if (std::filesystem::exists(dict_path) && std::filesystem::is_regular_file(dict_path)) {
+      return dict_path;
+    }
+  }
+  throw std::runtime_error("Couldn't find the word list.");
+}
+
+// Debug dump to stdout: each collected substring on its own line under the
+// heading "Stored raw output:", followed by `raw_output` under "Raw output:".
+inline void print_debug_info(const std::vector<std::string>& substrings, const std::string& raw_output) {
+  std::ostream& out = std::cout;
+
+  out << "Stored raw output:" << std::endl;
+  for (const std::string& piece : substrings) {
+    out << piece << std::endl;
+  }
+
+  out << "Raw output:" << std::endl
+      << raw_output << std::endl;
+}
+
+
+// Appends a record of this invocation to /tmp/hakurei.log: a timestamp line,
+// every argv entry, and the parsed parameters. Does nothing unless the
+// `debug_mode` argument is true. When the log cannot be opened, a message is
+// written to stderr and the function returns.
+void log_execution(const int argc, char* argv[], const std::string& search_string, const std::string& dict_file, bool debug_mode, bool goldendict_mode) {
+  if (!debug_mode) {
+    return;
+  }
+
+  std::ofstream log_file("/tmp/hakurei.log", std::ios_base::app);
+  if (!log_file) {
+    std::cerr << "Failed to open log file." << std::endl;
+    return;
+  }
+
+  const std::time_t now = std::time(nullptr);
+  // The text from std::ctime already ends in '\n', so none is added here.
+  log_file << "Hakurei executed at " << std::ctime(&now)
+           << "Command-line arguments:\n";
+  for (int arg = 0; arg < argc; ++arg) {
+    log_file << "argv[" << arg << "]: " << argv[arg] << "\n";
+  }
+  log_file << "Parsed parameters:\n"
+           << "search_string: " << search_string << "\n"
+           << "dict_file: " << dict_file << "\n"
+           << "debug_mode: " << debug_mode << "\n"
+           << "goldendict_mode: " << goldendict_mode << "\n";
+  // log_file is flushed and closed by its destructor.
+}
+
+
+// Appends "<timestamp>: <message>" as a single line to /tmp/hakurei.log.
+// No-op unless the global `debug_mode` flag is set. When the log cannot be
+// opened, a message is written to stderr and the function returns.
+void log_internal(const std::string& message) {
+  extern bool debug_mode; // Declare the external debug_mode variable
+
+  if (!debug_mode) {
+    return;
+  }
+
+  std::ofstream log_file("/tmp/hakurei.log", std::ios_base::app);
+  if (!log_file) {
+    std::cerr << "Failed to open log file." << std::endl;
+    return;
+  }
+
+  const std::time_t now = std::time(nullptr);
+  // std::ctime yields text that ends in '\n' and may return nullptr on
+  // failure. Strip the newline so the timestamp and the message share one
+  // line; streaming the raw result splits every entry across two lines.
+  const char* const raw_stamp = std::ctime(&now);
+  std::string stamp = raw_stamp ? raw_stamp : "unknown time";
+  if (!stamp.empty() && stamp.back() == '\n') {
+    stamp.pop_back();
+  }
+  log_file << stamp << ": " << message << std::endl;
+}
+
+// Escapes the characters that are significant in HTML text and in quoted
+// attribute values (& < > " ').
+static std::string escape_html(const std::string& text) {
+  std::string escaped;
+  escaped.reserve(text.size());
+  for (const char ch : text) {
+    switch (ch) {
+      case '&':  escaped += "&amp;";  break;
+      case '<':  escaped += "&lt;";   break;
+      case '>':  escaped += "&gt;";   break;
+      case '"':  escaped += "&quot;"; break;
+      case '\'': escaped += "&#39;";  break;
+      default:   escaped += ch;       break;
+    }
+  }
+  return escaped;
+}
+
+// Returns the <style> element that styles the whole .hakurei block.
+static std::string hakurei_stylesheet() {
+  std::string css;
+  css += "<style>";
+  css += ".hakurei { font-size: 2rem; margin-bottom: 0.05em; margin-top: -0.2em; color: #1268c3; font-weight: normal; }";
+  css += ".hakurei a { display: inline-block; font-weight: normal; color: royalblue; text-decoration: none; border-bottom: dashed max(1px, calc(1em / 16)) currentColor; }";
+  css += ".hakurei a.hakurei-headword { background-color: #ddeeff; border-radius: 0.2rem; font-weight: 500; }";
+  css += ".hakurei > ul { --size: 1rem; font-size: var(--size); padding-inline-start: var(--size); margin-block: 2px; }";
+  css += ".hakurei .alternatives { --size: 1rem; display: grid; font-size: var(--size); gap: calc( var(--size) / 10); max-width: 100%; margin: 0 auto; grid-template-columns: repeat(auto-fill, minmax(60px, 1fr)); align-content: start; justify-content: space-around; text-align: left; padding: 5px 0px; }";
+  css += ".hakurei .alternatives > ul { list-style-type: none; margin: 0; padding: calc( var(--size) / 4); background-color: hsl(0 0% 50% / 0.05); box-shadow: 0 0 4px hsl(0 0% 0% / 0.1); border-radius: 0.2rem; }";
+  css += ".hakurei .alternatives > ul > li { margin-right: 1rem; }";
+  css += ".container { display: flex; flex-wrap: wrap; gap: 1px; }";
+  css += ".segment { display: inline-block; }";
+  css += "</style>";
+  return css;
+}
+
+// Renders `sentence` as an HTML fragment and writes it to stdout.
+//
+// The sentence is consumed from the front. At each step the first entry of
+// `substrings` that is a prefix of the remaining text becomes a "headword"
+// link; if `alternatives_map` has an entry keyed by the remaining text, its
+// members are listed below the headword. When no entry matches, one UTF-8
+// character is emitted as its own link and consumed.
+//
+// Parameters:
+//   substrings        candidate segments, tried in order
+//   alternatives_map  remaining-text -> set of alternative segments
+//   sentence          text to render (read only; not modified)
+//
+// All text is HTML-escaped before being placed in markup or in an href, and
+// the stylesheet is emitted exactly once at the top of the fragment instead
+// of once per matched segment.
+inline void wrap_html_output(
+                             const std::vector<std::string>& substrings,
+                             const std::map<std::string, std::set<std::string>>& alternatives_map,
+                             std::string& sentence
+                             ) {
+  // Log the HTML output call
+  log_internal("<span style=\"color:red;\">HTML output is being called</span>");
+
+  std::string output_html;
+  output_html += "<div class=\"hakurei\">";
+  output_html += hakurei_stylesheet();
+  output_html += "<div class=\"container\">";
+
+  std::string remaining = sentence;
+
+  while (!remaining.empty()) {
+    bool found = false;
+    for (const auto& substring : substrings) {
+      // An empty entry would match without consuming anything and never
+      // terminate, so it is skipped.
+      if (substring.empty() || remaining.rfind(substring, 0) != 0) {
+        continue;
+      }
+
+      const std::string headword = escape_html(substring);
+      output_html += "<div class=\"segment\"><a class=\"hakurei-headword\" href=\"bword:" + headword + "\">" + headword + "</a>";
+
+      const auto alt_itr = alternatives_map.find(remaining);
+      if (alt_itr != alternatives_map.end() && !alt_itr->second.empty()) {
+        output_html += "<div class=\"alternatives\"><ul>";
+        for (const auto& alt : alt_itr->second) {
+          const std::string alt_html = escape_html(alt);
+          output_html += "<li><a href=\"bword:" + alt_html + "\">" + alt_html + "</a></li>";
+        }
+        output_html += "</ul></div>";
+      }
+
+      output_html += "</div>";
+
+      remaining.erase(0, substring.size());
+      found = true;
+      break;
+    }
+    if (!found) {
+      const auto [removed_char, rest] = remove_one_utf8_char(remaining);
+      if (removed_char.empty()) {
+        std::cerr << "Error: Unable to remove a character from the sentence. Exiting to prevent infinite loop." << std::endl;
+        break;
+      }
+      const std::string char_html = escape_html(removed_char);
+      output_html += "<div class=\"segment\"> <a href=\"bword:" + char_html + "\">" + char_html + "</a></div>";
+      remaining = rest;
+    }
+  }
+
+  output_html += "</div>";
+  output_html += "</div>";
+  std::cout << output_html << std::endl;
+}
+
+
+
+// Entry point.
+//
+// Options:
+//   --debug             turn on the debug log and the extra stdout output
+//   --dict <path>       load the dictionary from <path> instead of searching
+//                       the default locations
+//   --goldendict        print one HTML fragment instead of plain lines
+//   --word <text>       accepted and logged; not used otherwise
+//   --sentence <text>   the text to tokenize
+//
+// For the current text, every dictionary key reported by the trie's prefix
+// iterator is collected (and printed, outside GoldenDict mode). One UTF-8
+// character is then removed from the front, leading whitespace is dropped,
+// and the search repeats until the text is exhausted.
+int main(const int argc, char* argv[]) {
+
+  bool goldendict_mode = false;
+  std::string dict_file;  // stays empty unless --dict is given
+  std::string word, sentence;
+
+  // Parse command line arguments first, so that --debug and --dict are known
+  // before anything is logged or loaded.
+  for (int i = 1; i < argc; ++i) {
+    if (std::strcmp(argv[i], "--debug") == 0) {
+      debug_mode = true;
+      log_internal("Debug mode enabled");
+    } else if (std::strcmp(argv[i], "--dict") == 0 && i + 1 < argc) {
+      dict_file = argv[i + 1];
+      log_internal("Dictionary file set to: " + dict_file);
+      i++; // Skip the next argument as it's the dictionary file path
+    } else if (std::strcmp(argv[i], "--goldendict") == 0) {
+      goldendict_mode = true;
+      log_internal("GoldenDict mode enabled");
+    } else if (std::strcmp(argv[i], "--word") == 0 && i + 1 < argc) {
+      word = argv[i + 1];
+      log_internal("Word set to: " + word);
+      i++;
+    } else if (std::strcmp(argv[i], "--sentence") == 0 && i + 1 < argc) {
+      sentence = argv[i + 1];
+      log_internal("Sentence set to: " + sentence);
+      i++;
+    }
+  }
+
+  log_internal("Program started");
+
+  // Search the default locations only when no dictionary was given. Loading
+  // the default before parsing would ignore --dict and would abort on a
+  // missing default even when a valid --dict path is supplied.
+  if (dict_file.empty()) {
+    dict_file = find_dic_file().string();
+  }
+  const auto trie = xcdat::load<xcdat::trie_15_type>(dict_file);
+  log_internal("Trie loaded from dictionary file");
+
+  log_internal("Trying to get the search string...");
+
+  std::string search_string = trim(sentence);
+
+  // Log the execution details
+  log_execution(argc, argv, search_string, dict_file, debug_mode, goldendict_mode);
+
+  std::string raw_output;
+
+  std::vector<std::string> substrings;
+  std::vector<Entry> results;
+  std::map<std::string, std::set<std::string>> alternatives_map;
+
+  while (!search_string.empty()) {
+    auto itr = trie.make_prefix_iterator(search_string);
+
+    // NOTE(review): the stored views are assumed to stay valid until the loop
+    // below has copied them into std::string - confirm against xcdat.
+    while (itr.next()) {
+      results.emplace_back(itr.decoded_view(), itr.id());
+      log_internal("Found result: " + std::string(itr.decoded_view()));
+    }
+
+    // Record every hit for the current text.
+    for (const auto& entry : results) {
+      const std::string substring(entry.decoded_view);
+      substrings.push_back(substring);
+      alternatives_map[search_string].insert(substring);
+      log_internal("Substring added: " + substring);
+      if (!goldendict_mode) {
+        std::cout << substring << std::endl;
+      }
+    }
+
+    // Remove one UTF-8 character from the search string and get the removed character
+    const auto [removed_char, new_search_string] = remove_one_utf8_char(search_string);
+    log_internal("Removed character: " + removed_char);
+
+    if (!removed_char.empty() && !goldendict_mode) {
+      std::cout << removed_char << std::endl;
+    }
+
+    if (new_search_string == search_string) {
+      std::cerr << "Error: Search string did not change after removing a character. Exiting to prevent infinite loop." << std::endl;
+      log_internal("Error: Search string did not change after removing a character. Exiting to prevent infinite loop.");
+      break;
+    }
+
+    search_string = new_search_string;
+    log_internal("New search string: " + search_string);
+
+    // Print debug information if in debug mode
+    if (debug_mode) {
+      std::cout << "After removing one character: " << search_string << std::endl;
+      raw_output += search_string + '\n';
+    }
+
+    results.clear();
+
+    // Remove leading whitespace
+    search_string.erase(0, search_string.find_first_not_of(" \t\n\r\f\v"));
+  }
+
+  // Print debug information if in debug mode
+  if (debug_mode) {
+    print_debug_info(substrings, raw_output);
+  }
+
+  // Wrap the output in HTML format for GoldenDict if in GoldenDict mode
+  if (goldendict_mode) {
+    wrap_html_output(substrings, alternatives_map, sentence);
+  }
+
+  // Falling through (rather than calling exit) lets the final log entry be
+  // written and local objects be destroyed in every mode.
+  log_internal("Program finished");
+  return 0;
+}
--- a/src/main.cc
+++ b/src/main.cc
@ -1,106 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include <limits>
-#include <cstring>
-#include <cctype>
-#include <xcdat.hpp>
-
-// as this is in test phase, i recommend testing with the input being "特別協力組合理事"
-
-struct Entry {
-    std::string_view decoded_view;
-    uint64_t id;
-    Entry(std::string_view decoded_view, uint64_t id) : decoded_view(decoded_view), id(id) {}
-};
-
-int main(int argc, char* argv[]) {
-    if (argc < 2) {
-        std::cerr << "Usage: " << argv[0] << " <search_string> [--debug] [--dict <path_to_dictionary>]" << std::endl;
-        return 1;
-    }
-
-    std::string search_string;
-    bool debug_mode = false;
-    std::string dict_file = "dict.bin"; // Default dictionary file path
-
-    // Parse command line arguments
-    for (int i = 1; i < argc; ++i) {
-        if (std::strcmp(argv[i], "--debug") == 0) {
-            debug_mode = true;
-        } else if (std::strcmp(argv[i], "--dict") == 0 && i + 1 < argc) {
-            dict_file = argv[i + 1];
-            i++; // Skip the next argument as it's the dictionary file path
-        } else {
-            search_string = argv[i];
-        }
-    }
-
-    if (search_string.empty()) {
-        std::cerr << "Search string not provided." << std::endl;
-        return 1;
-    }
-
-    std::string raw_output;
-    const auto trie = xcdat::load<xcdat::trie_15_type>(dict_file);
-
-    std::vector<std::string> substrings;
-
-    while (!search_string.empty()) {
-        std::vector<Entry> results;
-        auto itr = trie.make_prefix_iterator(search_string);
-
-        while (itr.next()) {
-            results.emplace_back(itr.decoded_view(), itr.id());
-        }
-
-        size_t min_length = std::numeric_limits<size_t>::max();
-        std::string_view smallest_prefix;
-
-        for (const auto& entry : results) {
-            if (entry.decoded_view.size() < min_length) {
-                min_length = entry.decoded_view.size();
-                smallest_prefix = entry.decoded_view;
-            }
-        }
-
-        if (min_length > 0) {
-            std::string substring = search_string.substr(0, search_string.find(smallest_prefix));
-            if (!substring.empty()) {
-                substrings.push_back(substring);
-            }
-            std::cout << search_string << " - " << smallest_prefix << " = ";
-
-            if (debug_mode) {
-                raw_output += search_string + " - " + std::string(smallest_prefix) + " = ";
-            }
-
-            size_t pos = search_string.find(smallest_prefix);
-            if (pos != std::string::npos) {
-                search_string.erase(0, pos + smallest_prefix.length());
-            }
-
-            std::cout << search_string << std::endl;
-
-            if (debug_mode) {
-                raw_output += search_string + '\n';
-            }
-
-            while (!search_string.empty() && std::isspace(search_string.front())) {
-                search_string.erase(0, 1);
-            }
-        } else {
-            break;
-        }
-    }
-
-    if (debug_mode) {
-        std::cout << "Stored raw output:" << std::endl << raw_output << std::endl;
-    }
-
-    for (const auto& sub : substrings) {
-        std::cout << sub << std::endl;
-    }
-
-    return 0;
-}
Author	SHA1	Message	Date
千住柱間	844ced543d	Update README.md	2024-11-11 02:11:50 +00:00
千住柱間	087e0843a5	new appearance	2024-11-10 22:10:35 -04:00
千住柱間	36449d4446	fix padding	2024-11-10 22:05:12 -04:00
千住柱間	b2a7beb0a2	Update README.md	2024-06-16 02:52:01 +00:00
千住柱間	4da75c6b27	Update README.md	2024-05-31 19:49:16 -04:00
千住柱間	b54059d828	add usage image	2024-05-31 19:46:45 -04:00
千住柱間	740ce23744	Add misc/.gitignore	2024-05-31 19:44:53 -04:00
千住柱間	a7cadd0036	only show log if in debug mode	2024-05-30 13:24:46 -04:00
千住柱間	34fb696599	Update README.md	2024-05-22 10:32:55 -04:00
千住柱間	315ebc10f3	fix single-char bdword link	2024-05-21 00:22:50 -04:00
千住柱間	a3ceaf9db0	trim white spaces 2/2	2024-05-19 02:01:28 -04:00
千住柱間	30c302d13d	trim white spaces	2024-05-19 01:56:16 -04:00
千住柱間	99d85220f8	freebsd linker fix	2024-05-19 00:17:11 -04:00
千住柱間	bca6e95b7c	temporary fix (well, it works!)	2024-05-18 23:59:06 -04:00
千住柱間	79586f3d04	cleanup	2024-05-18 22:47:27 -04:00
千住柱間	5fc647cabd	improve debugging 3/3	2024-05-18 22:33:48 -04:00
千住柱間	72e74c1e33	improve debugging 2/2	2024-05-18 22:33:14 -04:00
千住柱間	df7ffee51c	improve debugging	2024-05-18 21:45:47 -04:00
千住柱間	b6b47f6ba3	finish goldendict integration	2024-05-17 21:34:09 -04:00
千住柱間	ac8af23fba	constification	2024-05-17 15:22:38 -04:00
千住柱間	28f8e8005a	add goldendict mode	2024-05-17 15:12:13 -04:00
千住柱間	cd14096781	make functions inline & handle input from stdout	2024-05-17 14:21:11 -04:00
千住柱間	0bcd58398d	show particles	2024-05-17 10:37:17 -04:00
千住柱間	f7bddc62bf	hakurei is now usable!	2024-05-16 22:43:06 -04:00
千住柱間	50c02a1324	now hakurei can tokenize any japanese phrase 3/3	2024-05-16 21:53:23 -04:00
千住柱間	825411983d	now hakurei can tokenize any japanese phrase 2/2	2024-05-16 21:53:02 -04:00
千住柱間	c402bffc2a	now hakurei can tokenize any japanese phrase	2024-05-16 21:52:52 -04:00
千住柱間	2fce3c4a48	fix loop 2/2	2024-05-16 21:48:24 -04:00
千住柱間	7508ce6d0d	fix loop \| by Perplex	2024-05-16 21:48:06 -04:00