From af9731e6c29488c33cdc393ce8514c7f7d0e24ea Mon Sep 17 00:00:00 2001
From: Shunsuke Kanda <shnsk.knd@gmail.com>
Date: Fri, 2 Jul 2021 07:05:06 +0900
Subject: [PATCH] add

---
 README.md                 | 111 ++++++++++++++++++++++++++------------
 include/xcdat/trie.hpp    |   7 ++-
 tools/xcdat_benchmark.cpp |  61 ++++++++++++---------
 tools/xcdat_build.cpp     |  27 ++++------
 4 files changed, 127 insertions(+), 79 deletions(-)
diff --git a/README.md b/README.md
index 091664b..868cbd4 100644
--- a/README.md
+++ b/README.md
@@ -58,12 +58,10 @@ It builds the trie index from a given dataset consisting of keywords separated b
 
 ```
 $ xcdat_build enwiki-titles.txt idx.bin
-time_in_sec: 13.449
-memory_in_bytes: 1.70618e+08
-memory_in_MiB: 162.714
-number_of_keys: 15955763
-alphabet_size: 198
-max_length: 253
+Number of keys: 15955763
+Number of trie nodes: 36441058
+Memory usage in bytes: 1.70618e+08
+Memory usage in MiB: 162.714
 ```
 
 ### `xcdat_lookup`
@@ -128,6 +126,42 @@ $ xcdat_enumerate idx.bin | head -3
 138	!!!
 ```
 
+### `xcdat_benchmark`
+
+It measures the performance of Xcdat for a given dataset.
+
+```
+$ xcdat_benchmark enwiki-titles.txt
+** xcdat::trie_7_type **
+Binary mode: 0
+Alphabet size: 198
+Max key length: 253
+Number of keys: 15955763
+Number of trie nodes: 36441058
+Number of DA units: 36520704
+Number of free DA units: 79646
+TAIL length: 30776290
+Memory usage in bytes: 1.70618e+08
+Memory usage in MiB: 162.714
+Construction time in seconds: 11.828
+Lookup time in microsec/query: 0.8259
+Decode time in microsec/query: 1.4545
+** xcdat::trie_8_type **
+Binary mode: 0
+Alphabet size: 198
+Max key length: 253
+Number of keys: 15955763
+Number of trie nodes: 36441035
+Number of DA units: 36515840
+Number of free DA units: 74805
+TAIL length: 30776290
+Memory usage in bytes: 1.64104e+08
+Memory usage in MiB: 156.502
+Construction time in seconds: 11.966
+Lookup time in microsec/query: 0.844
+Decode time in microsec/query: 1.0029
+```
+
 ## Sample usage
 
 `sample/sample.cpp` provides a sample usage.
@@ -301,25 +335,40 @@ class trie {
     //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters.
     //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
     template <class Strings>
-    explicit trie(const Strings& keys, bool bin_mode = false);
+    trie(const Strings& keys, bool bin_mode = false);
 
     //! Check if the binary mode.
-    inline bool bin_mode() const;
+    bool bin_mode() const;
 
     //! Get the number of stored keywords.
-    inline std::uint64_t num_keys() const;
+    std::uint64_t num_keys() const;
 
     //! Get the alphabet size.
-    inline std::uint64_t alphabet_size() const;
+    std::uint64_t alphabet_size() const;
 
     //! Get the maximum length of keywords.
-    inline std::uint64_t max_length() const;
+    std::uint64_t max_length() const;
+
+    //! Get the number of trie nodes.
+    std::uint64_t num_nodes() const;
+
+    //! Get the number of DA units.
+    std::uint64_t num_units() const;
+
+    //! Get the number of unused DA units.
+    std::uint64_t num_free_units() const;
+
+    //! Get the TAIL length.
+    std::uint64_t tail_length() const;
 
     //! Lookup the ID of the keyword.
-    inline std::optional<std::uint64_t> lookup(std::string_view key) const;
+    std::optional<std::uint64_t> lookup(std::string_view key) const;
 
     //! Decode the keyword associated with the ID.
-    inline std::string decode(std::uint64_t id) const;
+    std::string decode(std::uint64_t id) const;
+
+    //! Decode the keyword associated with the ID into the given string.
+    void decode(std::uint64_t id, std::string& decoded) const;
 
     //! An iterator class for common prefix search.
     class prefix_iterator {
@@ -328,24 +377,24 @@ class trie {
 
         //! Increment the iterator.
         //! Return false if the iteration is terminated.
-        inline bool next();
+        bool next();
 
         //! Get the result ID.
-        inline std::uint64_t id() const;
+        std::uint64_t id() const;
 
         //! Get the result keyword.
-        inline std::string decoded() const;
+        std::string decoded() const;
 
         //! Get the reference to the result keyword.
         //! Note that the referenced data will be changed in the next iteration.
-        inline std::string_view decoded_view() const;
+        std::string_view decoded_view() const;
     };
 
     //! Make the common prefix searcher for the given keyword.
-    inline prefix_iterator make_prefix_iterator(std::string_view key) const;
+    prefix_iterator make_prefix_iterator(std::string_view key) const;
 
     //! Preform common prefix search for the keyword.
-    inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
+    void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
 
     //! An iterator class for predictive search.
     class predictive_iterator {
@@ -354,41 +403,33 @@ class trie {
 
         //! Increment the iterator.
         //! Return false if the iteration is terminated.
-        inline bool next();
+        bool next();
 
         //! Get the result ID.
-        inline std::uint64_t id() const;
+        std::uint64_t id() const;
 
         //! Get the result keyword.
-        inline std::string decoded() const;
+        std::string decoded() const;
 
         //! Get the reference to the result keyword.
         //! Note that the referenced data will be changed in the next iteration.
-        inline std::string_view decoded_view() const;
+        std::string_view decoded_view() const;
     };
 
     //! Make the predictive searcher for the keyword.
-    inline predictive_iterator make_predictive_iterator(std::string_view key) const {
-        return predictive_iterator(this, key);
-    }
+    predictive_iterator make_predictive_iterator(std::string_view key) const;
 
     //! Preform predictive search for the keyword.
-    inline void predictive_search(std::string_view key,
-                                  const std::function<void(std::uint64_t, std::string_view)>& fn) const {
-        auto itr = make_predictive_iterator(key);
-        while (itr.next()) {
-            fn(itr.id(), itr.decoded_view());
-        }
-    }
+    void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
 
     //! An iterator class for enumeration.
     using enumerative_iterator = predictive_iterator;
 
     //! An iterator class for enumeration.
-    inline enumerative_iterator make_enumerative_iterator() const;
+    enumerative_iterator make_enumerative_iterator() const;
 
     //! Enumerate all the keywords and their IDs stored in the trie.
-    inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
+    void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
 
     //! Visit the members.
     template <class Visitor>
diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp
index 1bc30a9..b0c2788 100644
--- a/include/xcdat/trie.hpp
+++ b/include/xcdat/trie.hpp
@@ -48,7 +48,7 @@ class trie {
     //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters.
     //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
     template <class Strings>
-    explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
+    trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
         static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
     }
 
@@ -87,6 +87,11 @@ class trie {
         return m_bcvec.num_free_units();
     }
 
+    //! Get the TAIL length.
+    inline std::uint64_t tail_length() const {
+        return m_tvec.size();
+    }
+
     //! Lookup the ID of the keyword.
     inline std::optional<std::uint64_t> lookup(std::string_view key) const {
         std::uint64_t kpos = 0, npos = 0;
diff --git a/tools/xcdat_benchmark.cpp b/tools/xcdat_benchmark.cpp
index 6cb0e2f..c525321 100644
--- a/tools/xcdat_benchmark.cpp
+++ b/tools/xcdat_benchmark.cpp
@@ -10,40 +10,52 @@ static constexpr int num_trials = 10;
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
     cmd_line_parser::parser p(argc, argv);
-    p.add("input_keys", "Input filepath of data keys");
-    p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false);
+    p.add("input_keys", "Input filepath of keywords");
+    p.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
     p.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
+    p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
     return p;
 }
 
-auto sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::uint64_t random_seed) {
-    std::vector<std::string_view> sampled_keys(num_samples);
-    std::vector<std::uint64_t> sampled_ids(num_samples);
-
+std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
+                                          std::uint64_t random_seed) {
     std::mt19937_64 engine(random_seed);
     std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);
 
+    std::vector<std::string_view> sampled_keys(num_samples);
     for (std::uint64_t i = 0; i < num_samples; i++) {
-        sampled_ids[i] = dist(engine);
-        sampled_keys[i] = std::string_view(keys[sampled_ids[i]]);
+        sampled_keys[i] = std::string_view(keys[dist(engine)]);
     }
-
-    return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids));
+    return sampled_keys;
 }
 
 template <class Trie>
-Trie benchmark_build(const std::vector<std::string>& keys) {
+std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
+    std::vector<std::uint64_t> sampled_ids(keys.size());
+    for (std::uint64_t i = 0; i < keys.size(); i++) {
+        sampled_ids[i] = trie.lookup(keys[i]).value();
+    }
+    return sampled_ids;
+}
+
+template <class Trie>
+Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
     const auto start_tp = std::chrono::high_resolution_clock::now();
-    Trie trie(keys);
+    Trie trie(keys, binary_mode);
     const auto stop_tp = std::chrono::high_resolution_clock::now();
 
     const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
     const double time_in_sec = dur_ms.count() / 1000.0;
     const double memory_in_bytes = xcdat::memory_in_bytes(trie);
 
+    tfm::printfln("Binary mode: %d", trie.bin_mode());
+    tfm::printfln("Alphabet size: %d", trie.alphabet_size());
+    tfm::printfln("Max key length: %d", trie.max_length());
+    tfm::printfln("Number of keys: %d", trie.num_keys());
     tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
     tfm::printfln("Number of DA units: %d", trie.num_units());
     tfm::printfln("Number of free DA units: %d", trie.num_free_units());
+    tfm::printfln("TAIL length: %d", trie.tail_length());
     tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
     tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
     tfm::printfln("Construction time in seconds: %g", time_in_sec);
@@ -77,16 +89,16 @@ void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& que
 template <class Trie>
 void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
     // Warmup
-    std::string tmp;
+    volatile std::uint64_t tmp = 0;
     for (const std::uint64_t query : queries) {
-        trie.decode(query, tmp);
+        tmp += trie.decode(query).size();
     }
 
     // Measure
     const auto start_tp = std::chrono::high_resolution_clock::now();
     for (int r = 0; r < num_trials; r++) {
         for (const std::uint64_t query : queries) {
-            trie.decode(query, tmp);
+            tmp += trie.decode(query).size();
         }
     }
     const auto stop_tp = std::chrono::high_resolution_clock::now();
@@ -98,12 +110,12 @@ void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& querie
 }
 
 template <class Trie>
-void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& q_keys,
-               const std::vector<std::uint64_t>& q_ids) {
-    const auto trie = benchmark_build<Trie>(keys);
+void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& query_keys, bool binary_mode) {
+    const auto trie = benchmark_build<Trie>(keys, binary_mode);
+    const auto query_ids = extract_ids(trie, query_keys);
 
-    benchmark_lookup(trie, q_keys);
-    benchmark_decode(trie, q_ids);
+    benchmark_lookup(trie, query_keys);
+    benchmark_decode(trie, query_ids);
 }
 
 int main(int argc, char** argv) {
@@ -120,6 +132,7 @@ int main(int argc, char** argv) {
     const auto input_keys = p.get<std::string>("input_keys");
     const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
     const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
+    const auto binary_mode = p.get<bool>("binary_mode", false);
 
     auto keys = xcdat::load_strings(input_keys);
     if (keys.empty()) {
@@ -127,18 +140,16 @@ int main(int argc, char** argv) {
         return 1;
     }
 
-    // To unique
     std::sort(keys.begin(), keys.end());
     keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
 
-    tfm::printfln("Number of keys: %d", keys.size());
-    auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed);
+    const auto query_keys = sample_keys(keys, num_samples, random_seed);
 
     tfm::printfln("** xcdat::trie_7_type **");
-    benchmark<xcdat::trie_7_type>(keys, q_keys, q_ids);
+    benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);
 
     tfm::printfln("** xcdat::trie_8_type **");
-    benchmark<xcdat::trie_8_type>(keys, q_keys, q_ids);
+    benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
 
     return 0;
 }
\ No newline at end of file
diff --git a/tools/xcdat_build.cpp b/tools/xcdat_build.cpp
index 1449fb4..a75d8c7 100644
--- a/tools/xcdat_build.cpp
+++ b/tools/xcdat_build.cpp
@@ -10,7 +10,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
     p.add("input_keys", "Input filepath of data keys");
     p.add("output_idx", "Output filepath of trie index");
     p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
-    p.add("to_unique", "Make unique the input keys? (default=1)", "-u", false);
+    p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
     return p;
 }
 
@@ -18,32 +18,23 @@ template <class Trie>
 int build(const cmd_line_parser::parser& p) {
     const auto input_keys = p.get<std::string>("input_keys");
     const auto output_idx = p.get<std::string>("output_idx");
-    const auto to_unique = p.get<bool>("to_unique", true);
+    const auto binary_mode = p.get<bool>("binary_mode", false);
 
     auto keys = xcdat::load_strings(input_keys);
     if (keys.empty()) {
         tfm::errorfln("Error: The input dataset is empty.");
     }
 
-    if (to_unique) {
-        std::sort(keys.begin(), keys.end());
-        keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
-    }
+    std::sort(keys.begin(), keys.end());
+    keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
 
-    const auto start_tp = std::chrono::high_resolution_clock::now();
-    const Trie trie(keys);
-    const auto stop_tp = std::chrono::high_resolution_clock::now();
-
-    const double time_in_sec =
-        std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp).count() / 1000.0;
+    const Trie trie(keys, binary_mode);
     const double memory_in_bytes = xcdat::memory_in_bytes(trie);
 
-    tfm::printfln("time_in_sec: %g", time_in_sec);
-    tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
-    tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
-    tfm::printfln("number_of_keys: %d", trie.num_keys());
-    tfm::printfln("alphabet_size: %d", trie.alphabet_size());
-    tfm::printfln("max_length: %d", trie.max_length());
+    tfm::printfln("Number of keys: %d", trie.num_keys());
+    tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
+    tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
+    tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
 
     xcdat::save(trie, output_idx);