From af9731e6c29488c33cdc393ce8514c7f7d0e24ea Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Fri, 2 Jul 2021 07:05:06 +0900 Subject: [PATCH] add --- README.md | 111 ++++++++++++++++++++++++++------------ include/xcdat/trie.hpp | 7 ++- tools/xcdat_benchmark.cpp | 61 ++++++++++++--------- tools/xcdat_build.cpp | 27 ++++------ 4 files changed, 127 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 091664b..868cbd4 100644 --- a/README.md +++ b/README.md @@ -58,12 +58,10 @@ It builds the trie index from a given dataset consisting of keywords separated b ``` $ xcdat_build enwiki-titles.txt idx.bin -time_in_sec: 13.449 -memory_in_bytes: 1.70618e+08 -memory_in_MiB: 162.714 -number_of_keys: 15955763 -alphabet_size: 198 -max_length: 253 +Number of keys: 15955763 +Number of trie nodes: 36441058 +Memory usage in bytes: 1.70618e+08 +Memory usage in MiB: 162.714 ``` ### `xcdat_lookup` @@ -128,6 +126,42 @@ $ xcdat_enumerate idx.bin | head -3 138 !!! ``` +### `xcdat_benchmark` + +It measures the performance of Xcdat for a given dataset. + +``` +$ xcdat_benchmark enwiki-titles.txt +** xcdat::trie_7_type ** +Binary mode: 0 +Alphabet size: 198 +Max key length: 253 +Number of keys: 15955763 +Number of trie nodes: 36441058 +Number of DA units: 36520704 +Number of free DA units: 79646 +TAIL length: 30776290 +Memory usage in bytes: 1.70618e+08 +Memory usage in MiB: 162.714 +Construction time in seconds: 11.828 +Lookup time in microsec/query: 0.8259 +Decode time in microsec/query: 1.4545 +** xcdat::trie_8_type ** +Binary mode: 0 +Alphabet size: 198 +Max key length: 253 +Number of keys: 15955763 +Number of trie nodes: 36441035 +Number of DA units: 36515840 +Number of free DA units: 74805 +TAIL length: 30776290 +Memory usage in bytes: 1.64104e+08 +Memory usage in MiB: 156.502 +Construction time in seconds: 11.966 +Lookup time in microsec/query: 0.844 +Decode time in microsec/query: 1.0029 +``` + ## Sample usage `sample/sample.cpp` provides a sample usage. 
@@ -301,25 +335,40 @@ class trie { //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. template - explicit trie(const Strings& keys, bool bin_mode = false); + trie(const Strings& keys, bool bin_mode = false); //! Check if the binary mode. - inline bool bin_mode() const; + bool bin_mode() const; //! Get the number of stored keywords. - inline std::uint64_t num_keys() const; + std::uint64_t num_keys() const; //! Get the alphabet size. - inline std::uint64_t alphabet_size() const; + std::uint64_t alphabet_size() const; //! Get the maximum length of keywords. - inline std::uint64_t max_length() const; + std::uint64_t max_length() const; + + //! Get the number of trie nodes. + std::uint64_t num_nodes() const; + + //! Get the number of DA units. + std::uint64_t num_units() const; + + //! Get the number of unused DA units. + std::uint64_t num_free_units() const; + + //! Get the TAIL length. + std::uint64_t tail_length() const; //! Lookup the ID of the keyword. - inline std::optional lookup(std::string_view key) const; + std::optional lookup(std::string_view key) const; //! Decode the keyword associated with the ID. - inline std::string decode(std::uint64_t id) const; + std::string decode(std::uint64_t id) const; + + //! Decode the keyword associated with the ID. + void decode(std::uint64_t id, std::string& decoded) const; //! An iterator class for common prefix search. class prefix_iterator { @@ -328,24 +377,24 @@ class trie { //! Increment the iterator. //! Return false if the iteration is terminated. - inline bool next(); + bool next(); //! Get the result ID. - inline std::uint64_t id() const; + std::uint64_t id() const; //! Get the result keyword. - inline std::string decoded() const; + std::string decoded() const; //! Get the reference to the result keyword. //! 
Note that the referenced data will be changed in the next iteration. - inline std::string_view decoded_view() const; + std::string_view decoded_view() const; }; //! Make the common prefix searcher for the given keyword. - inline prefix_iterator make_prefix_iterator(std::string_view key) const; + prefix_iterator make_prefix_iterator(std::string_view key) const; //! Preform common prefix search for the keyword. - inline void prefix_search(std::string_view key, const std::function& fn) const; + void prefix_search(std::string_view key, const std::function& fn) const; //! An iterator class for predictive search. class predictive_iterator { @@ -354,41 +403,33 @@ class trie { //! Increment the iterator. //! Return false if the iteration is terminated. - inline bool next(); + bool next(); //! Get the result ID. - inline std::uint64_t id() const; + std::uint64_t id() const; //! Get the result keyword. - inline std::string decoded() const; + std::string decoded() const; //! Get the reference to the result keyword. //! Note that the referenced data will be changed in the next iteration. - inline std::string_view decoded_view() const; + std::string_view decoded_view() const; }; //! Make the predictive searcher for the keyword. - inline predictive_iterator make_predictive_iterator(std::string_view key) const { - return predictive_iterator(this, key); - } + predictive_iterator make_predictive_iterator(std::string_view key) const; //! Preform predictive search for the keyword. - inline void predictive_search(std::string_view key, - const std::function& fn) const { - auto itr = make_predictive_iterator(key); - while (itr.next()) { - fn(itr.id(), itr.decoded_view()); - } - } + void predictive_search(std::string_view key, const std::function& fn) const; //! An iterator class for enumeration. using enumerative_iterator = predictive_iterator; //! An iterator class for enumeration. 
- inline enumerative_iterator make_enumerative_iterator() const; + enumerative_iterator make_enumerative_iterator() const; //! Enumerate all the keywords and their IDs stored in the trie. - inline void enumerate(const std::function& fn) const; + void enumerate(const std::function& fn) const; //! Visit the members. template diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index 1bc30a9..b0c2788 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -48,7 +48,7 @@ class trie { //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. template - explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { + trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); } @@ -87,6 +87,11 @@ class trie { return m_bcvec.num_free_units(); } + //! Get the TAIL length. + inline std::uint64_t tail_length() const { + return m_tvec.size(); + } + //! Lookup the ID of the keyword. 
inline std::optional lookup(std::string_view key) const { std::uint64_t kpos = 0, npos = 0; diff --git a/tools/xcdat_benchmark.cpp b/tools/xcdat_benchmark.cpp index 6cb0e2f..c525321 100644 --- a/tools/xcdat_benchmark.cpp +++ b/tools/xcdat_benchmark.cpp @@ -10,40 +10,52 @@ static constexpr int num_trials = 10; cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_keys", "Input filepath of data keys"); - p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false); + p.add("input_keys", "Input filepath of keywords"); + p.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false); p.add("random_seed", "Random seed for sampling (default=13)", "-s", false); + p.add("binary_mode", "Is binary mode? (default=0)", "-b", false); return p; } -auto sample_keys(const std::vector& keys, std::uint64_t num_samples, std::uint64_t random_seed) { - std::vector sampled_keys(num_samples); - std::vector sampled_ids(num_samples); - +std::vector sample_keys(const std::vector& keys, std::uint64_t num_samples, + std::uint64_t random_seed) { std::mt19937_64 engine(random_seed); std::uniform_int_distribution dist(0, keys.size() - 1); + std::vector sampled_keys(num_samples); for (std::uint64_t i = 0; i < num_samples; i++) { - sampled_ids[i] = dist(engine); - sampled_keys[i] = std::string_view(keys[sampled_ids[i]]); + sampled_keys[i] = std::string_view(keys[dist(engine)]); } - - return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids)); + return sampled_keys; } template -Trie benchmark_build(const std::vector& keys) { +std::vector extract_ids(const Trie& trie, const std::vector& keys) { + std::vector sampled_ids(keys.size()); + for (std::uint64_t i = 0; i < keys.size(); i++) { + sampled_ids[i] = trie.lookup(keys[i]).value(); + } + return sampled_ids; +} + +template +Trie benchmark_build(const std::vector& keys, bool binary_mode) { const auto start_tp = 
std::chrono::high_resolution_clock::now(); - Trie trie(keys); + Trie trie(keys, binary_mode); const auto stop_tp = std::chrono::high_resolution_clock::now(); const auto dur_ms = std::chrono::duration_cast(stop_tp - start_tp); const double time_in_sec = dur_ms.count() / 1000.0; const double memory_in_bytes = xcdat::memory_in_bytes(trie); + tfm::printfln("Binary mode: %d", trie.bin_mode()); + tfm::printfln("Alphabet size: %d", trie.alphabet_size()); + tfm::printfln("Max key length: %d", trie.max_length()); + tfm::printfln("Number of keys: %d", trie.num_keys()); tfm::printfln("Number of trie nodes: %d", trie.num_nodes()); tfm::printfln("Number of DA units: %d", trie.num_units()); tfm::printfln("Number of free DA units: %d", trie.num_free_units()); + tfm::printfln("TAIL length: %d", trie.tail_length()); tfm::printfln("Memory usage in bytes: %d", memory_in_bytes); tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); tfm::printfln("Construction time in seconds: %g", time_in_sec); @@ -77,16 +89,16 @@ void benchmark_lookup(const Trie& trie, const std::vector& que template void benchmark_decode(const Trie& trie, const std::vector& queries) { // Warmup - std::string tmp; + volatile std::uint64_t tmp = 0; for (const std::uint64_t query : queries) { - trie.decode(query, tmp); + tmp += trie.decode(query).size(); } // Measure const auto start_tp = std::chrono::high_resolution_clock::now(); for (int r = 0; r < num_trials; r++) { for (const std::uint64_t query : queries) { - trie.decode(query, tmp); + tmp += trie.decode(query).size(); } } const auto stop_tp = std::chrono::high_resolution_clock::now(); @@ -98,12 +110,12 @@ void benchmark_decode(const Trie& trie, const std::vector& querie } template -void benchmark(std::vector keys, const std::vector& q_keys, - const std::vector& q_ids) { - const auto trie = benchmark_build(keys); +void benchmark(std::vector keys, const std::vector& query_keys, bool binary_mode) { + const auto trie = benchmark_build(keys, 
binary_mode); + const auto query_ids = extract_ids(trie, query_keys); - benchmark_lookup(trie, q_keys); - benchmark_decode(trie, q_ids); + benchmark_lookup(trie, query_keys); + benchmark_decode(trie, query_ids); } int main(int argc, char** argv) { @@ -120,6 +132,7 @@ int main(int argc, char** argv) { const auto input_keys = p.get("input_keys"); const auto num_samples = p.get("num_samples", 1000); const auto random_seed = p.get("random_seed", 13); + const auto binary_mode = p.get("binary_mode", false); auto keys = xcdat::load_strings(input_keys); if (keys.empty()) { @@ -127,18 +140,16 @@ int main(int argc, char** argv) { return 1; } - // To unique std::sort(keys.begin(), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - tfm::printfln("Number of keys: %d", keys.size()); - auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed); + const auto query_keys = sample_keys(keys, num_samples, random_seed); tfm::printfln("** xcdat::trie_7_type **"); - benchmark(keys, q_keys, q_ids); + benchmark(keys, query_keys, binary_mode); tfm::printfln("** xcdat::trie_8_type **"); - benchmark(keys, q_keys, q_ids); + benchmark(keys, query_keys, binary_mode); return 0; } \ No newline at end of file diff --git a/tools/xcdat_build.cpp b/tools/xcdat_build.cpp index 1449fb4..a75d8c7 100644 --- a/tools/xcdat_build.cpp +++ b/tools/xcdat_build.cpp @@ -10,7 +10,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { p.add("input_keys", "Input filepath of data keys"); p.add("output_idx", "Output filepath of trie index"); p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false); - p.add("to_unique", "Make unique the input keys? (default=1)", "-u", false); + p.add("binary_mode", "Is binary mode? 
(default=0)", "-b", false); return p; } @@ -18,32 +18,23 @@ template int build(const cmd_line_parser::parser& p) { const auto input_keys = p.get("input_keys"); const auto output_idx = p.get("output_idx"); - const auto to_unique = p.get("to_unique", true); + const auto binary_mode = p.get("binary_mode", false); auto keys = xcdat::load_strings(input_keys); if (keys.empty()) { tfm::errorfln("Error: The input dataset is empty."); } - if (to_unique) { - std::sort(keys.begin(), keys.end()); - keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - } + std::sort(keys.begin(), keys.end()); + keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - const auto start_tp = std::chrono::high_resolution_clock::now(); - const Trie trie(keys); - const auto stop_tp = std::chrono::high_resolution_clock::now(); - - const double time_in_sec = - std::chrono::duration_cast(stop_tp - start_tp).count() / 1000.0; + const Trie trie(keys, binary_mode); const double memory_in_bytes = xcdat::memory_in_bytes(trie); - tfm::printfln("time_in_sec: %g", time_in_sec); - tfm::printfln("memory_in_bytes: %d", memory_in_bytes); - tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); - tfm::printfln("number_of_keys: %d", trie.num_keys()); - tfm::printfln("alphabet_size: %d", trie.alphabet_size()); - tfm::printfln("max_length: %d", trie.max_length()); + tfm::printfln("Number of keys: %d", trie.num_keys()); + tfm::printfln("Number of trie nodes: %d", trie.num_nodes()); + tfm::printfln("Memory usage in bytes: %d", memory_in_bytes); + tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); xcdat::save(trie, output_idx);