From b79860393029d6130f3523ce99e43da0eb6dd0c1 Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Fri, 2 Jul 2021 06:14:55 +0900 Subject: [PATCH] add bench --- include/xcdat.hpp | 5 +- include/xcdat/bc_vector_7.hpp | 8 ++ include/xcdat/bc_vector_8.hpp | 8 ++ include/xcdat/trie.hpp | 30 +++++-- tools/CMakeLists.txt | 1 + tools/xcdat_benchmark.cpp | 144 ++++++++++++++++++++++++++++++++++ 6 files changed, 188 insertions(+), 8 deletions(-) create mode 100644 tools/xcdat_benchmark.cpp diff --git a/include/xcdat.hpp b/include/xcdat.hpp index 8ca1313..f5d201c 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -70,9 +70,8 @@ template [[maybe_unused]] std::vector load_strings(std::string_view filepath) { std::ifstream ifs(filepath); - if (!ifs) { - return {}; - } + XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); + std::vector strs; for (std::string str; std::getline(ifs, str);) { strs.push_back(str); diff --git a/include/xcdat/bc_vector_7.hpp b/include/xcdat/bc_vector_7.hpp index 219a155..dcb45b8 100644 --- a/include/xcdat/bc_vector_7.hpp +++ b/include/xcdat/bc_vector_7.hpp @@ -141,6 +141,14 @@ class bc_vector_7 { return m_ints_l1.size() / 2; } + inline std::uint64_t num_free_units() const { + return m_num_frees; + } + + inline std::uint64_t num_nodes() const { + return num_units() - num_free_units(); + } + inline std::uint64_t num_leaves() const { return m_leaves.num_ones(); } diff --git a/include/xcdat/bc_vector_8.hpp b/include/xcdat/bc_vector_8.hpp index ed48357..17372d9 100644 --- a/include/xcdat/bc_vector_8.hpp +++ b/include/xcdat/bc_vector_8.hpp @@ -109,6 +109,14 @@ class bc_vector_8 { return m_bytes[0].size() / 2; } + inline std::uint64_t num_free_units() const { + return m_num_frees; + } + + inline std::uint64_t num_nodes() const { + return num_units() - num_free_units(); + } + inline std::uint64_t num_leaves() const { return m_leaves.num_ones(); } diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index 6a680fb..1bc30a9 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -72,6 +72,21 @@ class trie { return m_table.max_length(); } + //! Get the number of trie nodes. + inline std::uint64_t num_nodes() const { + return m_bcvec.num_nodes(); + } + + //! Get the number of DA units. + inline std::uint64_t num_units() const { + return m_bcvec.num_units(); + } + + //! Get the number of unused DA units. + inline std::uint64_t num_free_units() const { + return m_bcvec.num_free_units(); + } + //! Lookup the ID of the keyword. inline std::optional lookup(std::string_view key) const { std::uint64_t kpos = 0, npos = 0; @@ -98,12 +113,18 @@ class trie { //! Decode the keyword associated with the ID. inline std::string decode(std::uint64_t id) const { - if (num_keys() <= id) { - return {}; - } - std::string decoded; decoded.reserve(max_length()); + decode(id, decoded); + return decoded; + } + + //! Decode the keyword associated with the ID. + inline void decode(std::uint64_t id, std::string& decoded) const { + decoded.clear(); + if (num_keys() <= id) { + return; + } std::uint64_t npos = id_to_npos(id); std::uint64_t tpos = m_bcvec.is_leaf(npos) ? m_bcvec.link(npos) : UINT64_MAX; @@ -118,7 +139,6 @@ class trie { if (tpos != 0 && tpos != UINT64_MAX) { m_tvec.decode(tpos, [&](char c) { decoded.push_back(c); }); } - return decoded; } //! An iterator class for common prefix search. diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 38b379a..bba7802 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -5,6 +5,7 @@ set(XCDAT_FILES "xcdat_prefix_search" "xcdat_predictive_search" "xcdat_enumerate" + "xcdat_benchmark" ) foreach(XCDAT_FILE ${XCDAT_FILES}) diff --git a/tools/xcdat_benchmark.cpp b/tools/xcdat_benchmark.cpp new file mode 100644 index 0000000..6cb0e2f --- /dev/null +++ b/tools/xcdat_benchmark.cpp @@ -0,0 +1,144 @@ +#include +#include + +#include + +#include "cmd_line_parser/parser.hpp" +#include "tinyformat/tinyformat.h" + +static constexpr int num_trials = 10; + +cmd_line_parser::parser make_parser(int argc, char** argv) { + cmd_line_parser::parser p(argc, argv); + p.add("input_keys", "Input filepath of data keys"); + p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false); + p.add("random_seed", "Random seed for sampling (default=13)", "-s", false); + return p; +} + +auto sample_keys(const std::vector& keys, std::uint64_t num_samples, std::uint64_t random_seed) { + std::vector sampled_keys(num_samples); + std::vector sampled_ids(num_samples); + + std::mt19937_64 engine(random_seed); + std::uniform_int_distribution dist(0, keys.size() - 1); + + for (std::uint64_t i = 0; i < num_samples; i++) { + sampled_ids[i] = dist(engine); + sampled_keys[i] = std::string_view(keys[sampled_ids[i]]); + } + + return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids)); +} + +template +Trie benchmark_build(const std::vector& keys) { + const auto start_tp = std::chrono::high_resolution_clock::now(); + Trie trie(keys); + const auto stop_tp = std::chrono::high_resolution_clock::now(); + + const auto dur_ms = std::chrono::duration_cast(stop_tp - start_tp); + const double time_in_sec = dur_ms.count() / 1000.0; + const double memory_in_bytes = xcdat::memory_in_bytes(trie); + + tfm::printfln("Number of trie nodes: %d", trie.num_nodes()); + tfm::printfln("Number of DA units: %d", trie.num_units()); + tfm::printfln("Number of free DA units: %d", trie.num_free_units()); + tfm::printfln("Memory usage in bytes: %d", memory_in_bytes); + tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); + tfm::printfln("Construction time in seconds: %g", time_in_sec); + + return trie; +} + +template +void benchmark_lookup(const Trie& trie, const std::vector& queries) { + // Warmup + volatile std::uint64_t tmp = 0; + for (const auto& query : queries) { + tmp += trie.lookup(query).value(); + } + + // Measure + const auto start_tp = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < num_trials; r++) { + for (const auto& query : queries) { + tmp += trie.lookup(query).value(); + } + } + const auto stop_tp = std::chrono::high_resolution_clock::now(); + + const auto dur_us = std::chrono::duration_cast(stop_tp - start_tp); + const auto elapsed_us = static_cast(dur_us.count()); + + tfm::printfln("Lookup time in microsec/query: %g", elapsed_us / (num_trials * queries.size())); +} + +template +void benchmark_decode(const Trie& trie, const std::vector& queries) { + // Warmup + std::string tmp; + for (const std::uint64_t query : queries) { + trie.decode(query, tmp); + } + + // Measure + const auto start_tp = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < num_trials; r++) { + for (const std::uint64_t query : queries) { + trie.decode(query, tmp); + } + } + const auto stop_tp = std::chrono::high_resolution_clock::now(); + + const auto dur_us = std::chrono::duration_cast(stop_tp - start_tp); + const auto elapsed_us = static_cast(dur_us.count()); + + tfm::printfln("Decode time in microsec/query: %g", elapsed_us / (num_trials * queries.size())); +} + +template +void benchmark(std::vector keys, const std::vector& q_keys, + const std::vector& q_ids) { + const auto trie = benchmark_build(keys); + + benchmark_lookup(trie, q_keys); + benchmark_decode(trie, q_ids); +} + +int main(int argc, char** argv) { +#ifndef NDEBUG + tfm::warnfln("The code is running in debug mode."); +#endif + std::ios::sync_with_stdio(false); + + auto p = make_parser(argc, argv); + if (!p.parse()) { + return 1; + } + + const auto input_keys = p.get("input_keys"); + const auto num_samples = p.get("num_samples", 1000); + const auto random_seed = p.get("random_seed", 13); + + auto keys = xcdat::load_strings(input_keys); + if (keys.empty()) { + tfm::errorfln("Error: The input dataset is empty."); + return 1; + } + + // To unique + std::sort(keys.begin(), keys.end()); + keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); + + tfm::printfln("Number of keys: %d", keys.size()); + auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed); + + tfm::printfln("** xcdat::trie_7_type **"); + benchmark(keys, q_keys, q_ids); + + tfm::printfln("** xcdat::trie_8_type **"); + benchmark(keys, q_keys, q_ids); + + return 0; +} \ No newline at end of file