add bench
This commit is contained in:
parent
6e8efb6abb
commit
b798603930
|
@ -70,9 +70,8 @@ template <class Trie>
|
||||||
|
|
||||||
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath) {
|
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath) {
|
||||||
std::ifstream ifs(filepath);
|
std::ifstream ifs(filepath);
|
||||||
if (!ifs) {
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
return {};
|
|
||||||
}
|
|
||||||
std::vector<std::string> strs;
|
std::vector<std::string> strs;
|
||||||
for (std::string str; std::getline(ifs, str);) {
|
for (std::string str; std::getline(ifs, str);) {
|
||||||
strs.push_back(str);
|
strs.push_back(str);
|
||||||
|
|
|
@ -141,6 +141,14 @@ class bc_vector_7 {
|
||||||
return m_ints_l1.size() / 2;
|
return m_ints_l1.size() / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline std::uint64_t num_free_units() const {
|
||||||
|
return m_num_frees;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::uint64_t num_nodes() const {
|
||||||
|
return num_units() - num_free_units();
|
||||||
|
}
|
||||||
|
|
||||||
inline std::uint64_t num_leaves() const {
|
inline std::uint64_t num_leaves() const {
|
||||||
return m_leaves.num_ones();
|
return m_leaves.num_ones();
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,6 +109,14 @@ class bc_vector_8 {
|
||||||
return m_bytes[0].size() / 2;
|
return m_bytes[0].size() / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline std::uint64_t num_free_units() const {
|
||||||
|
return m_num_frees;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::uint64_t num_nodes() const {
|
||||||
|
return num_units() - num_free_units();
|
||||||
|
}
|
||||||
|
|
||||||
inline std::uint64_t num_leaves() const {
|
inline std::uint64_t num_leaves() const {
|
||||||
return m_leaves.num_ones();
|
return m_leaves.num_ones();
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,6 +72,21 @@ class trie {
|
||||||
return m_table.max_length();
|
return m_table.max_length();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the number of trie nodes.
|
||||||
|
inline std::uint64_t num_nodes() const {
|
||||||
|
return m_bcvec.num_nodes();
|
||||||
|
}
|
||||||
|
|
||||||
|
//! Get the number of DA units.
|
||||||
|
inline std::uint64_t num_units() const {
|
||||||
|
return m_bcvec.num_units();
|
||||||
|
}
|
||||||
|
|
||||||
|
//! Get the number of unused DA units.
|
||||||
|
inline std::uint64_t num_free_units() const {
|
||||||
|
return m_bcvec.num_free_units();
|
||||||
|
}
|
||||||
|
|
||||||
//! Lookup the ID of the keyword.
|
//! Lookup the ID of the keyword.
|
||||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
||||||
std::uint64_t kpos = 0, npos = 0;
|
std::uint64_t kpos = 0, npos = 0;
|
||||||
|
@ -98,12 +113,18 @@ class trie {
|
||||||
|
|
||||||
//! Decode the keyword associated with the ID.
|
//! Decode the keyword associated with the ID.
|
||||||
inline std::string decode(std::uint64_t id) const {
|
inline std::string decode(std::uint64_t id) const {
|
||||||
if (num_keys() <= id) {
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string decoded;
|
std::string decoded;
|
||||||
decoded.reserve(max_length());
|
decoded.reserve(max_length());
|
||||||
|
decode(id, decoded);
|
||||||
|
return decoded;
|
||||||
|
}
|
||||||
|
|
||||||
|
//! Decode the keyword associated with the ID.
|
||||||
|
inline void decode(std::uint64_t id, std::string& decoded) const {
|
||||||
|
decoded.clear();
|
||||||
|
if (num_keys() <= id) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
std::uint64_t npos = id_to_npos(id);
|
std::uint64_t npos = id_to_npos(id);
|
||||||
std::uint64_t tpos = m_bcvec.is_leaf(npos) ? m_bcvec.link(npos) : UINT64_MAX;
|
std::uint64_t tpos = m_bcvec.is_leaf(npos) ? m_bcvec.link(npos) : UINT64_MAX;
|
||||||
|
@ -118,7 +139,6 @@ class trie {
|
||||||
if (tpos != 0 && tpos != UINT64_MAX) {
|
if (tpos != 0 && tpos != UINT64_MAX) {
|
||||||
m_tvec.decode(tpos, [&](char c) { decoded.push_back(c); });
|
m_tvec.decode(tpos, [&](char c) { decoded.push_back(c); });
|
||||||
}
|
}
|
||||||
return decoded;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//! An iterator class for common prefix search.
|
//! An iterator class for common prefix search.
|
||||||
|
|
|
@ -5,6 +5,7 @@ set(XCDAT_FILES
|
||||||
"xcdat_prefix_search"
|
"xcdat_prefix_search"
|
||||||
"xcdat_predictive_search"
|
"xcdat_predictive_search"
|
||||||
"xcdat_enumerate"
|
"xcdat_enumerate"
|
||||||
|
"xcdat_benchmark"
|
||||||
)
|
)
|
||||||
|
|
||||||
foreach(XCDAT_FILE ${XCDAT_FILES})
|
foreach(XCDAT_FILE ${XCDAT_FILES})
|
||||||
|
|
144
tools/xcdat_benchmark.cpp
Normal file
144
tools/xcdat_benchmark.cpp
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
#include <chrono>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#include <xcdat.hpp>
|
||||||
|
|
||||||
|
#include "cmd_line_parser/parser.hpp"
|
||||||
|
#include "tinyformat/tinyformat.h"
|
||||||
|
|
||||||
|
// Number of passes over the query set inside each timed measurement loop.
static constexpr int num_trials = 10;
|
||||||
|
|
||||||
|
// Creates the command-line parser for the benchmark tool and registers its
// arguments: the "input_keys" argument plus the "-n" (num_samples) and
// "-s" (random_seed) options.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser parser(argc, argv);

    parser.add("input_keys", "Input filepath of data keys");
    parser.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false);
    parser.add("random_seed", "Random seed for sampling (default=13)", "-s", false);

    return parser;
}
|
||||||
|
|
||||||
|
// Draws `num_samples` keys uniformly at random (with replacement) from `keys`.
//
// Returns a tuple (sampled_keys, sampled_ids) of equally sized vectors where
// sampled_ids[i] is an index into `keys` and sampled_keys[i] is a view of
// keys[sampled_ids[i]]. The views point into the strings owned by `keys`, so
// `keys` must stay alive and unmodified for as long as the views are used.
//
// The draw sequence is driven by a std::mt19937_64 engine seeded with
// `random_seed`. If `keys` is empty there is nothing to sample from, and both
// returned vectors are empty regardless of `num_samples`.
auto sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::uint64_t random_seed) {
    std::vector<std::string_view> sampled_keys;
    std::vector<std::uint64_t> sampled_ids;

    // Guard: with no keys, `keys.size() - 1` would wrap around and every
    // sampled index would be out of range.
    if (!keys.empty()) {
        sampled_keys.reserve(num_samples);
        sampled_ids.reserve(num_samples);

        std::mt19937_64 engine(random_seed);
        std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);

        for (std::uint64_t i = 0; i < num_samples; i++) {
            const std::uint64_t id = dist(engine);
            sampled_ids.push_back(id);
            sampled_keys.push_back(std::string_view(keys[id]));
        }
    }

    return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids));
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
Trie benchmark_build(const std::vector<std::string>& keys) {
|
||||||
|
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
Trie trie(keys);
|
||||||
|
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
|
||||||
|
const double time_in_sec = dur_ms.count() / 1000.0;
|
||||||
|
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
|
||||||
|
|
||||||
|
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
|
||||||
|
tfm::printfln("Number of DA units: %d", trie.num_units());
|
||||||
|
tfm::printfln("Number of free DA units: %d", trie.num_free_units());
|
||||||
|
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||||
|
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||||
|
tfm::printfln("Construction time in seconds: %g", time_in_sec);
|
||||||
|
|
||||||
|
return trie;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& queries) {
|
||||||
|
// Warmup
|
||||||
|
volatile std::uint64_t tmp = 0;
|
||||||
|
for (const auto& query : queries) {
|
||||||
|
tmp += trie.lookup(query).value();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Measure
|
||||||
|
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
for (int r = 0; r < num_trials; r++) {
|
||||||
|
for (const auto& query : queries) {
|
||||||
|
tmp += trie.lookup(query).value();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
|
||||||
|
const auto elapsed_us = static_cast<double>(dur_us.count());
|
||||||
|
|
||||||
|
tfm::printfln("Lookup time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
|
||||||
|
// Warmup
|
||||||
|
std::string tmp;
|
||||||
|
for (const std::uint64_t query : queries) {
|
||||||
|
trie.decode(query, tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Measure
|
||||||
|
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
for (int r = 0; r < num_trials; r++) {
|
||||||
|
for (const std::uint64_t query : queries) {
|
||||||
|
trie.decode(query, tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
|
||||||
|
const auto elapsed_us = static_cast<double>(dur_us.count());
|
||||||
|
|
||||||
|
tfm::printfln("Decode time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& q_keys,
|
||||||
|
const std::vector<std::uint64_t>& q_ids) {
|
||||||
|
const auto trie = benchmark_build<Trie>(keys);
|
||||||
|
|
||||||
|
benchmark_lookup(trie, q_keys);
|
||||||
|
benchmark_decode(trie, q_ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
tfm::warnfln("The code is running in debug mode.");
|
||||||
|
#endif
|
||||||
|
std::ios::sync_with_stdio(false);
|
||||||
|
|
||||||
|
auto p = make_parser(argc, argv);
|
||||||
|
if (!p.parse()) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto input_keys = p.get<std::string>("input_keys");
|
||||||
|
const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
|
||||||
|
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
||||||
|
|
||||||
|
auto keys = xcdat::load_strings(input_keys);
|
||||||
|
if (keys.empty()) {
|
||||||
|
tfm::errorfln("Error: The input dataset is empty.");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// To unique
|
||||||
|
std::sort(keys.begin(), keys.end());
|
||||||
|
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||||
|
|
||||||
|
tfm::printfln("Number of keys: %d", keys.size());
|
||||||
|
auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed);
|
||||||
|
|
||||||
|
tfm::printfln("** xcdat::trie_7_type **");
|
||||||
|
benchmark<xcdat::trie_7_type>(keys, q_keys, q_ids);
|
||||||
|
|
||||||
|
tfm::printfln("** xcdat::trie_8_type **");
|
||||||
|
benchmark<xcdat::trie_8_type>(keys, q_keys, q_ids);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in a new issue