This commit is contained in:
Shunsuke Kanda 2021-07-02 07:05:06 +09:00
parent b798603930
commit af9731e6c2
4 changed files with 127 additions and 79 deletions

111
README.md
View file

@ -58,12 +58,10 @@ It builds the trie index from a given dataset consisting of keywords separated b
``` ```
$ xcdat_build enwiki-titles.txt idx.bin $ xcdat_build enwiki-titles.txt idx.bin
time_in_sec: 13.449 Number of keys: 15955763
memory_in_bytes: 1.70618e+08 Number of trie nodes: 36441058
memory_in_MiB: 162.714 Memory usage in bytes: 1.70618e+08
number_of_keys: 15955763 Memory usage in MiB: 162.714
alphabet_size: 198
max_length: 253
``` ```
### `xcdat_lookup` ### `xcdat_lookup`
@ -128,6 +126,42 @@ $ xcdat_enumerate idx.bin | head -3
138 !!! 138 !!!
``` ```
### `xcdat_benchmark`
It measures the performance of Xcdat for a given dataset.
```
$ xcdat_benchmark enwiki-titles.txt
** xcdat::trie_7_type **
Binary mode: 0
Alphabet size: 198
Max key length: 253
Number of keys: 15955763
Number of trie nodes: 36441058
Number of DA units: 36520704
Number of free DA units: 79646
TAIL length: 30776290
Memory usage in bytes: 1.70618e+08
Memory usage in MiB: 162.714
Construction time in seconds: 11.828
Lookup time in microsec/query: 0.8259
Decode time in microsec/query: 1.4545
** xcdat::trie_8_type **
Binary mode: 0
Alphabet size: 198
Max key length: 253
Number of keys: 15955763
Number of trie nodes: 36441035
Number of DA units: 36515840
Number of free DA units: 74805
TAIL length: 30776290
Memory usage in bytes: 1.64104e+08
Memory usage in MiB: 156.502
Construction time in seconds: 11.966
Lookup time in microsec/query: 0.844
Decode time in microsec/query: 1.0029
```
## Sample usage ## Sample usage
`sample/sample.cpp` provides a sample usage. `sample/sample.cpp` provides a sample usage.
@ -301,25 +335,40 @@ class trie {
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters. //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template <class Strings> template <class Strings>
explicit trie(const Strings& keys, bool bin_mode = false); trie(const Strings& keys, bool bin_mode = false);
//! Check if the trie is in binary mode. //! Check if the trie is in binary mode.
inline bool bin_mode() const; bool bin_mode() const;
//! Get the number of stored keywords. //! Get the number of stored keywords.
inline std::uint64_t num_keys() const; std::uint64_t num_keys() const;
//! Get the alphabet size. //! Get the alphabet size.
inline std::uint64_t alphabet_size() const; std::uint64_t alphabet_size() const;
//! Get the maximum length of keywords. //! Get the maximum length of keywords.
inline std::uint64_t max_length() const; std::uint64_t max_length() const;
//! Get the number of trie nodes.
std::uint64_t num_nodes() const;
//! Get the number of DA units.
std::uint64_t num_units() const;
//! Get the number of unused DA units.
std::uint64_t num_free_units() const;
//! Get the length of the TAIL.
std::uint64_t tail_length() const;
//! Lookup the ID of the keyword. //! Lookup the ID of the keyword.
inline std::optional<std::uint64_t> lookup(std::string_view key) const; std::optional<std::uint64_t> lookup(std::string_view key) const;
//! Decode the keyword associated with the ID. //! Decode the keyword associated with the ID.
inline std::string decode(std::uint64_t id) const; std::string decode(std::uint64_t id) const;
//! Decode the keyword associated with the ID, returning it via the output argument `decoded`.
void decode(std::uint64_t id, std::string& decoded) const;
//! An iterator class for common prefix search. //! An iterator class for common prefix search.
class prefix_iterator { class prefix_iterator {
@ -328,24 +377,24 @@ class trie {
//! Increment the iterator. //! Increment the iterator.
//! Return false if the iteration is terminated. //! Return false if the iteration is terminated.
inline bool next(); bool next();
//! Get the result ID. //! Get the result ID.
inline std::uint64_t id() const; std::uint64_t id() const;
//! Get the result keyword. //! Get the result keyword.
inline std::string decoded() const; std::string decoded() const;
//! Get the reference to the result keyword. //! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration. //! Note that the referenced data will be changed in the next iteration.
inline std::string_view decoded_view() const; std::string_view decoded_view() const;
}; };
//! Make the common prefix searcher for the given keyword. //! Make the common prefix searcher for the given keyword.
inline prefix_iterator make_prefix_iterator(std::string_view key) const; prefix_iterator make_prefix_iterator(std::string_view key) const;
//! Perform common prefix search for the keyword. //! Perform common prefix search for the keyword.
inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const; void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for predictive search. //! An iterator class for predictive search.
class predictive_iterator { class predictive_iterator {
@ -354,41 +403,33 @@ class trie {
//! Increment the iterator. //! Increment the iterator.
//! Return false if the iteration is terminated. //! Return false if the iteration is terminated.
inline bool next(); bool next();
//! Get the result ID. //! Get the result ID.
inline std::uint64_t id() const; std::uint64_t id() const;
//! Get the result keyword. //! Get the result keyword.
inline std::string decoded() const; std::string decoded() const;
//! Get the reference to the result keyword. //! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration. //! Note that the referenced data will be changed in the next iteration.
inline std::string_view decoded_view() const; std::string_view decoded_view() const;
}; };
//! Make the predictive searcher for the keyword. //! Make the predictive searcher for the keyword.
inline predictive_iterator make_predictive_iterator(std::string_view key) const { predictive_iterator make_predictive_iterator(std::string_view key) const;
return predictive_iterator(this, key);
}
//! Perform predictive search for the keyword. //! Perform predictive search for the keyword.
inline void predictive_search(std::string_view key, void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
auto itr = make_predictive_iterator(key);
while (itr.next()) {
fn(itr.id(), itr.decoded_view());
}
}
//! An iterator class for enumeration. //! An iterator class for enumeration.
using enumerative_iterator = predictive_iterator; using enumerative_iterator = predictive_iterator;
//! Make the iterator for enumeration. //! Make the iterator for enumeration.
inline enumerative_iterator make_enumerative_iterator() const; enumerative_iterator make_enumerative_iterator() const;
//! Enumerate all the keywords and their IDs stored in the trie. //! Enumerate all the keywords and their IDs stored in the trie.
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const; void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! Visit the members. //! Visit the members.
template <class Visitor> template <class Visitor>

View file

@ -48,7 +48,7 @@ class trie {
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters. //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template <class Strings> template <class Strings>
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
} }
@ -87,6 +87,11 @@ class trie {
return m_bcvec.num_free_units(); return m_bcvec.num_free_units();
} }
//! Get the length of the TAIL.
inline std::uint64_t tail_length() const {
return m_tvec.size();
}
//! Lookup the ID of the keyword. //! Lookup the ID of the keyword.
inline std::optional<std::uint64_t> lookup(std::string_view key) const { inline std::optional<std::uint64_t> lookup(std::string_view key) const {
std::uint64_t kpos = 0, npos = 0; std::uint64_t kpos = 0, npos = 0;

View file

@ -10,40 +10,52 @@ static constexpr int num_trials = 10;
cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv); cmd_line_parser::parser p(argc, argv);
p.add("input_keys", "Input filepath of data keys"); p.add("input_keys", "Input filepath of keywords");
p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false); p.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
p.add("random_seed", "Random seed for sampling (default=13)", "-s", false); p.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p; return p;
} }
auto sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::uint64_t random_seed) { std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
std::vector<std::string_view> sampled_keys(num_samples); std::uint64_t random_seed) {
std::vector<std::uint64_t> sampled_ids(num_samples);
std::mt19937_64 engine(random_seed); std::mt19937_64 engine(random_seed);
std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1); std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);
std::vector<std::string_view> sampled_keys(num_samples);
for (std::uint64_t i = 0; i < num_samples; i++) { for (std::uint64_t i = 0; i < num_samples; i++) {
sampled_ids[i] = dist(engine); sampled_keys[i] = std::string_view(keys[dist(engine)]);
sampled_keys[i] = std::string_view(keys[sampled_ids[i]]);
} }
return sampled_keys;
return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids));
} }
template <class Trie> template <class Trie>
Trie benchmark_build(const std::vector<std::string>& keys) { std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
std::vector<std::uint64_t> sampled_ids(keys.size());
for (std::uint64_t i = 0; i < keys.size(); i++) {
sampled_ids[i] = trie.lookup(keys[i]).value();
}
return sampled_ids;
}
template <class Trie>
Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
const auto start_tp = std::chrono::high_resolution_clock::now(); const auto start_tp = std::chrono::high_resolution_clock::now();
Trie trie(keys); Trie trie(keys, binary_mode);
const auto stop_tp = std::chrono::high_resolution_clock::now(); const auto stop_tp = std::chrono::high_resolution_clock::now();
const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp); const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
const double time_in_sec = dur_ms.count() / 1000.0; const double time_in_sec = dur_ms.count() / 1000.0;
const double memory_in_bytes = xcdat::memory_in_bytes(trie); const double memory_in_bytes = xcdat::memory_in_bytes(trie);
tfm::printfln("Binary mode: %d", trie.bin_mode());
tfm::printfln("Alphabet size: %d", trie.alphabet_size());
tfm::printfln("Max key length: %d", trie.max_length());
tfm::printfln("Number of keys: %d", trie.num_keys());
tfm::printfln("Number of trie nodes: %d", trie.num_nodes()); tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
tfm::printfln("Number of DA units: %d", trie.num_units()); tfm::printfln("Number of DA units: %d", trie.num_units());
tfm::printfln("Number of free DA units: %d", trie.num_free_units()); tfm::printfln("Number of free DA units: %d", trie.num_free_units());
tfm::printfln("TAIL length: %d", trie.tail_length());
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes); tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
tfm::printfln("Construction time in seconds: %g", time_in_sec); tfm::printfln("Construction time in seconds: %g", time_in_sec);
@ -77,16 +89,16 @@ void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& que
template <class Trie> template <class Trie>
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) { void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
// Warmup // Warmup
std::string tmp; volatile std::uint64_t tmp = 0;
for (const std::uint64_t query : queries) { for (const std::uint64_t query : queries) {
trie.decode(query, tmp); tmp += trie.decode(query).size();
} }
// Measure // Measure
const auto start_tp = std::chrono::high_resolution_clock::now(); const auto start_tp = std::chrono::high_resolution_clock::now();
for (int r = 0; r < num_trials; r++) { for (int r = 0; r < num_trials; r++) {
for (const std::uint64_t query : queries) { for (const std::uint64_t query : queries) {
trie.decode(query, tmp); tmp += trie.decode(query).size();
} }
} }
const auto stop_tp = std::chrono::high_resolution_clock::now(); const auto stop_tp = std::chrono::high_resolution_clock::now();
@ -98,12 +110,12 @@ void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& querie
} }
template <class Trie> template <class Trie>
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& q_keys, void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& query_keys, bool binary_mode) {
const std::vector<std::uint64_t>& q_ids) { const auto trie = benchmark_build<Trie>(keys, binary_mode);
const auto trie = benchmark_build<Trie>(keys); const auto query_ids = extract_ids(trie, query_keys);
benchmark_lookup(trie, q_keys); benchmark_lookup(trie, query_keys);
benchmark_decode(trie, q_ids); benchmark_decode(trie, query_ids);
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
@ -120,6 +132,7 @@ int main(int argc, char** argv) {
const auto input_keys = p.get<std::string>("input_keys"); const auto input_keys = p.get<std::string>("input_keys");
const auto num_samples = p.get<std::uint64_t>("num_samples", 1000); const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
const auto random_seed = p.get<std::uint64_t>("random_seed", 13); const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys); auto keys = xcdat::load_strings(input_keys);
if (keys.empty()) { if (keys.empty()) {
@ -127,18 +140,16 @@ int main(int argc, char** argv) {
return 1; return 1;
} }
// To unique
std::sort(keys.begin(), keys.end()); std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
tfm::printfln("Number of keys: %d", keys.size()); const auto query_keys = sample_keys(keys, num_samples, random_seed);
auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed);
tfm::printfln("** xcdat::trie_7_type **"); tfm::printfln("** xcdat::trie_7_type **");
benchmark<xcdat::trie_7_type>(keys, q_keys, q_ids); benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);
tfm::printfln("** xcdat::trie_8_type **"); tfm::printfln("** xcdat::trie_8_type **");
benchmark<xcdat::trie_8_type>(keys, q_keys, q_ids); benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
return 0; return 0;
} }

View file

@ -10,7 +10,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
p.add("input_keys", "Input filepath of data keys"); p.add("input_keys", "Input filepath of data keys");
p.add("output_idx", "Output filepath of trie index"); p.add("output_idx", "Output filepath of trie index");
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false); p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
p.add("to_unique", "Make unique the input keys? (default=1)", "-u", false); p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p; return p;
} }
@ -18,32 +18,23 @@ template <class Trie>
int build(const cmd_line_parser::parser& p) { int build(const cmd_line_parser::parser& p) {
const auto input_keys = p.get<std::string>("input_keys"); const auto input_keys = p.get<std::string>("input_keys");
const auto output_idx = p.get<std::string>("output_idx"); const auto output_idx = p.get<std::string>("output_idx");
const auto to_unique = p.get<bool>("to_unique", true); const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys); auto keys = xcdat::load_strings(input_keys);
if (keys.empty()) { if (keys.empty()) {
tfm::errorfln("Error: The input dataset is empty."); tfm::errorfln("Error: The input dataset is empty.");
} }
if (to_unique) {
std::sort(keys.begin(), keys.end()); std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
}
const auto start_tp = std::chrono::high_resolution_clock::now(); const Trie trie(keys, binary_mode);
const Trie trie(keys);
const auto stop_tp = std::chrono::high_resolution_clock::now();
const double time_in_sec =
std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp).count() / 1000.0;
const double memory_in_bytes = xcdat::memory_in_bytes(trie); const double memory_in_bytes = xcdat::memory_in_bytes(trie);
tfm::printfln("time_in_sec: %g", time_in_sec); tfm::printfln("Number of keys: %d", trie.num_keys());
tfm::printfln("memory_in_bytes: %d", memory_in_bytes); tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
tfm::printfln("number_of_keys: %d", trie.num_keys()); tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
tfm::printfln("alphabet_size: %d", trie.alphabet_size());
tfm::printfln("max_length: %d", trie.max_length());
xcdat::save(trie, output_idx); xcdat::save(trie, output_idx);