This commit is contained in:
Shunsuke Kanda 2021-07-02 07:05:06 +09:00
parent b798603930
commit af9731e6c2
4 changed files with 127 additions and 79 deletions

111
README.md
View file

@ -58,12 +58,10 @@ It builds the trie index from a given dataset consisting of keywords separated b
```
$ xcdat_build enwiki-titles.txt idx.bin
time_in_sec: 13.449
memory_in_bytes: 1.70618e+08
memory_in_MiB: 162.714
number_of_keys: 15955763
alphabet_size: 198
max_length: 253
Number of keys: 15955763
Number of trie nodes: 36441058
Memory usage in bytes: 1.70618e+08
Memory usage in MiB: 162.714
```
### `xcdat_lookup`
@ -128,6 +126,42 @@ $ xcdat_enumerate idx.bin | head -3
138 !!!
```
### `xcdat_benchmark`
It measures the performance of Xcdat for a given dataset.
```
$ xcdat_benchmark enwiki-titles.txt
** xcdat::trie_7_type **
Binary mode: 0
Alphabet size: 198
Max key length: 253
Number of keys: 15955763
Number of trie nodes: 36441058
Number of DA units: 36520704
Number of free DA units: 79646
TAIL length: 30776290
Memory usage in bytes: 1.70618e+08
Memory usage in MiB: 162.714
Construction time in seconds: 11.828
Lookup time in microsec/query: 0.8259
Decode time in microsec/query: 1.4545
** xcdat::trie_8_type **
Binary mode: 0
Alphabet size: 198
Max key length: 253
Number of keys: 15955763
Number of trie nodes: 36441035
Number of DA units: 36515840
Number of free DA units: 74805
TAIL length: 30776290
Memory usage in bytes: 1.64104e+08
Memory usage in MiB: 156.502
Construction time in seconds: 11.966
Lookup time in microsec/query: 0.844
Decode time in microsec/query: 1.0029
```
## Sample usage
`sample/sample.cpp` provides a sample usage.
@ -301,25 +335,40 @@ class trie {
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template <class Strings>
explicit trie(const Strings& keys, bool bin_mode = false);
trie(const Strings& keys, bool bin_mode = false);
//! Check whether the trie is in binary mode.
inline bool bin_mode() const;
bool bin_mode() const;
//! Get the number of stored keywords.
inline std::uint64_t num_keys() const;
std::uint64_t num_keys() const;
//! Get the alphabet size.
inline std::uint64_t alphabet_size() const;
std::uint64_t alphabet_size() const;
//! Get the maximum length of keywords.
inline std::uint64_t max_length() const;
std::uint64_t max_length() const;
//! Get the number of trie nodes.
std::uint64_t num_nodes() const;
//! Get the number of DA units.
std::uint64_t num_units() const;
//! Get the number of unused DA units.
std::uint64_t num_free_units() const;
//! Get the length of the TAIL.
std::uint64_t tail_length() const;
//! Lookup the ID of the keyword.
inline std::optional<std::uint64_t> lookup(std::string_view key) const;
std::optional<std::uint64_t> lookup(std::string_view key) const;
//! Decode the keyword associated with the ID.
inline std::string decode(std::uint64_t id) const;
std::string decode(std::uint64_t id) const;
//! Decode the keyword associated with the ID.
void decode(std::uint64_t id, std::string& decoded) const;
//! An iterator class for common prefix search.
class prefix_iterator {
@ -328,24 +377,24 @@ class trie {
//! Increment the iterator.
//! Return false if the iteration is terminated.
inline bool next();
bool next();
//! Get the result ID.
inline std::uint64_t id() const;
std::uint64_t id() const;
//! Get the result keyword.
inline std::string decoded() const;
std::string decoded() const;
//! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration.
inline std::string_view decoded_view() const;
std::string_view decoded_view() const;
};
//! Make the common prefix searcher for the given keyword.
inline prefix_iterator make_prefix_iterator(std::string_view key) const;
prefix_iterator make_prefix_iterator(std::string_view key) const;
//! Perform common prefix search for the keyword.
inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for predictive search.
class predictive_iterator {
@ -354,41 +403,33 @@ class trie {
//! Increment the iterator.
//! Return false if the iteration is terminated.
inline bool next();
bool next();
//! Get the result ID.
inline std::uint64_t id() const;
std::uint64_t id() const;
//! Get the result keyword.
inline std::string decoded() const;
std::string decoded() const;
//! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration.
inline std::string_view decoded_view() const;
std::string_view decoded_view() const;
};
//! Make the predictive searcher for the keyword.
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
return predictive_iterator(this, key);
}
predictive_iterator make_predictive_iterator(std::string_view key) const;
//! Perform predictive search for the keyword.
inline void predictive_search(std::string_view key,
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
auto itr = make_predictive_iterator(key);
while (itr.next()) {
fn(itr.id(), itr.decoded_view());
}
}
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for enumeration.
using enumerative_iterator = predictive_iterator;
//! Make the iterator for enumeration.
inline enumerative_iterator make_enumerative_iterator() const;
enumerative_iterator make_enumerative_iterator() const;
//! Enumerate all the keywords and their IDs stored in the trie.
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! Visit the members.
template <class Visitor>

View file

@ -48,7 +48,7 @@ class trie {
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template <class Strings>
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
}
@ -87,6 +87,11 @@ class trie {
return m_bcvec.num_free_units();
}
//! Get the length of the TAIL, i.e., the number of elements stored in m_tvec.
inline std::uint64_t tail_length() const {
return m_tvec.size();
}
//! Lookup the ID of the keyword.
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
std::uint64_t kpos = 0, npos = 0;

View file

@ -10,40 +10,52 @@ static constexpr int num_trials = 10;
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_keys", "Input filepath of data keys");
p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false);
p.add("input_keys", "Input filepath of keywords");
p.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
p.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p;
}
auto sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::uint64_t random_seed) {
std::vector<std::string_view> sampled_keys(num_samples);
std::vector<std::uint64_t> sampled_ids(num_samples);
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
std::uint64_t random_seed) {
std::mt19937_64 engine(random_seed);
std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);
std::vector<std::string_view> sampled_keys(num_samples);
for (std::uint64_t i = 0; i < num_samples; i++) {
sampled_ids[i] = dist(engine);
sampled_keys[i] = std::string_view(keys[sampled_ids[i]]);
sampled_keys[i] = std::string_view(keys[dist(engine)]);
}
return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids));
return sampled_keys;
}
template <class Trie>
Trie benchmark_build(const std::vector<std::string>& keys) {
std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
std::vector<std::uint64_t> sampled_ids(keys.size());
for (std::uint64_t i = 0; i < keys.size(); i++) {
sampled_ids[i] = trie.lookup(keys[i]).value();
}
return sampled_ids;
}
template <class Trie>
Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
const auto start_tp = std::chrono::high_resolution_clock::now();
Trie trie(keys);
Trie trie(keys, binary_mode);
const auto stop_tp = std::chrono::high_resolution_clock::now();
const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
const double time_in_sec = dur_ms.count() / 1000.0;
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
tfm::printfln("Binary mode: %d", trie.bin_mode());
tfm::printfln("Alphabet size: %d", trie.alphabet_size());
tfm::printfln("Max key length: %d", trie.max_length());
tfm::printfln("Number of keys: %d", trie.num_keys());
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
tfm::printfln("Number of DA units: %d", trie.num_units());
tfm::printfln("Number of free DA units: %d", trie.num_free_units());
tfm::printfln("TAIL length: %d", trie.tail_length());
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
tfm::printfln("Construction time in seconds: %g", time_in_sec);
@ -77,16 +89,16 @@ void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& que
template <class Trie>
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
// Warmup
std::string tmp;
volatile std::uint64_t tmp = 0;
for (const std::uint64_t query : queries) {
trie.decode(query, tmp);
tmp += trie.decode(query).size();
}
// Measure
const auto start_tp = std::chrono::high_resolution_clock::now();
for (int r = 0; r < num_trials; r++) {
for (const std::uint64_t query : queries) {
trie.decode(query, tmp);
tmp += trie.decode(query).size();
}
}
const auto stop_tp = std::chrono::high_resolution_clock::now();
@ -98,12 +110,12 @@ void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& querie
}
template <class Trie>
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& q_keys,
const std::vector<std::uint64_t>& q_ids) {
const auto trie = benchmark_build<Trie>(keys);
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& query_keys, bool binary_mode) {
const auto trie = benchmark_build<Trie>(keys, binary_mode);
const auto query_ids = extract_ids(trie, query_keys);
benchmark_lookup(trie, q_keys);
benchmark_decode(trie, q_ids);
benchmark_lookup(trie, query_keys);
benchmark_decode(trie, query_ids);
}
int main(int argc, char** argv) {
@ -120,6 +132,7 @@ int main(int argc, char** argv) {
const auto input_keys = p.get<std::string>("input_keys");
const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys);
if (keys.empty()) {
@ -127,18 +140,16 @@ int main(int argc, char** argv) {
return 1;
}
// To unique
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
tfm::printfln("Number of keys: %d", keys.size());
auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed);
const auto query_keys = sample_keys(keys, num_samples, random_seed);
tfm::printfln("** xcdat::trie_7_type **");
benchmark<xcdat::trie_7_type>(keys, q_keys, q_ids);
benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);
tfm::printfln("** xcdat::trie_8_type **");
benchmark<xcdat::trie_8_type>(keys, q_keys, q_ids);
benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
return 0;
}

View file

@ -10,7 +10,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
p.add("input_keys", "Input filepath of data keys");
p.add("output_idx", "Output filepath of trie index");
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
p.add("to_unique", "Make unique the input keys? (default=1)", "-u", false);
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p;
}
@ -18,32 +18,23 @@ template <class Trie>
int build(const cmd_line_parser::parser& p) {
const auto input_keys = p.get<std::string>("input_keys");
const auto output_idx = p.get<std::string>("output_idx");
const auto to_unique = p.get<bool>("to_unique", true);
const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys);
if (keys.empty()) {
tfm::errorfln("Error: The input dataset is empty.");
}
if (to_unique) {
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
}
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
const auto start_tp = std::chrono::high_resolution_clock::now();
const Trie trie(keys);
const auto stop_tp = std::chrono::high_resolution_clock::now();
const double time_in_sec =
std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp).count() / 1000.0;
const Trie trie(keys, binary_mode);
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
tfm::printfln("time_in_sec: %g", time_in_sec);
tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
tfm::printfln("number_of_keys: %d", trie.num_keys());
tfm::printfln("alphabet_size: %d", trie.alphabet_size());
tfm::printfln("max_length: %d", trie.max_length());
tfm::printfln("Number of keys: %d", trie.num_keys());
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
xcdat::save(trie, output_idx);