add
This commit is contained in:
parent
b798603930
commit
af9731e6c2
111
README.md
111
README.md
|
@ -58,12 +58,10 @@ It builds the trie index from a given dataset consisting of keywords separated b
|
|||
|
||||
```
|
||||
$ xcdat_build enwiki-titles.txt idx.bin
|
||||
time_in_sec: 13.449
|
||||
memory_in_bytes: 1.70618e+08
|
||||
memory_in_MiB: 162.714
|
||||
number_of_keys: 15955763
|
||||
alphabet_size: 198
|
||||
max_length: 253
|
||||
Number of keys: 15955763
|
||||
Number of trie nodes: 36441058
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
```
|
||||
|
||||
### `xcdat_lookup`
|
||||
|
@ -128,6 +126,42 @@ $ xcdat_enumerate idx.bin | head -3
|
|||
138 !!!
|
||||
```
|
||||
|
||||
### `xcdat_benchmark`
|
||||
|
||||
It measures the performance of Xcdat for a given dataset.
|
||||
|
||||
```
|
||||
$ xcdat_benchmark enwiki-titles.txt
|
||||
** xcdat::trie_7_type **
|
||||
Binary mode: 0
|
||||
Alphabet size: 198
|
||||
Max key length: 253
|
||||
Number of keys: 15955763
|
||||
Number of trie nodes: 36441058
|
||||
Number of DA units: 36520704
|
||||
Number of free DA units: 79646
|
||||
TAIL length: 30776290
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
Construction time in seconds: 11.828
|
||||
Lookup time in microsec/query: 0.8259
|
||||
Decode time in microsec/query: 1.4545
|
||||
** xcdat::trie_8_type **
|
||||
Binary mode: 0
|
||||
Alphabet size: 198
|
||||
Max key length: 253
|
||||
Number of keys: 15955763
|
||||
Number of trie nodes: 36441035
|
||||
Number of DA units: 36515840
|
||||
Number of free DA units: 74805
|
||||
TAIL length: 30776290
|
||||
Memory usage in bytes: 1.64104e+08
|
||||
Memory usage in MiB: 156.502
|
||||
Construction time in seconds: 11.966
|
||||
Lookup time in microsec/query: 0.844
|
||||
Decode time in microsec/query: 1.0029
|
||||
```
|
||||
|
||||
## Sample usage
|
||||
|
||||
`sample/sample.cpp` provides a sample usage.
|
||||
|
@ -301,25 +335,40 @@ class trie {
|
|||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||
template <class Strings>
|
||||
explicit trie(const Strings& keys, bool bin_mode = false);
|
||||
trie(const Strings& keys, bool bin_mode = false);
|
||||
|
||||
//! Check if the binary mode is enabled.
|
||||
inline bool bin_mode() const;
|
||||
bool bin_mode() const;
|
||||
|
||||
//! Get the number of stored keywords.
|
||||
inline std::uint64_t num_keys() const;
|
||||
std::uint64_t num_keys() const;
|
||||
|
||||
//! Get the alphabet size.
|
||||
inline std::uint64_t alphabet_size() const;
|
||||
std::uint64_t alphabet_size() const;
|
||||
|
||||
//! Get the maximum length of keywords.
|
||||
inline std::uint64_t max_length() const;
|
||||
std::uint64_t max_length() const;
|
||||
|
||||
//! Get the number of trie nodes.
|
||||
std::uint64_t num_nodes() const;
|
||||
|
||||
//! Get the number of DA units.
|
||||
std::uint64_t num_units() const;
|
||||
|
||||
//! Get the number of unused DA units.
|
||||
std::uint64_t num_free_units() const;
|
||||
|
||||
//! Get the length of TAIL.
|
||||
std::uint64_t tail_length() const;
|
||||
|
||||
//! Lookup the ID of the keyword.
|
||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const;
|
||||
std::optional<std::uint64_t> lookup(std::string_view key) const;
|
||||
|
||||
//! Decode the keyword associated with the ID.
|
||||
inline std::string decode(std::uint64_t id) const;
|
||||
std::string decode(std::uint64_t id) const;
|
||||
|
||||
//! Decode the keyword associated with the ID.
|
||||
void decode(std::uint64_t id, std::string& decoded) const;
|
||||
|
||||
//! An iterator class for common prefix search.
|
||||
class prefix_iterator {
|
||||
|
@ -328,24 +377,24 @@ class trie {
|
|||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
inline bool next();
|
||||
bool next();
|
||||
|
||||
//! Get the result ID.
|
||||
inline std::uint64_t id() const;
|
||||
std::uint64_t id() const;
|
||||
|
||||
//! Get the result keyword.
|
||||
inline std::string decoded() const;
|
||||
std::string decoded() const;
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
inline std::string_view decoded_view() const;
|
||||
std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
//! Make the common prefix searcher for the given keyword.
|
||||
inline prefix_iterator make_prefix_iterator(std::string_view key) const;
|
||||
prefix_iterator make_prefix_iterator(std::string_view key) const;
|
||||
|
||||
//! Perform common prefix search for the keyword.
|
||||
inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! An iterator class for predictive search.
|
||||
class predictive_iterator {
|
||||
|
@ -354,41 +403,33 @@ class trie {
|
|||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
inline bool next();
|
||||
bool next();
|
||||
|
||||
//! Get the result ID.
|
||||
inline std::uint64_t id() const;
|
||||
std::uint64_t id() const;
|
||||
|
||||
//! Get the result keyword.
|
||||
inline std::string decoded() const;
|
||||
std::string decoded() const;
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
inline std::string_view decoded_view() const;
|
||||
std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
//! Make the predictive searcher for the keyword.
|
||||
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||
return predictive_iterator(this, key);
|
||||
}
|
||||
predictive_iterator make_predictive_iterator(std::string_view key) const;
|
||||
|
||||
//! Perform predictive search for the keyword.
|
||||
inline void predictive_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_predictive_iterator(key);
|
||||
while (itr.next()) {
|
||||
fn(itr.id(), itr.decoded_view());
|
||||
}
|
||||
}
|
||||
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! An iterator class for enumeration.
|
||||
using enumerative_iterator = predictive_iterator;
|
||||
|
||||
//! Make an iterator for enumeration.
|
||||
inline enumerative_iterator make_enumerative_iterator() const;
|
||||
enumerative_iterator make_enumerative_iterator() const;
|
||||
|
||||
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! Visit the members.
|
||||
template <class Visitor>
|
||||
|
|
|
@ -48,7 +48,7 @@ class trie {
|
|||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||
template <class Strings>
|
||||
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||
}
|
||||
|
||||
|
@ -87,6 +87,11 @@ class trie {
|
|||
return m_bcvec.num_free_units();
|
||||
}
|
||||
|
||||
//! Get the length of TAIL.
|
||||
inline std::uint64_t tail_length() const {
|
||||
return m_tvec.size();
|
||||
}
|
||||
|
||||
//! Lookup the ID of the keyword.
|
||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
||||
std::uint64_t kpos = 0, npos = 0;
|
||||
|
|
|
@ -10,40 +10,52 @@ static constexpr int num_trials = 10;
|
|||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_keys", "Input filepath of data keys");
|
||||
p.add("num_samples", "Number of sample keys for benchmark (default=1000)", "-n", false);
|
||||
p.add("input_keys", "Input filepath of keywords");
|
||||
p.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
|
||||
p.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
|
||||
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
|
||||
return p;
|
||||
}
|
||||
|
||||
auto sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::uint64_t random_seed) {
|
||||
std::vector<std::string_view> sampled_keys(num_samples);
|
||||
std::vector<std::uint64_t> sampled_ids(num_samples);
|
||||
|
||||
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
|
||||
std::uint64_t random_seed) {
|
||||
std::mt19937_64 engine(random_seed);
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);
|
||||
|
||||
std::vector<std::string_view> sampled_keys(num_samples);
|
||||
for (std::uint64_t i = 0; i < num_samples; i++) {
|
||||
sampled_ids[i] = dist(engine);
|
||||
sampled_keys[i] = std::string_view(keys[sampled_ids[i]]);
|
||||
sampled_keys[i] = std::string_view(keys[dist(engine)]);
|
||||
}
|
||||
|
||||
return std::make_tuple(std::move(sampled_keys), std::move(sampled_ids));
|
||||
return sampled_keys;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
Trie benchmark_build(const std::vector<std::string>& keys) {
|
||||
std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
|
||||
std::vector<std::uint64_t> sampled_ids(keys.size());
|
||||
for (std::uint64_t i = 0; i < keys.size(); i++) {
|
||||
sampled_ids[i] = trie.lookup(keys[i]).value();
|
||||
}
|
||||
return sampled_ids;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
Trie trie(keys);
|
||||
Trie trie(keys, binary_mode);
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
|
||||
const double time_in_sec = dur_ms.count() / 1000.0;
|
||||
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
|
||||
|
||||
tfm::printfln("Binary mode: %d", trie.bin_mode());
|
||||
tfm::printfln("Alphabet size: %d", trie.alphabet_size());
|
||||
tfm::printfln("Max key length: %d", trie.max_length());
|
||||
tfm::printfln("Number of keys: %d", trie.num_keys());
|
||||
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
|
||||
tfm::printfln("Number of DA units: %d", trie.num_units());
|
||||
tfm::printfln("Number of free DA units: %d", trie.num_free_units());
|
||||
tfm::printfln("TAIL length: %d", trie.tail_length());
|
||||
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
tfm::printfln("Construction time in seconds: %g", time_in_sec);
|
||||
|
@ -77,16 +89,16 @@ void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& que
|
|||
template <class Trie>
|
||||
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
|
||||
// Warmup
|
||||
std::string tmp;
|
||||
volatile std::uint64_t tmp = 0;
|
||||
for (const std::uint64_t query : queries) {
|
||||
trie.decode(query, tmp);
|
||||
tmp += trie.decode(query).size();
|
||||
}
|
||||
|
||||
// Measure
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
for (int r = 0; r < num_trials; r++) {
|
||||
for (const std::uint64_t query : queries) {
|
||||
trie.decode(query, tmp);
|
||||
tmp += trie.decode(query).size();
|
||||
}
|
||||
}
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
@ -98,12 +110,12 @@ void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& querie
|
|||
}
|
||||
|
||||
template <class Trie>
|
||||
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& q_keys,
|
||||
const std::vector<std::uint64_t>& q_ids) {
|
||||
const auto trie = benchmark_build<Trie>(keys);
|
||||
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& query_keys, bool binary_mode) {
|
||||
const auto trie = benchmark_build<Trie>(keys, binary_mode);
|
||||
const auto query_ids = extract_ids(trie, query_keys);
|
||||
|
||||
benchmark_lookup(trie, q_keys);
|
||||
benchmark_decode(trie, q_ids);
|
||||
benchmark_lookup(trie, query_keys);
|
||||
benchmark_decode(trie, query_ids);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
@ -120,6 +132,7 @@ int main(int argc, char** argv) {
|
|||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
|
||||
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
|
@ -127,18 +140,16 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// To unique
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
tfm::printfln("Number of keys: %d", keys.size());
|
||||
auto [q_keys, q_ids] = sample_keys(keys, num_samples, random_seed);
|
||||
const auto query_keys = sample_keys(keys, num_samples, random_seed);
|
||||
|
||||
tfm::printfln("** xcdat::trie_7_type **");
|
||||
benchmark<xcdat::trie_7_type>(keys, q_keys, q_ids);
|
||||
benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);
|
||||
|
||||
tfm::printfln("** xcdat::trie_8_type **");
|
||||
benchmark<xcdat::trie_8_type>(keys, q_keys, q_ids);
|
||||
benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -10,7 +10,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
|||
p.add("input_keys", "Input filepath of data keys");
|
||||
p.add("output_idx", "Output filepath of trie index");
|
||||
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
|
||||
p.add("to_unique", "Make unique the input keys? (default=1)", "-u", false);
|
||||
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -18,32 +18,23 @@ template <class Trie>
|
|||
int build(const cmd_line_parser::parser& p) {
|
||||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto output_idx = p.get<std::string>("output_idx");
|
||||
const auto to_unique = p.get<bool>("to_unique", true);
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
tfm::errorfln("Error: The input dataset is empty.");
|
||||
}
|
||||
|
||||
if (to_unique) {
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
}
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
const Trie trie(keys);
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const double time_in_sec =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp).count() / 1000.0;
|
||||
const Trie trie(keys, binary_mode);
|
||||
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
|
||||
|
||||
tfm::printfln("time_in_sec: %g", time_in_sec);
|
||||
tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
tfm::printfln("number_of_keys: %d", trie.num_keys());
|
||||
tfm::printfln("alphabet_size: %d", trie.alphabet_size());
|
||||
tfm::printfln("max_length: %d", trie.max_length());
|
||||
tfm::printfln("Number of keys: %d", trie.num_keys());
|
||||
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
|
||||
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
|
||||
xcdat::save(trie, output_idx);
|
||||
|
||||
|
|
Loading…
Reference in a new issue