add comments etc
This commit is contained in:
parent
693b22eb0b
commit
45682b7a10
176
README.md
176
README.md
|
@ -10,6 +10,14 @@ The double array is known as the fastest trie representation and has been used i
|
||||||
|
|
||||||
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.
|
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.
|
||||||
|
|
||||||
|
### Table of contents
|
||||||
|
|
||||||
|
- [Features](#features)
|
||||||
|
- [Build instructions](#build-instructions)
|
||||||
|
- [Command line tools](#command-line-tools)
|
||||||
|
- [Sample usage](#sample-usage)
|
||||||
|
- [Interface](#interface)
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure.
|
- **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure.
|
||||||
|
@ -21,7 +29,7 @@ Xcdat can implement trie dictionaries in smaller space compared to the other dou
|
||||||
- **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
|
- **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
|
||||||
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
|
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
|
||||||
|
|
||||||
## Build Instructions
|
## Build instructions
|
||||||
|
|
||||||
You can download and compile Xcdat with the following commands.
|
You can download and compile Xcdat with the following commands.
|
||||||
|
|
||||||
|
@ -37,9 +45,69 @@ $ make install
|
||||||
|
|
||||||
## Command line tools
|
## Command line tools
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_build enwiki-latest-all-titles-in-ns0 idx.bin -u 1
|
||||||
|
time_in_sec: 13.449
|
||||||
|
memory_in_bytes: 1.70618e+08
|
||||||
|
memory_in_MiB: 162.714
|
||||||
|
number_of_keys: 15955763
|
||||||
|
alphabet_size: 198
|
||||||
|
max_length: 253
|
||||||
|
```
|
||||||
|
|
||||||
## Sample
|
### Lookup
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_lookup idx.bin
|
||||||
|
Algorithm
|
||||||
|
1255938 Algorithm
|
||||||
|
```
|
||||||
|
|
||||||
|
### Decode
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_decode idx.bin
|
||||||
|
1255938
|
||||||
|
1255938 Algorithm
|
||||||
|
```
|
||||||
|
|
||||||
|
### Common prefix search
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_prefix_search idx.bin
|
||||||
|
Algorithmic
|
||||||
|
6 found
|
||||||
|
57 A
|
||||||
|
798460 Al
|
||||||
|
1138004 Alg
|
||||||
|
1253024 Algo
|
||||||
|
1255938 Algorithm
|
||||||
|
1255931 Algorithmic
|
||||||
|
```
|
||||||
|
|
||||||
|
### Predictive search
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_predictive_search idx.bin -n 3
|
||||||
|
Algorithm
|
||||||
|
263 found
|
||||||
|
1255938 Algorithm
|
||||||
|
1255944 Algorithm's_optimality
|
||||||
|
1255972 Algorithm_(C++)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enumerate
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ xcdat_enumerate idx.bin | head -3
|
||||||
|
0 !
|
||||||
|
107 !!
|
||||||
|
138 !!!
|
||||||
|
```
|
||||||
|
|
||||||
|
## Sample usage
|
||||||
|
|
||||||
```c++
|
```c++
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -168,7 +236,111 @@ Enumerate() = {
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Interface
|
||||||
|
|
||||||
|
### Dictionary class
|
||||||
|
|
||||||
|
```c++
|
||||||
|
template <class BcVector>
|
||||||
|
class trie {
|
||||||
|
public:
|
||||||
|
using trie_type = trie<BcVector>;
|
||||||
|
using bc_vector_type = BcVector;
|
||||||
|
|
||||||
|
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||||
|
|
||||||
|
public:
|
||||||
|
trie() = default;
|
||||||
|
virtual ~trie() = default;
|
||||||
|
trie(const trie&) = delete;
|
||||||
|
trie& operator=(const trie&) = delete;
|
||||||
|
trie(trie&&) noexcept = default;
|
||||||
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
|
template <class Strings>
|
||||||
|
explicit trie(const Strings& keys, bool bin_mode = false);
|
||||||
|
|
||||||
|
inline bool bin_mode() const;
|
||||||
|
|
||||||
|
//! Get the number of stored keywords.
|
||||||
|
inline std::uint64_t num_keys() const;
|
||||||
|
|
||||||
|
//! Get the alphabet size.
|
||||||
|
inline std::uint64_t alphabet_size() const;
|
||||||
|
|
||||||
|
//! Get the maximum length of keywords.
|
||||||
|
inline std::uint64_t max_length() const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given keyword in the trie.
|
||||||
|
* @param[in] key The query keyword.
|
||||||
|
* @return The associated ID if found.
|
||||||
|
*/
|
||||||
|
inline std::optional<std::uint64_t> lookup(std::string_view key) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode the keyword associated with the given ID.
|
||||||
|
* @param[in] id The keyword ID.
|
||||||
|
* @return The keyword associated with the ID.
|
||||||
|
*/
|
||||||
|
inline std::string decode(std::uint64_t id) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An iterator class for common prefix search.
|
||||||
|
*/
|
||||||
|
class prefix_iterator {
|
||||||
|
public:
|
||||||
|
prefix_iterator() = default;
|
||||||
|
|
||||||
|
inline bool next();
|
||||||
|
inline std::uint64_t id() const;
|
||||||
|
inline std::string decoded() const;
|
||||||
|
inline std::string_view decoded_view() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
//! Make the iterator for the prefix search
|
||||||
|
inline prefix_iterator make_prefix_iterator(std::string_view key) const;
|
||||||
|
|
||||||
|
inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An iterator class for predictive search.
|
||||||
|
*/
|
||||||
|
class predictive_iterator {
|
||||||
|
public:
|
||||||
|
predictive_iterator() = default;
|
||||||
|
|
||||||
|
inline bool next();
|
||||||
|
inline std::uint64_t id() const;
|
||||||
|
inline std::string decoded() const;
|
||||||
|
inline std::string_view decoded_view() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||||
|
return predictive_iterator(this, key);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void predictive_search(std::string_view key,
|
||||||
|
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||||
|
auto itr = make_predictive_iterator(key);
|
||||||
|
while (itr.next()) {
|
||||||
|
fn(itr.id(), itr.decoded_view());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
using enumerative_iterator = predictive_iterator;
|
||||||
|
|
||||||
|
inline enumerative_iterator make_enumerative_iterator() const;
|
||||||
|
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||||
|
|
||||||
|
template <class Visitor>
|
||||||
|
void visit(Visitor& visitor);
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### I/O handlers
|
||||||
|
|
||||||
|
## Benchmark
|
||||||
|
|
||||||
## Licensing
|
## Licensing
|
||||||
|
|
||||||
|
|
|
@ -13,6 +13,7 @@ namespace xcdat {
|
||||||
using trie_7_type = trie<bc_vector_7>;
|
using trie_7_type = trie<bc_vector_7>;
|
||||||
using trie_8_type = trie<bc_vector_8>;
|
using trie_8_type = trie<bc_vector_8>;
|
||||||
|
|
||||||
|
//! Map the memory to the trie index.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] Trie mmap(const char* address) {
|
[[maybe_unused]] Trie mmap(const char* address) {
|
||||||
mmap_visitor visitor(address);
|
mmap_visitor visitor(address);
|
||||||
|
@ -26,6 +27,7 @@ template <class Trie>
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Load the trie index from the file.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] Trie load(std::string_view filepath) {
|
[[maybe_unused]] Trie load(std::string_view filepath) {
|
||||||
load_visitor visitor(filepath);
|
load_visitor visitor(filepath);
|
||||||
|
@ -39,6 +41,7 @@ template <class Trie>
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Save the trie index into the file and return the file size in bytes.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
||||||
save_visitor visitor(filepath);
|
save_visitor visitor(filepath);
|
||||||
|
@ -47,6 +50,7 @@ template <class Trie>
|
||||||
return visitor.bytes();
|
return visitor.bytes();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the index size in bytes.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||||
size_visitor visitor;
|
size_visitor visitor;
|
||||||
|
|
|
@ -8,17 +8,7 @@
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
/**
|
//! A compressed string dictionary based on the XOR-compressed double-array trie.
|
||||||
* A compressed string dictionary based on the XOR-compressed double-array trie.
|
|
||||||
*
|
|
||||||
* @par References
|
|
||||||
* - Shunsuke Kanda, Kazuhiro Morita and Masao Fuketa. Compressed Double-array Tries for String Dictionaries
|
|
||||||
* Supporting Fast Lookup. Knowledge and Information Systems (KAIS), 51(3): 1023–1042, 2017.
|
|
||||||
*
|
|
||||||
* @par Links
|
|
||||||
* - https://kampersanda.github.io/pdf/KAIS2017.pdf
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
template <class BcVector>
|
template <class BcVector>
|
||||||
class trie {
|
class trie {
|
||||||
public:
|
public:
|
||||||
|
@ -53,8 +43,14 @@ class trie {
|
||||||
//! Move constructor
|
//! Move constructor
|
||||||
trie& operator=(trie&&) noexcept = default;
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
|
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||||
|
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||||
|
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||||
|
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||||
template <class Strings>
|
template <class Strings>
|
||||||
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {}
|
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||||
|
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||||
|
}
|
||||||
|
|
||||||
//! Check the binary mode.
|
//! Check the binary mode.
|
||||||
inline bool bin_mode() const {
|
inline bool bin_mode() const {
|
||||||
|
@ -76,11 +72,7 @@ class trie {
|
||||||
return m_table.max_length();
|
return m_table.max_length();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
//! Lookup the ID of the keyword.
|
||||||
* Search the given keyword in the trie.
|
|
||||||
* @param[in] key The query keyword.
|
|
||||||
* @return The associated ID if found.
|
|
||||||
*/
|
|
||||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
||||||
std::uint64_t kpos = 0, npos = 0;
|
std::uint64_t kpos = 0, npos = 0;
|
||||||
while (!m_bcvec.is_leaf(npos)) {
|
while (!m_bcvec.is_leaf(npos)) {
|
||||||
|
@ -104,11 +96,7 @@ class trie {
|
||||||
return npos_to_id(npos);
|
return npos_to_id(npos);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
//! Decode the keyword associated with the ID.
|
||||||
* Decode the keyword associated with the given ID.
|
|
||||||
* @param[in] id The keyword ID.
|
|
||||||
* @return The keyword associated with the ID.
|
|
||||||
*/
|
|
||||||
inline std::string decode(std::uint64_t id) const {
|
inline std::string decode(std::uint64_t id) const {
|
||||||
if (num_keys() <= id) {
|
if (num_keys() <= id) {
|
||||||
return {};
|
return {};
|
||||||
|
@ -133,9 +121,7 @@ class trie {
|
||||||
return decoded;
|
return decoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
//! An iterator class for common prefix search.
|
||||||
* An iterator class for common prefix search.
|
|
||||||
*/
|
|
||||||
class prefix_iterator {
|
class prefix_iterator {
|
||||||
private:
|
private:
|
||||||
const trie_type* m_obj = nullptr;
|
const trie_type* m_obj = nullptr;
|
||||||
|
@ -149,16 +135,24 @@ class trie {
|
||||||
public:
|
public:
|
||||||
prefix_iterator() = default;
|
prefix_iterator() = default;
|
||||||
|
|
||||||
|
//! Get the next result.
|
||||||
|
//! If not found, false will be returned.
|
||||||
inline bool next() {
|
inline bool next() {
|
||||||
return m_obj != nullptr && m_obj->next_prefix(this);
|
return m_obj != nullptr && m_obj->next_prefix(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the ID.
|
||||||
inline std::uint64_t id() const {
|
inline std::uint64_t id() const {
|
||||||
return m_id;
|
return m_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the keyword.
|
||||||
inline std::string decoded() const {
|
inline std::string decoded() const {
|
||||||
return std::string(m_key.data(), m_kpos);
|
return std::string(m_key.data(), m_kpos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the reference to the keyword.
|
||||||
|
//! Note that the referenced data will be changed in the next step.
|
||||||
inline std::string_view decoded_view() const {
|
inline std::string_view decoded_view() const {
|
||||||
return std::string_view(m_key.data(), m_kpos);
|
return std::string_view(m_key.data(), m_kpos);
|
||||||
}
|
}
|
||||||
|
@ -169,11 +163,12 @@ class trie {
|
||||||
friend class trie;
|
friend class trie;
|
||||||
};
|
};
|
||||||
|
|
||||||
//! Make the iterator for the prefix search
|
//! Make the common prefix searcher for the given keyword.
|
||||||
inline prefix_iterator make_prefix_iterator(std::string_view key) const {
|
inline prefix_iterator make_prefix_iterator(std::string_view key) const {
|
||||||
return prefix_iterator(this, key);
|
return prefix_iterator(this, key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Perform common prefix search for the keyword.
|
||||||
inline void prefix_search(std::string_view key,
|
inline void prefix_search(std::string_view key,
|
||||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||||
auto itr = make_prefix_iterator(key);
|
auto itr = make_prefix_iterator(key);
|
||||||
|
@ -182,9 +177,7 @@ class trie {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
//! An iterator class for predictive search.
|
||||||
* An iterator class for predictive search.
|
|
||||||
*/
|
|
||||||
class predictive_iterator {
|
class predictive_iterator {
|
||||||
public:
|
public:
|
||||||
struct cursor_type {
|
struct cursor_type {
|
||||||
|
@ -205,16 +198,24 @@ class trie {
|
||||||
public:
|
public:
|
||||||
predictive_iterator() = default;
|
predictive_iterator() = default;
|
||||||
|
|
||||||
|
//! Get the next result.
|
||||||
|
//! If not found, false will be returned.
|
||||||
inline bool next() {
|
inline bool next() {
|
||||||
return m_obj != nullptr && m_obj->next_predictive(this);
|
return m_obj != nullptr && m_obj->next_predictive(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the ID.
|
||||||
inline std::uint64_t id() const {
|
inline std::uint64_t id() const {
|
||||||
return m_id;
|
return m_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the keyword.
|
||||||
inline std::string decoded() const {
|
inline std::string decoded() const {
|
||||||
return m_decoded;
|
return m_decoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the reference to the keyword.
|
||||||
|
//! Note that the referenced data will be changed in the next step.
|
||||||
inline std::string_view decoded_view() const {
|
inline std::string_view decoded_view() const {
|
||||||
return m_decoded;
|
return m_decoded;
|
||||||
}
|
}
|
||||||
|
@ -225,10 +226,12 @@ class trie {
|
||||||
friend class trie;
|
friend class trie;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//! Make the predictive searcher for the keyword.
|
||||||
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||||
return predictive_iterator(this, key);
|
return predictive_iterator(this, key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Perform predictive search for the keyword.
|
||||||
inline void predictive_search(std::string_view key,
|
inline void predictive_search(std::string_view key,
|
||||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||||
auto itr = make_predictive_iterator(key);
|
auto itr = make_predictive_iterator(key);
|
||||||
|
@ -237,12 +240,15 @@ class trie {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! An iterator class for enumeration.
|
||||||
using enumerative_iterator = predictive_iterator;
|
using enumerative_iterator = predictive_iterator;
|
||||||
|
|
||||||
|
//! Make the enumerator.
|
||||||
inline enumerative_iterator make_enumerative_iterator() const {
|
inline enumerative_iterator make_enumerative_iterator() const {
|
||||||
return enumerative_iterator(this, "");
|
return enumerative_iterator(this, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||||
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||||
auto itr = make_enumerative_iterator();
|
auto itr = make_enumerative_iterator();
|
||||||
while (itr.next()) {
|
while (itr.next()) {
|
||||||
|
|
|
@ -26,13 +26,11 @@ int predictive_search(const cmd_line_parser::parser& p) {
|
||||||
std::vector<result_type> results;
|
std::vector<result_type> results;
|
||||||
results.reserve(1ULL << 10);
|
results.reserve(1ULL << 10);
|
||||||
|
|
||||||
for (std::string str; std::getline(std::cin, str);) {
|
for (std::string key; std::getline(std::cin, key);) {
|
||||||
results.clear();
|
results.clear();
|
||||||
|
trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) {
|
||||||
auto itr = trie.make_predictive_iterator(str);
|
results.push_back({id, std::string(str)});
|
||||||
while (itr.next()) {
|
});
|
||||||
results.push_back({itr.id(), itr.decoded()});
|
|
||||||
}
|
|
||||||
|
|
||||||
tfm::printfln("%d found", results.size());
|
tfm::printfln("%d found", results.size());
|
||||||
for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) {
|
for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) {
|
||||||
|
|
|
@ -25,13 +25,9 @@ int prefix_search(const cmd_line_parser::parser& p) {
|
||||||
std::vector<result_type> results;
|
std::vector<result_type> results;
|
||||||
results.reserve(trie.max_length());
|
results.reserve(trie.max_length());
|
||||||
|
|
||||||
for (std::string str; std::getline(std::cin, str);) {
|
for (std::string key; std::getline(std::cin, key);) {
|
||||||
results.clear();
|
results.clear();
|
||||||
|
trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) { results.push_back({id, str}); });
|
||||||
auto itr = trie.make_prefix_iterator(str);
|
|
||||||
while (itr.next()) {
|
|
||||||
results.push_back({itr.id(), itr.decoded_view()});
|
|
||||||
}
|
|
||||||
|
|
||||||
tfm::printfln("%d found", results.size());
|
tfm::printfln("%d found", results.size());
|
||||||
for (const auto& r : results) {
|
for (const auto& r : results) {
|
||||||
|
|
Loading…
Reference in a new issue