add comments etc

This commit is contained in:
Shunsuke Kanda 2021-06-29 15:39:26 +09:00
parent 693b22eb0b
commit 45682b7a10
5 changed files with 219 additions and 43 deletions

176
README.md
View file

@ -10,6 +10,14 @@ The double array is known as the fastest trie representation and has been used i
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage. Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.
### Table of contents
- [Features](#features)
- [Build instructions](#build-instructions)
- [Command line tools](#command-line-tools)
- [Sample usage](#sample-usage)
- [Interface](#interface)
## Features ## Features
- **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure. - **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure.
@ -21,7 +29,7 @@ Xcdat can implement trie dictionaries in smaller space compared to the other dou
- **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression. - **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on. - **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
## Build Instructions ## Build instructions
You can download and compile Xcdat with the following commands. You can download and compile Xcdat with the following commands.
@ -37,9 +45,69 @@ $ make install
## Command line tools ## Command line tools
### Build
```sh
$ xcdat_build enwiki-latest-all-titles-in-ns0 idx.bin -u 1
time_in_sec: 13.449
memory_in_bytes: 1.70618e+08
memory_in_MiB: 162.714
number_of_keys: 15955763
alphabet_size: 198
max_length: 253
```
## Sample ### Lookup
```sh
$ xcdat_lookup idx.bin
Algorithm
1255938 Algorithm
```
### Decode
```sh
$ xcdat_decode idx.bin
1255938
1255938 Algorithm
```
### Common prefix search
```sh
$ xcdat_prefix_search idx.bin
Algorithmic
6 found
57 A
798460 Al
1138004 Alg
1253024 Algo
1255938 Algorithm
1255931 Algorithmic
```
### Predictive search
```sh
$ xcdat_predictive_search idx.bin -n 3
Algorithm
263 found
1255938 Algorithm
1255944 Algorithm's_optimality
1255972 Algorithm_(C++)
```
### Enumerate
```sh
$ xcdat_enumerate idx.bin | head -3
0 !
107 !!
138 !!!
```
## Sample usage
```c++ ```c++
#include <iostream> #include <iostream>
@ -168,7 +236,111 @@ Enumerate() = {
} }
``` ```
## Interface
### Dictionary class
```c++
//! A compressed string dictionary based on the XOR-compressed double-array trie.
template <class BcVector>
class trie {
  public:
    using trie_type = trie<BcVector>;
    using bc_vector_type = BcVector;

    static constexpr auto l1_bits = bc_vector_type::l1_bits;

  public:
    //! Default constructor
    trie() = default;

    //! Default destructor
    virtual ~trie() = default;

    //! Copy constructor (deleted)
    trie(const trie&) = delete;

    //! Copy assignment operator (deleted)
    trie& operator=(const trie&) = delete;

    //! Move constructor
    trie(trie&&) noexcept = default;

    //! Move assignment operator
    trie& operator=(trie&&) noexcept = default;

    //! Build the trie from the input keywords, which are lexicographically sorted and unique.
    //! If bin_mode = false, the NULL character is used for the termination of a keyword.
    //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
    //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
    template <class Strings>
    explicit trie(const Strings& keys, bool bin_mode = false);

    //! Check the binary mode.
    inline bool bin_mode() const;

    //! Get the number of stored keywords.
    inline std::uint64_t num_keys() const;

    //! Get the alphabet size.
    inline std::uint64_t alphabet_size() const;

    //! Get the maximum length of keywords.
    inline std::uint64_t max_length() const;

    /**
     * Search the given keyword in the trie.
     * @param[in] key The query keyword.
     * @return The associated ID if found.
     */
    inline std::optional<std::uint64_t> lookup(std::string_view key) const;

    /**
     * Decode the keyword associated with the given ID.
     * @param[in] id The keyword ID.
     * @return The keyword associated with the ID.
     */
    inline std::string decode(std::uint64_t id) const;

    /**
     * An iterator class for common prefix search.
     */
    class prefix_iterator {
      public:
        prefix_iterator() = default;

        //! Get the next result.
        //! If not found, false will be returned.
        inline bool next();

        //! Get the ID.
        inline std::uint64_t id() const;

        //! Get the keyword.
        inline std::string decoded() const;

        //! Get the reference to the keyword.
        //! Note that the referenced data will be changed in the next step.
        inline std::string_view decoded_view() const;
    };

    //! Make the common prefix searcher for the given keyword.
    inline prefix_iterator make_prefix_iterator(std::string_view key) const;

    //! Perform common prefix search for the keyword.
    //! fn receives the ID and the keyword of each result.
    inline void prefix_search(std::string_view key,
                              const std::function<void(std::uint64_t, std::string_view)>& fn) const;

    /**
     * An iterator class for predictive search.
     */
    class predictive_iterator {
      public:
        predictive_iterator() = default;

        //! Get the next result.
        //! If not found, false will be returned.
        inline bool next();

        //! Get the ID.
        inline std::uint64_t id() const;

        //! Get the keyword.
        inline std::string decoded() const;

        //! Get the reference to the keyword.
        //! Note that the referenced data will be changed in the next step.
        inline std::string_view decoded_view() const;
    };

    //! Make the predictive searcher for the keyword.
    inline predictive_iterator make_predictive_iterator(std::string_view key) const;

    //! Perform predictive search for the keyword.
    //! fn receives the ID and the keyword of each result.
    inline void predictive_search(std::string_view key,
                                  const std::function<void(std::uint64_t, std::string_view)>& fn) const;

    //! An iterator class for enumeration.
    using enumerative_iterator = predictive_iterator;

    //! Make the enumerator.
    inline enumerative_iterator make_enumerative_iterator() const;

    //! Enumerate all the keywords and their IDs stored in the trie.
    inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;

    //! Apply the visitor to the trie.
    //! NOTE(review): presumably the entry point used by the I/O handlers (mmap/load/save/memory_in_bytes) -- confirm.
    template <class Visitor>
    void visit(Visitor& visitor);
};
```
### I/O handlers
## Benchmark
## Licensing ## Licensing

View file

@ -13,6 +13,7 @@ namespace xcdat {
using trie_7_type = trie<bc_vector_7>; using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>; using trie_8_type = trie<bc_vector_8>;
//! Map the memory to the trie index.
template <class Trie> template <class Trie>
[[maybe_unused]] Trie mmap(const char* address) { [[maybe_unused]] Trie mmap(const char* address) {
mmap_visitor visitor(address); mmap_visitor visitor(address);
@ -26,6 +27,7 @@ template <class Trie>
return idx; return idx;
} }
//! Load the trie index from the file.
template <class Trie> template <class Trie>
[[maybe_unused]] Trie load(std::string_view filepath) { [[maybe_unused]] Trie load(std::string_view filepath) {
load_visitor visitor(filepath); load_visitor visitor(filepath);
@ -39,6 +41,7 @@ template <class Trie>
return idx; return idx;
} }
//! Save the trie index into the file, and return the file size in bytes.
template <class Trie> template <class Trie>
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) { [[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
save_visitor visitor(filepath); save_visitor visitor(filepath);
@ -47,6 +50,7 @@ template <class Trie>
return visitor.bytes(); return visitor.bytes();
} }
//! Get the index size in bytes.
template <class Trie> template <class Trie>
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) { [[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
size_visitor visitor; size_visitor visitor;

View file

@ -8,17 +8,7 @@
namespace xcdat { namespace xcdat {
/** //! A compressed string dictionary based on the XOR-compressed double-array trie.
* A compressed string dictionary based on the XOR-compressed double-array trie.
*
* @par References
* - Shunsuke Kanda, Kazuhiro Morita and Masao Fuketa. Compressed Double-array Tries for String Dictionaries
* Supporting Fast Lookup. Knowledge and Information Systems (KAIS), 51(3): 1023–1042, 2017.
*
* @par Links
* - https://kampersanda.github.io/pdf/KAIS2017.pdf
*
*/
template <class BcVector> template <class BcVector>
class trie { class trie {
public: public:
@ -53,8 +43,14 @@ class trie {
//! Move assignment operator //! Move assignment operator
trie& operator=(trie&&) noexcept = default; trie& operator=(trie&&) noexcept = default;
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template <class Strings> template <class Strings>
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {} explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
}
//! Check the binary mode. //! Check the binary mode.
inline bool bin_mode() const { inline bool bin_mode() const {
@ -76,11 +72,7 @@ class trie {
return m_table.max_length(); return m_table.max_length();
} }
/** //! Lookup the ID of the keyword.
* Search the given keyword in the trie.
* @param[in] key The query keyword.
* @return The associated ID if found.
*/
inline std::optional<std::uint64_t> lookup(std::string_view key) const { inline std::optional<std::uint64_t> lookup(std::string_view key) const {
std::uint64_t kpos = 0, npos = 0; std::uint64_t kpos = 0, npos = 0;
while (!m_bcvec.is_leaf(npos)) { while (!m_bcvec.is_leaf(npos)) {
@ -104,11 +96,7 @@ class trie {
return npos_to_id(npos); return npos_to_id(npos);
} }
/** //! Decode the keyword associated with the ID.
* Decode the keyword associated with the given ID.
* @param[in] id The keyword ID.
* @return The keyword associated with the ID.
*/
inline std::string decode(std::uint64_t id) const { inline std::string decode(std::uint64_t id) const {
if (num_keys() <= id) { if (num_keys() <= id) {
return {}; return {};
@ -133,9 +121,7 @@ class trie {
return decoded; return decoded;
} }
/** //! An iterator class for common prefix search.
* An iterator class for common prefix search.
*/
class prefix_iterator { class prefix_iterator {
private: private:
const trie_type* m_obj = nullptr; const trie_type* m_obj = nullptr;
@ -149,16 +135,24 @@ class trie {
public: public:
prefix_iterator() = default; prefix_iterator() = default;
//! Get the next result.
//! If not found, false will be returned.
inline bool next() { inline bool next() {
return m_obj != nullptr && m_obj->next_prefix(this); return m_obj != nullptr && m_obj->next_prefix(this);
} }
//! Get the ID.
inline std::uint64_t id() const { inline std::uint64_t id() const {
return m_id; return m_id;
} }
//! Get the keyword.
inline std::string decoded() const { inline std::string decoded() const {
return std::string(m_key.data(), m_kpos); return std::string(m_key.data(), m_kpos);
} }
//! Get the reference to the keyword.
//! Note that the referenced data will be changed in the next step.
inline std::string_view decoded_view() const { inline std::string_view decoded_view() const {
return std::string_view(m_key.data(), m_kpos); return std::string_view(m_key.data(), m_kpos);
} }
@ -169,11 +163,12 @@ class trie {
friend class trie; friend class trie;
}; };
//! Make the iterator for the prefix search //! Make the common prefix searcher for the given keyword.
inline prefix_iterator make_prefix_iterator(std::string_view key) const { inline prefix_iterator make_prefix_iterator(std::string_view key) const {
return prefix_iterator(this, key); return prefix_iterator(this, key);
} }
//! Perform common prefix search for the keyword.
inline void prefix_search(std::string_view key, inline void prefix_search(std::string_view key,
const std::function<void(std::uint64_t, std::string_view)>& fn) const { const std::function<void(std::uint64_t, std::string_view)>& fn) const {
auto itr = make_prefix_iterator(key); auto itr = make_prefix_iterator(key);
@ -182,9 +177,7 @@ class trie {
} }
} }
/** //! An iterator class for predictive search.
* An iterator class for predictive search.
*/
class predictive_iterator { class predictive_iterator {
public: public:
struct cursor_type { struct cursor_type {
@ -205,16 +198,24 @@ class trie {
public: public:
predictive_iterator() = default; predictive_iterator() = default;
//! Get the next result.
//! If not found, false will be returned.
inline bool next() { inline bool next() {
return m_obj != nullptr && m_obj->next_predictive(this); return m_obj != nullptr && m_obj->next_predictive(this);
} }
//! Get the ID.
inline std::uint64_t id() const { inline std::uint64_t id() const {
return m_id; return m_id;
} }
//! Get the keyword.
inline std::string decoded() const { inline std::string decoded() const {
return m_decoded; return m_decoded;
} }
//! Get the reference to the keyword.
//! Note that the referenced data will be changed in the next step.
inline std::string_view decoded_view() const { inline std::string_view decoded_view() const {
return m_decoded; return m_decoded;
} }
@ -225,10 +226,12 @@ class trie {
friend class trie; friend class trie;
}; };
//! Make the predictive searcher for the keyword.
inline predictive_iterator make_predictive_iterator(std::string_view key) const { inline predictive_iterator make_predictive_iterator(std::string_view key) const {
return predictive_iterator(this, key); return predictive_iterator(this, key);
} }
//! Perform predictive search for the keyword.
inline void predictive_search(std::string_view key, inline void predictive_search(std::string_view key,
const std::function<void(std::uint64_t, std::string_view)>& fn) const { const std::function<void(std::uint64_t, std::string_view)>& fn) const {
auto itr = make_predictive_iterator(key); auto itr = make_predictive_iterator(key);
@ -237,12 +240,15 @@ class trie {
} }
} }
//! An iterator class for enumeration.
using enumerative_iterator = predictive_iterator; using enumerative_iterator = predictive_iterator;
//! Make the enumerator.
inline enumerative_iterator make_enumerative_iterator() const { inline enumerative_iterator make_enumerative_iterator() const {
return enumerative_iterator(this, ""); return enumerative_iterator(this, "");
} }
//! Enumerate all the keywords and their IDs stored in the trie.
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const { inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
auto itr = make_enumerative_iterator(); auto itr = make_enumerative_iterator();
while (itr.next()) { while (itr.next()) {

View file

@ -26,13 +26,11 @@ int predictive_search(const cmd_line_parser::parser& p) {
std::vector<result_type> results; std::vector<result_type> results;
results.reserve(1ULL << 10); results.reserve(1ULL << 10);
for (std::string str; std::getline(std::cin, str);) { for (std::string key; std::getline(std::cin, key);) {
results.clear(); results.clear();
trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) {
auto itr = trie.make_predictive_iterator(str); results.push_back({id, std::string(str)});
while (itr.next()) { });
results.push_back({itr.id(), itr.decoded()});
}
tfm::printfln("%d found", results.size()); tfm::printfln("%d found", results.size());
for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) { for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) {

View file

@ -25,13 +25,9 @@ int prefix_search(const cmd_line_parser::parser& p) {
std::vector<result_type> results; std::vector<result_type> results;
results.reserve(trie.max_length()); results.reserve(trie.max_length());
for (std::string str; std::getline(std::cin, str);) { for (std::string key; std::getline(std::cin, key);) {
results.clear(); results.clear();
trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) { results.push_back({id, str}); });
auto itr = trie.make_prefix_iterator(str);
while (itr.next()) {
results.push_back({itr.id(), itr.decoded_view()});
}
tfm::printfln("%d found", results.size()); tfm::printfln("%d found", results.size());
for (const auto& r : results) { for (const auto& r : results) {