add comments etc
This commit is contained in:
parent
693b22eb0b
commit
45682b7a10
176
README.md
176
README.md
|
@ -10,6 +10,14 @@ The double array is known as the fastest trie representation and has been used i
|
|||
|
||||
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, thanks to the double-array structure, its lookup speed is relatively fast for a compressed data structure.
|
||||
|
||||
### Table of contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Build instructions](#build-instructions)
|
||||
- [Command line tools](#command-line-tools)
|
||||
- [Sample usage](#sample-usage)
|
||||
- [Interface](#interface)
|
||||
|
||||
## Features
|
||||
|
||||
- **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure, and compresses it so that dictionaries take less space than with other double-array libraries.
|
||||
|
@ -21,7 +29,7 @@ Xcdat can implement trie dictionaries in smaller space compared to the other dou
|
|||
- **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
|
||||
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
|
||||
|
||||
## Build Instructions
|
||||
## Build instructions
|
||||
|
||||
You can download and compile Xcdat with the following commands.
|
||||
|
||||
|
@ -37,9 +45,69 @@ $ make install
|
|||
|
||||
## Command line tools
|
||||
|
||||
### Build
|
||||
|
||||
```sh
|
||||
$ xcdat_build enwiki-latest-all-titles-in-ns0 idx.bin -u 1
|
||||
time_in_sec: 13.449
|
||||
memory_in_bytes: 1.70618e+08
|
||||
memory_in_MiB: 162.714
|
||||
number_of_keys: 15955763
|
||||
alphabet_size: 198
|
||||
max_length: 253
|
||||
```
|
||||
|
||||
## Sample
|
||||
### Lookup
|
||||
|
||||
```sh
|
||||
$ xcdat_lookup idx.bin
|
||||
Algorithm
|
||||
1255938 Algorithm
|
||||
```
|
||||
|
||||
### Decode
|
||||
|
||||
```sh
|
||||
$ xcdat_decode idx.bin
|
||||
1255938
|
||||
1255938 Algorithm
|
||||
```
|
||||
|
||||
### Common prefix search
|
||||
|
||||
```sh
|
||||
$ xcdat_prefix_search idx.bin
|
||||
Algorithmic
|
||||
6 found
|
||||
57 A
|
||||
798460 Al
|
||||
1138004 Alg
|
||||
1253024 Algo
|
||||
1255938 Algorithm
|
||||
1255931 Algorithmic
|
||||
```
|
||||
|
||||
### Predictive search
|
||||
|
||||
```sh
|
||||
$ xcdat_predictive_search idx.bin -n 3
|
||||
Algorithm
|
||||
263 found
|
||||
1255938 Algorithm
|
||||
1255944 Algorithm's_optimality
|
||||
1255972 Algorithm_(C++)
|
||||
```
|
||||
|
||||
### Enumerate
|
||||
|
||||
```sh
|
||||
$ xcdat_enumerate idx.bin | head -3
|
||||
0 !
|
||||
107 !!
|
||||
138 !!!
|
||||
```
|
||||
|
||||
## Sample usage
|
||||
|
||||
```c++
|
||||
#include <iostream>
|
||||
|
@ -168,7 +236,111 @@ Enumerate() = {
|
|||
}
|
||||
```
|
||||
|
||||
## Interface
|
||||
|
||||
### Dictionary class
|
||||
|
||||
```c++
|
||||
template <class BcVector>
|
||||
class trie {
|
||||
public:
|
||||
using trie_type = trie<BcVector>;
|
||||
using bc_vector_type = BcVector;
|
||||
|
||||
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||
|
||||
public:
|
||||
trie() = default;
|
||||
virtual ~trie() = default;
|
||||
trie(const trie&) = delete;
|
||||
trie& operator=(const trie&) = delete;
|
||||
trie(trie&&) noexcept = default;
|
||||
trie& operator=(trie&&) noexcept = default;
|
||||
|
||||
template <class Strings>
|
||||
explicit trie(const Strings& keys, bool bin_mode = false);
|
||||
|
||||
inline bool bin_mode() const;
|
||||
|
||||
//! Get the number of stored keywords.
|
||||
inline std::uint64_t num_keys() const;
|
||||
|
||||
//! Get the alphabet size.
|
||||
inline std::uint64_t alphabet_size() const;
|
||||
|
||||
//! Get the maximum length of keywords.
|
||||
inline std::uint64_t max_length() const;
|
||||
|
||||
/**
|
||||
* Search the given keyword in the trie.
|
||||
* @param[in] key The query keyword.
|
||||
* @return The associated ID if found.
|
||||
*/
|
||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const;
|
||||
|
||||
/**
|
||||
* Decode the keyword associated with the given ID.
|
||||
* @param[in] id The keyword ID.
|
||||
* @return The keyword associated with the ID.
|
||||
*/
|
||||
inline std::string decode(std::uint64_t id) const;
|
||||
|
||||
/**
|
||||
* An iterator class for common prefix search.
|
||||
*/
|
||||
class prefix_iterator {
|
||||
public:
|
||||
prefix_iterator() = default;
|
||||
|
||||
inline bool next();
|
||||
inline std::uint64_t id() const;
|
||||
inline std::string decoded() const;
|
||||
inline std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
//! Make the iterator for the prefix search
|
||||
inline prefix_iterator make_prefix_iterator(std::string_view key) const;
|
||||
|
||||
inline void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
/**
|
||||
* An iterator class for predictive search.
|
||||
*/
|
||||
class predictive_iterator {
|
||||
public:
|
||||
predictive_iterator() = default;
|
||||
|
||||
inline bool next();
|
||||
inline std::uint64_t id() const;
|
||||
inline std::string decoded() const;
|
||||
inline std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||
return predictive_iterator(this, key);
|
||||
}
|
||||
|
||||
inline void predictive_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_predictive_iterator(key);
|
||||
while (itr.next()) {
|
||||
fn(itr.id(), itr.decoded_view());
|
||||
}
|
||||
}
|
||||
|
||||
using enumerative_iterator = predictive_iterator;
|
||||
|
||||
inline enumerative_iterator make_enumerative_iterator() const;
|
||||
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor);
|
||||
};
|
||||
```
|
||||
|
||||
### I/O handlers
|
||||
|
||||
## Benchmark
|
||||
|
||||
## Licensing
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ namespace xcdat {
|
|||
using trie_7_type = trie<bc_vector_7>;
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
|
||||
//! Map the memory to the trie index.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] Trie mmap(const char* address) {
|
||||
mmap_visitor visitor(address);
|
||||
|
@ -26,6 +27,7 @@ template <class Trie>
|
|||
return idx;
|
||||
}
|
||||
|
||||
//! Load the trie index from the file.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] Trie load(std::string_view filepath) {
|
||||
load_visitor visitor(filepath);
|
||||
|
@ -39,6 +41,7 @@ template <class Trie>
|
|||
return idx;
|
||||
}
|
||||
|
||||
//! Save the trie index into the file, and return the file size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
||||
save_visitor visitor(filepath);
|
||||
|
@ -47,6 +50,7 @@ template <class Trie>
|
|||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the index size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||
size_visitor visitor;
|
||||
|
|
|
@ -8,17 +8,7 @@
|
|||
|
||||
namespace xcdat {
|
||||
|
||||
/**
|
||||
* A compressed string dictionary based on the XOR-compressed double-array trie.
|
||||
*
|
||||
* @par References
|
||||
* - Shunsuke Kanda, Kazuhiro Morita and Masao Fuketa. Compressed Double-array Tries for String Dictionaries
|
||||
* Supporting Fast Lookup. Knowledge and Information Systems (KAIS), 51(3): 1023–1042, 2017.
|
||||
*
|
||||
* @par Links
|
||||
* - https://kampersanda.github.io/pdf/KAIS2017.pdf
|
||||
*
|
||||
*/
|
||||
//! A compressed string dictionary based on the XOR-compressed double-array trie.
|
||||
template <class BcVector>
|
||||
class trie {
|
||||
public:
|
||||
|
@ -53,8 +43,14 @@ class trie {
|
|||
//! Move assignment operator
|
||||
trie& operator=(trie&&) noexcept = default;
|
||||
|
||||
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||
template <class Strings>
|
||||
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {}
|
||||
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||
}
|
||||
|
||||
//! Check the binary mode.
|
||||
inline bool bin_mode() const {
|
||||
|
@ -76,11 +72,7 @@ class trie {
|
|||
return m_table.max_length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Search the given keyword in the trie.
|
||||
* @param[in] key The query keyword.
|
||||
* @return The associated ID if found.
|
||||
*/
|
||||
//! Lookup the ID of the keyword.
|
||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
||||
std::uint64_t kpos = 0, npos = 0;
|
||||
while (!m_bcvec.is_leaf(npos)) {
|
||||
|
@ -104,11 +96,7 @@ class trie {
|
|||
return npos_to_id(npos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the keyword associated with the given ID.
|
||||
* @param[in] id The keyword ID.
|
||||
* @return The keyword associated with the ID.
|
||||
*/
|
||||
//! Decode the keyword associated with the ID.
|
||||
inline std::string decode(std::uint64_t id) const {
|
||||
if (num_keys() <= id) {
|
||||
return {};
|
||||
|
@ -133,9 +121,7 @@ class trie {
|
|||
return decoded;
|
||||
}
|
||||
|
||||
/**
|
||||
* An iterator class for common prefix search.
|
||||
*/
|
||||
//! An iterator class for common prefix search.
|
||||
class prefix_iterator {
|
||||
private:
|
||||
const trie_type* m_obj = nullptr;
|
||||
|
@ -149,16 +135,24 @@ class trie {
|
|||
public:
|
||||
prefix_iterator() = default;
|
||||
|
||||
//! Get the next result.
|
||||
//! If not found, false will be returned.
|
||||
inline bool next() {
|
||||
return m_obj != nullptr && m_obj->next_prefix(this);
|
||||
}
|
||||
|
||||
//! Get the ID.
|
||||
inline std::uint64_t id() const {
|
||||
return m_id;
|
||||
}
|
||||
|
||||
//! Get the keyword.
|
||||
inline std::string decoded() const {
|
||||
return std::string(m_key.data(), m_kpos);
|
||||
}
|
||||
|
||||
//! Get the reference to the keyword.
|
||||
//! Note that the referenced data will be changed in the next step.
|
||||
inline std::string_view decoded_view() const {
|
||||
return std::string_view(m_key.data(), m_kpos);
|
||||
}
|
||||
|
@ -169,11 +163,12 @@ class trie {
|
|||
friend class trie;
|
||||
};
|
||||
|
||||
//! Make the iterator for the prefix search
|
||||
//! Make the common prefix searcher for the given keyword.
|
||||
inline prefix_iterator make_prefix_iterator(std::string_view key) const {
|
||||
return prefix_iterator(this, key);
|
||||
}
|
||||
|
||||
//! Perform common prefix search for the keyword.
|
||||
inline void prefix_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_prefix_iterator(key);
|
||||
|
@ -182,9 +177,7 @@ class trie {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An iterator class for predictive search.
|
||||
*/
|
||||
//! An iterator class for predictive search.
|
||||
class predictive_iterator {
|
||||
public:
|
||||
struct cursor_type {
|
||||
|
@ -205,16 +198,24 @@ class trie {
|
|||
public:
|
||||
predictive_iterator() = default;
|
||||
|
||||
//! Get the next result.
|
||||
//! If not found, false will be returned.
|
||||
inline bool next() {
|
||||
return m_obj != nullptr && m_obj->next_predictive(this);
|
||||
}
|
||||
|
||||
//! Get the ID.
|
||||
inline std::uint64_t id() const {
|
||||
return m_id;
|
||||
}
|
||||
|
||||
//! Get the keyword.
|
||||
inline std::string decoded() const {
|
||||
return m_decoded;
|
||||
}
|
||||
|
||||
//! Get the reference to the keyword.
|
||||
//! Note that the referenced data will be changed in the next step.
|
||||
inline std::string_view decoded_view() const {
|
||||
return m_decoded;
|
||||
}
|
||||
|
@ -225,10 +226,12 @@ class trie {
|
|||
friend class trie;
|
||||
};
|
||||
|
||||
//! Make the predictive searcher for the keyword.
|
||||
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||
return predictive_iterator(this, key);
|
||||
}
|
||||
|
||||
//! Perform predictive search for the keyword.
|
||||
inline void predictive_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_predictive_iterator(key);
|
||||
|
@ -237,12 +240,15 @@ class trie {
|
|||
}
|
||||
}
|
||||
|
||||
//! An iterator class for enumeration.
|
||||
using enumerative_iterator = predictive_iterator;
|
||||
|
||||
//! Make the enumerator.
|
||||
inline enumerative_iterator make_enumerative_iterator() const {
|
||||
return enumerative_iterator(this, "");
|
||||
}
|
||||
|
||||
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_enumerative_iterator();
|
||||
while (itr.next()) {
|
||||
|
|
|
@ -26,13 +26,11 @@ int predictive_search(const cmd_line_parser::parser& p) {
|
|||
std::vector<result_type> results;
|
||||
results.reserve(1ULL << 10);
|
||||
|
||||
for (std::string str; std::getline(std::cin, str);) {
|
||||
for (std::string key; std::getline(std::cin, key);) {
|
||||
results.clear();
|
||||
|
||||
auto itr = trie.make_predictive_iterator(str);
|
||||
while (itr.next()) {
|
||||
results.push_back({itr.id(), itr.decoded()});
|
||||
}
|
||||
trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) {
|
||||
results.push_back({id, std::string(str)});
|
||||
});
|
||||
|
||||
tfm::printfln("%d found", results.size());
|
||||
for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) {
|
||||
|
|
|
@ -25,13 +25,9 @@ int prefix_search(const cmd_line_parser::parser& p) {
|
|||
std::vector<result_type> results;
|
||||
results.reserve(trie.max_length());
|
||||
|
||||
for (std::string str; std::getline(std::cin, str);) {
|
||||
for (std::string key; std::getline(std::cin, key);) {
|
||||
results.clear();
|
||||
|
||||
auto itr = trie.make_prefix_iterator(str);
|
||||
while (itr.next()) {
|
||||
results.push_back({itr.id(), itr.decoded_view()});
|
||||
}
|
||||
trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) { results.push_back({id, str}); });
|
||||
|
||||
tfm::printfln("%d found", results.size());
|
||||
for (const auto& r : results) {
|
||||
|
|
Loading…
Reference in a new issue