add
This commit is contained in:
parent
0362cc0453
commit
8f59dfbd41
48
README.md
48
README.md
|
@ -290,14 +290,10 @@ Enumerate() = {
|
||||||
### Dictionary class
|
### Dictionary class
|
||||||
|
|
||||||
```c++
|
```c++
|
||||||
|
//! A compressed string dictionary based on an improved double-array trie.
|
||||||
|
//! 'BcVector' is the data type of Base and Check vectors.
|
||||||
template <class BcVector>
|
template <class BcVector>
|
||||||
class trie {
|
class trie {
|
||||||
public:
|
|
||||||
using trie_type = trie<BcVector>;
|
|
||||||
using bc_vector_type = BcVector;
|
|
||||||
|
|
||||||
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
//! Default constructor
|
//! Default constructor
|
||||||
trie() = default;
|
trie() = default;
|
||||||
|
@ -318,9 +314,18 @@ class trie {
|
||||||
trie& operator=(trie&&) noexcept = default;
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||||
|
//!
|
||||||
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||||
|
//!
|
||||||
|
//! The types 'Strings' and 'Strings::value_type' should be random-access iterable containers such as std::vector.
|
||||||
|
//! Precisely, they should support the following operations:
|
||||||
|
//! - size() returns the container size.
|
||||||
|
//! - operator[](i) accesses the i-th element.
|
||||||
|
//! - begin() returns the iterator to the beginning.
|
||||||
|
//! - end() returns the iterator to the end.
|
||||||
|
//! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'.
|
||||||
template <class Strings>
|
template <class Strings>
|
||||||
trie(const Strings& keys, bool bin_mode = false);
|
trie(const Strings& keys, bool bin_mode = false);
|
||||||
|
|
||||||
|
@ -354,10 +359,13 @@ class trie {
|
||||||
//! Decode the keyword associated with the ID.
|
//! Decode the keyword associated with the ID.
|
||||||
std::string decode(std::uint64_t id) const;
|
std::string decode(std::uint64_t id) const;
|
||||||
|
|
||||||
//! Decode the keyword associated with the ID.
|
//! Decode the keyword associated with the ID and store it in 'decoded'.
|
||||||
|
//! It can avoid reallocation of memory to store the result.
|
||||||
void decode(std::uint64_t id, std::string& decoded) const;
|
void decode(std::uint64_t id, std::string& decoded) const;
|
||||||
|
|
||||||
//! An iterator class for common prefix search.
|
//! An iterator class for common prefix search.
|
||||||
|
//! It enumerates all the keywords contained as prefixes of a given string.
|
||||||
|
//! It should be instantiated via the function 'make_prefix_iterator'.
|
||||||
class prefix_iterator {
|
class prefix_iterator {
|
||||||
public:
|
public:
|
||||||
prefix_iterator() = default;
|
prefix_iterator() = default;
|
||||||
|
@ -384,6 +392,8 @@ class trie {
|
||||||
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||||
|
|
||||||
//! An iterator class for predictive search.
|
//! An iterator class for predictive search.
|
||||||
|
//! It enumerates all the keywords starting with a given string.
|
||||||
|
//! It should be instantiated via the function 'make_predictive_iterator'.
|
||||||
class predictive_iterator {
|
class predictive_iterator {
|
||||||
public:
|
public:
|
||||||
predictive_iterator() = default;
|
predictive_iterator() = default;
|
||||||
|
@ -410,6 +420,8 @@ class trie {
|
||||||
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||||
|
|
||||||
//! An iterator class for enumeration.
|
//! An iterator class for enumeration.
|
||||||
|
//! It enumerates all the keywords stored in the trie.
|
||||||
|
//! It should be instantiated via the function 'make_enumerative_iterator'.
|
||||||
using enumerative_iterator = predictive_iterator;
|
using enumerative_iterator = predictive_iterator;
|
||||||
|
|
||||||
//! An iterator class for enumeration.
|
//! An iterator class for enumeration.
|
||||||
|
@ -418,7 +430,7 @@ class trie {
|
||||||
//! Enumerate all the keywords and their IDs stored in the trie.
|
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||||
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||||
|
|
||||||
//! Visit the members.
|
//! Visit the members (commonly used for I/O).
|
||||||
template <class Visitor>
|
template <class Visitor>
|
||||||
void visit(Visitor& visitor);
|
void visit(Visitor& visitor);
|
||||||
};
|
};
|
||||||
|
@ -429,8 +441,28 @@ class trie {
|
||||||
`xcdat.hpp` provides some functions for handling I/O operations.
|
`xcdat.hpp` provides some functions for handling I/O operations.
|
||||||
|
|
||||||
```c++
|
```c++
|
||||||
|
//! Set the continuous memory block to a new trie instance.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
Trie mmap(const char* address);
|
Trie mmap(const char* address);
|
||||||
|
|
||||||
|
//! Load the trie index from the file.
|
||||||
|
template <class Trie>
|
||||||
|
Trie load(std::string_view filepath);
|
||||||
|
|
||||||
|
//! Save the trie index to the file and return the file size in bytes.
|
||||||
|
template <class Trie>
|
||||||
|
std::uint64_t save(const Trie& idx, std::string_view filepath);
|
||||||
|
|
||||||
|
//! Get the index size in bytes.
|
||||||
|
template <class Trie>
|
||||||
|
std::uint64_t memory_in_bytes(const Trie& idx);
|
||||||
|
|
||||||
|
//! Get the flag indicating the trie type, embedded by the function 'save'.
|
||||||
|
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||||
|
std::uint32_t get_flag(std::string_view filepath);
|
||||||
|
|
||||||
|
//! Load the keywords from the file.
|
||||||
|
std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n');
|
||||||
```
|
```
|
||||||
|
|
||||||
## Performance
|
## Performance
|
||||||
|
|
|
@ -13,7 +13,7 @@ namespace xcdat {
|
||||||
using trie_7_type = trie<bc_vector_7>;
|
using trie_7_type = trie<bc_vector_7>;
|
||||||
using trie_8_type = trie<bc_vector_8>;
|
using trie_8_type = trie<bc_vector_8>;
|
||||||
|
|
||||||
//! Map the memory to the trie index.
|
//! Set the continuous memory block to a new trie instance.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] Trie mmap(const char* address) {
|
[[maybe_unused]] Trie mmap(const char* address) {
|
||||||
mmap_visitor visitor(address);
|
mmap_visitor visitor(address);
|
||||||
|
@ -41,7 +41,7 @@ template <class Trie>
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Save the trie index into the file, and returns the file size in bytes.
|
//! Save the trie index to the file and return the file size in bytes.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
||||||
save_visitor visitor(filepath);
|
save_visitor visitor(filepath);
|
||||||
|
@ -59,6 +59,8 @@ template <class Trie>
|
||||||
return visitor.bytes();
|
return visitor.bytes();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Get the flag indicating the trie type, embedded by the function 'save'.
|
||||||
|
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||||
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
|
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
|
||||||
std::ifstream ifs(filepath);
|
std::ifstream ifs(filepath);
|
||||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
@ -68,12 +70,13 @@ template <class Trie>
|
||||||
return flag;
|
return flag;
|
||||||
}
|
}
|
||||||
|
|
||||||
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath) {
|
//! Load the keywords from the file.
|
||||||
|
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n') {
|
||||||
std::ifstream ifs(filepath);
|
std::ifstream ifs(filepath);
|
||||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
|
||||||
std::vector<std::string> strs;
|
std::vector<std::string> strs;
|
||||||
for (std::string str; std::getline(ifs, str);) {
|
for (std::string str; std::getline(ifs, str, delim);) {
|
||||||
strs.push_back(str);
|
strs.push_back(str);
|
||||||
}
|
}
|
||||||
return strs;
|
return strs;
|
||||||
|
|
|
@ -8,7 +8,8 @@
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
//! A compressed string dictionary based on the XOR-compressed double-array trie.
|
//! A compressed string dictionary based on an improved double-array trie.
|
||||||
|
//! 'BcVector' is the data type of Base and Check vectors.
|
||||||
template <class BcVector>
|
template <class BcVector>
|
||||||
class trie {
|
class trie {
|
||||||
public:
|
public:
|
||||||
|
@ -44,9 +45,18 @@ class trie {
|
||||||
trie& operator=(trie&&) noexcept = default;
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||||
|
//!
|
||||||
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||||
|
//!
|
||||||
|
//! The types 'Strings' and 'Strings::value_type' should be random-access iterable containers such as std::vector.
|
||||||
|
//! Precisely, they should support the following operations:
|
||||||
|
//! - size() returns the container size.
|
||||||
|
//! - operator[](i) accesses the i-th element.
|
||||||
|
//! - begin() returns the iterator to the beginning.
|
||||||
|
//! - end() returns the iterator to the end.
|
||||||
|
//! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'.
|
||||||
template <class Strings>
|
template <class Strings>
|
||||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||||
|
@ -124,9 +134,11 @@ class trie {
|
||||||
return decoded;
|
return decoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Decode the keyword associated with the ID.
|
//! Decode the keyword associated with the ID and store it in 'decoded'.
|
||||||
|
//! It can avoid reallocation of memory to store the result.
|
||||||
inline void decode(std::uint64_t id, std::string& decoded) const {
|
inline void decode(std::uint64_t id, std::string& decoded) const {
|
||||||
decoded.clear();
|
decoded.clear();
|
||||||
|
|
||||||
if (num_keys() <= id) {
|
if (num_keys() <= id) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -147,6 +159,8 @@ class trie {
|
||||||
}
|
}
|
||||||
|
|
||||||
//! An iterator class for common prefix search.
|
//! An iterator class for common prefix search.
|
||||||
|
//! It enumerates all the keywords contained as prefixes of a given string.
|
||||||
|
//! It should be instantiated via the function 'make_prefix_iterator'.
|
||||||
class prefix_iterator {
|
class prefix_iterator {
|
||||||
private:
|
private:
|
||||||
const trie_type* m_obj = nullptr;
|
const trie_type* m_obj = nullptr;
|
||||||
|
@ -203,6 +217,8 @@ class trie {
|
||||||
}
|
}
|
||||||
|
|
||||||
//! An iterator class for predictive search.
|
//! An iterator class for predictive search.
|
||||||
|
//! It enumerates all the keywords starting with a given string.
|
||||||
|
//! It should be instantiated via the function 'make_predictive_iterator'.
|
||||||
class predictive_iterator {
|
class predictive_iterator {
|
||||||
public:
|
public:
|
||||||
struct cursor_type {
|
struct cursor_type {
|
||||||
|
@ -266,6 +282,8 @@ class trie {
|
||||||
}
|
}
|
||||||
|
|
||||||
//! An iterator class for enumeration.
|
//! An iterator class for enumeration.
|
||||||
|
//! It enumerates all the keywords stored in the trie.
|
||||||
|
//! It should be instantiated via the function 'make_enumerative_iterator'.
|
||||||
using enumerative_iterator = predictive_iterator;
|
using enumerative_iterator = predictive_iterator;
|
||||||
|
|
||||||
//! Make the enumerator.
|
//! Make the enumerator.
|
||||||
|
@ -281,7 +299,7 @@ class trie {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Visit the members.
|
//! Visit the members (commonly used for I/O).
|
||||||
template <class Visitor>
|
template <class Visitor>
|
||||||
void visit(Visitor& visitor) {
|
void visit(Visitor& visitor) {
|
||||||
visitor.visit(m_num_keys);
|
visitor.visit(m_num_keys);
|
||||||
|
|
Loading…
Reference in a new issue