This commit is contained in:
Shunsuke Kanda 2021-07-02 14:37:03 +09:00
parent 0362cc0453
commit 8f59dfbd41
3 changed files with 68 additions and 15 deletions

View file

@ -290,14 +290,10 @@ Enumerate() = {
### Dictionary class ### Dictionary class
```c++ ```c++
//! A compressed string dictionary based on an improved double-array trie.
//! 'BcVector' is the data type of Base and Check vectors.
template <class BcVector> template <class BcVector>
class trie { class trie {
public:
using trie_type = trie<BcVector>;
using bc_vector_type = BcVector;
static constexpr auto l1_bits = bc_vector_type::l1_bits;
public: public:
//! Default constructor //! Default constructor
trie() = default; trie() = default;
@ -318,9 +314,18 @@ class trie {
trie& operator=(trie&&) noexcept = default; trie& operator=(trie&&) noexcept = default;
//! Build the trie from the input keywords, which are lexicographically sorted and unique. //! Build the trie from the input keywords, which are lexicographically sorted and unique.
//!
//! If bin_mode = false, the NULL character is used for the termination of a keyword. //! If bin_mode = false, the NULL character is used for the termination of a keyword.
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters. //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
//!
//! The types 'Strings' and 'Strings::value_type' should be random-access containers such as std::vector.
//! Precisely, they should support the following operations:
//! - size() returns the container size.
//! - operator[](i) accesses the i-th element.
//! - begin() returns the iterator to the beginning.
//! - end() returns the iterator to the end.
//! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'.
template <class Strings> template <class Strings>
trie(const Strings& keys, bool bin_mode = false); trie(const Strings& keys, bool bin_mode = false);
@ -354,10 +359,13 @@ class trie {
//! Decode the keyword associated with the ID. //! Decode the keyword associated with the ID.
std::string decode(std::uint64_t id) const; std::string decode(std::uint64_t id) const;
//! Decode the keyword associated with the ID. //! Decode the keyword associated with the ID and store it in 'decoded'.
//! It can avoid reallocation of memory to store the result.
void decode(std::uint64_t id, std::string& decoded) const; void decode(std::uint64_t id, std::string& decoded) const;
//! An iterator class for common prefix search. //! An iterator class for common prefix search.
//! It enumerates all the keywords contained as prefixes of a given string.
//! It should be instantiated via the function 'make_prefix_iterator'.
class prefix_iterator { class prefix_iterator {
public: public:
prefix_iterator() = default; prefix_iterator() = default;
@ -384,6 +392,8 @@ class trie {
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const; void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for predictive search. //! An iterator class for predictive search.
//! It enumerates all the keywords starting with a given string.
//! It should be instantiated via the function 'make_predictive_iterator'.
class predictive_iterator { class predictive_iterator {
public: public:
predictive_iterator() = default; predictive_iterator() = default;
@ -410,6 +420,8 @@ class trie {
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const; void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for enumeration. //! An iterator class for enumeration.
//! It enumerates all the keywords stored in the trie.
//! It should be instantiated via the function 'make_enumerative_iterator'.
using enumerative_iterator = predictive_iterator; using enumerative_iterator = predictive_iterator;
//! An iterator class for enumeration. //! An iterator class for enumeration.
@ -418,7 +430,7 @@ class trie {
//! Enumerate all the keywords and their IDs stored in the trie. //! Enumerate all the keywords and their IDs stored in the trie.
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const; void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! Visit the members. //! Visit the members (commonly used for I/O).
template <class Visitor> template <class Visitor>
void visit(Visitor& visitor); void visit(Visitor& visitor);
}; };
@ -429,8 +441,28 @@ class trie {
`xcdat.hpp` provides some functions for handling I/O operations. `xcdat.hpp` provides some functions for handling I/O operations.
```c++ ```c++
//! Set the continuous memory block to a new trie instance.
template <class Trie> template <class Trie>
Trie mmap(const char* address); Trie mmap(const char* address);
//! Load the trie index from the file.
template <class Trie>
Trie load(std::string_view filepath);
//! Save the trie index to the file and return the file size in bytes.
template <class Trie>
std::uint64_t save(const Trie& idx, std::string_view filepath);
//! Get the index size in bytes.
template <class Trie>
std::uint64_t memory_in_bytes(const Trie& idx);
//! Get the flag indicating the trie type, embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
std::uint32_t get_flag(std::string_view filepath);
//! Load the keywords from the file.
std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n');
``` ```
## Performance ## Performance

View file

@ -13,7 +13,7 @@ namespace xcdat {
using trie_7_type = trie<bc_vector_7>; using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>; using trie_8_type = trie<bc_vector_8>;
//! Map the memory to the trie index. //! Set the continuous memory block to a new trie instance.
template <class Trie> template <class Trie>
[[maybe_unused]] Trie mmap(const char* address) { [[maybe_unused]] Trie mmap(const char* address) {
mmap_visitor visitor(address); mmap_visitor visitor(address);
@ -41,7 +41,7 @@ template <class Trie>
return idx; return idx;
} }
//! Save the trie index into the file, and returns the file size in bytes. //! Save the trie index to the file and return the file size in bytes.
template <class Trie> template <class Trie>
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) { [[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
save_visitor visitor(filepath); save_visitor visitor(filepath);
@ -59,6 +59,8 @@ template <class Trie>
return visitor.bytes(); return visitor.bytes();
} }
//! Get the flag indicating the trie type, embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) { [[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
std::ifstream ifs(filepath); std::ifstream ifs(filepath);
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
@ -68,12 +70,13 @@ template <class Trie>
return flag; return flag;
} }
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath) { //! Load the keywords from the file.
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n') {
std::ifstream ifs(filepath); std::ifstream ifs(filepath);
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
std::vector<std::string> strs; std::vector<std::string> strs;
for (std::string str; std::getline(ifs, str);) { for (std::string str; std::getline(ifs, str, delim);) {
strs.push_back(str); strs.push_back(str);
} }
return strs; return strs;

View file

@ -8,7 +8,8 @@
namespace xcdat { namespace xcdat {
//! A compressed string dictionary based on the XOR-compressed double-array trie. //! A compressed string dictionary based on an improved double-array trie.
//! 'BcVector' is the data type of Base and Check vectors.
template <class BcVector> template <class BcVector>
class trie { class trie {
public: public:
@ -44,9 +45,18 @@ class trie {
trie& operator=(trie&&) noexcept = default; trie& operator=(trie&&) noexcept = default;
//! Build the trie from the input keywords, which are lexicographically sorted and unique. //! Build the trie from the input keywords, which are lexicographically sorted and unique.
//!
//! If bin_mode = false, the NULL character is used for the termination of a keyword. //! If bin_mode = false, the NULL character is used for the termination of a keyword.
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters. //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
//!
//! The types 'Strings' and 'Strings::value_type' should be random-access containers such as std::vector.
//! Precisely, they should support the following operations:
//! - size() returns the container size.
//! - operator[](i) accesses the i-th element.
//! - begin() returns the iterator to the beginning.
//! - end() returns the iterator to the end.
//! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'.
template <class Strings> template <class Strings>
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
@ -124,9 +134,11 @@ class trie {
return decoded; return decoded;
} }
//! Decode the keyword associated with the ID. //! Decode the keyword associated with the ID and store it in 'decoded'.
//! It can avoid reallocation of memory to store the result.
inline void decode(std::uint64_t id, std::string& decoded) const { inline void decode(std::uint64_t id, std::string& decoded) const {
decoded.clear(); decoded.clear();
if (num_keys() <= id) { if (num_keys() <= id) {
return; return;
} }
@ -147,6 +159,8 @@ class trie {
} }
//! An iterator class for common prefix search. //! An iterator class for common prefix search.
//! It enumerates all the keywords contained as prefixes of a given string.
//! It should be instantiated via the function 'make_prefix_iterator'.
class prefix_iterator { class prefix_iterator {
private: private:
const trie_type* m_obj = nullptr; const trie_type* m_obj = nullptr;
@ -203,6 +217,8 @@ class trie {
} }
//! An iterator class for predictive search. //! An iterator class for predictive search.
//! It enumerates all the keywords starting with a given string.
//! It should be instantiated via the function 'make_predictive_iterator'.
class predictive_iterator { class predictive_iterator {
public: public:
struct cursor_type { struct cursor_type {
@ -266,6 +282,8 @@ class trie {
} }
//! An iterator class for enumeration. //! An iterator class for enumeration.
//! It enumerates all the keywords stored in the trie.
//! It should be instantiated via the function 'make_enumerative_iterator'.
using enumerative_iterator = predictive_iterator; using enumerative_iterator = predictive_iterator;
//! Make the enumerator. //! Make the enumerator.
@ -281,7 +299,7 @@ class trie {
} }
} }
//! Visit the members. //! Visit the members (commonly used for I/O).
template <class Visitor> template <class Visitor>
void visit(Visitor& visitor) { void visit(Visitor& visitor) {
visitor.visit(m_num_keys); visitor.visit(m_num_keys);