From 8f59dfbd41267dab6a8a38b9511d6ddafd379cc4 Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Fri, 2 Jul 2021 14:37:03 +0900 Subject: [PATCH] add --- README.md | 48 +++++++++++++++++++++++++++++++++++------- include/xcdat.hpp | 11 ++++++---- include/xcdat/trie.hpp | 24 ++++++++++++++++++--- 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2cb1db9..80b24a5 100644 --- a/README.md +++ b/README.md @@ -290,14 +290,10 @@ Enumerate() = { ### Dictionary class ```c++ +//! A compressed string dictionary based on an improved double-array trie. +//! 'BcVector' is the data type of Base and Check vectors. template class trie { - public: - using trie_type = trie; - using bc_vector_type = BcVector; - - static constexpr auto l1_bits = bc_vector_type::l1_bits; - public: //! Default constructor trie() = default; @@ -318,9 +314,18 @@ class trie { trie& operator=(trie&&) noexcept = default; //! Build the trie from the input keywords, which are lexicographically sorted and unique. + //! //! If bin_mode = false, the NULL character is used for the termination of a keyword. //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. + //! + //! The types 'Strings' and 'Strings::value_type' should be random-access iterable containers such as std::vector. + //! Precisely, they should support the following operations: + //! - size() returns the container size. + //! - operator[](i) accesses the i-th element. + //! - begin() returns the iterator to the beginning. + //! - end() returns the iterator to the end. + //! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'. template trie(const Strings& keys, bool bin_mode = false); @@ -354,10 +359,13 @@ class trie { //! Decode the keyword associated with the ID. std::string decode(std::uint64_t id) const; - //! 
Decode the keyword associated with the ID. + //! Decode the keyword associated with the ID and store it in 'decoded'. + //! It can avoid reallocation of memory to store the result. void decode(std::uint64_t id, std::string& decoded) const; //! An iterator class for common prefix search. + //! It enumerates all the keywords contained as prefixes of a given string. + //! It should be instantiated via the function 'make_prefix_iterator'. class prefix_iterator { public: prefix_iterator() = default; @@ -384,6 +392,8 @@ class trie { void prefix_search(std::string_view key, const std::function& fn) const; //! An iterator class for predictive search. + //! It enumerates all the keywords starting with a given string. + //! It should be instantiated via the function 'make_predictive_iterator'. class predictive_iterator { public: predictive_iterator() = default; @@ -410,6 +420,8 @@ class trie { void predictive_search(std::string_view key, const std::function& fn) const; //! An iterator class for enumeration. + //! It enumerates all the keywords stored in the trie. + //! It should be instantiated via the function 'make_enumerative_iterator'. using enumerative_iterator = predictive_iterator; //! An iterator class for enumeration. @@ -418,7 +430,7 @@ class trie { //! Enumerate all the keywords and their IDs stored in the trie. void enumerate(const std::function& fn) const; - //! Visit the members. + //! Visit the members (commonly used for I/O). template void visit(Visitor& visitor); }; @@ -429,8 +441,28 @@ class trie { `xcdat.hpp` provides some functions for handling I/O operations. ```c++ +//! Set the contiguous memory block to a new trie instance. template Trie mmap(const char* address); + +//! Load the trie index from the file. +template +Trie load(std::string_view filepath); + +//! Save the trie index to the file and return the file size in bytes. +template +std::uint64_t save(const Trie& idx, std::string_view filepath); + +//! Get the index size in bytes. 
+template +std::uint64_t memory_in_bytes(const Trie& idx); + +//! Get the flag indicating the trie type, embedded by the function 'save'. +//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file. +std::uint32_t get_flag(std::string_view filepath); + +//! Load the keywords from the file, separated by the character 'delim'. +std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n'); ``` ## Performance diff --git a/include/xcdat.hpp b/include/xcdat.hpp index f5d201c..7208997 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -13,7 +13,7 @@ namespace xcdat { using trie_7_type = trie; using trie_8_type = trie; -//! Map the memory to the trie index. +//! Set the contiguous memory block to a new trie instance. template [[maybe_unused]] Trie mmap(const char* address) { mmap_visitor visitor(address); @@ -41,7 +41,7 @@ template return idx; } -//! Save the trie index into the file, and returns the file size in bytes. +//! Save the trie index to the file and return the file size in bytes. template [[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) { save_visitor visitor(filepath); @@ -59,6 +59,8 @@ template return visitor.bytes(); } +//! Get the flag indicating the trie type, embedded by the function 'save'. +//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file. [[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) { std::ifstream ifs(filepath); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); @@ -68,12 +70,13 @@ template return flag; } -[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath) { +//! Load the keywords from the file, separated by the character 'delim'. 
+[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n') { std::ifstream ifs(filepath); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); std::vector<std::string> strs; - for (std::string str; std::getline(ifs, str);) { + for (std::string str; std::getline(ifs, str, delim);) { strs.push_back(str); } return strs; diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index b0c2788..dd769c0 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -8,7 +8,8 @@ namespace xcdat { -//! A compressed string dictionary based on the XOR-compressed double-array trie. +//! A compressed string dictionary based on an improved double-array trie. +//! 'BcVector' is the data type of Base and Check vectors. template class trie { public: @@ -44,9 +45,18 @@ class trie { trie& operator=(trie&&) noexcept = default; //! Build the trie from the input keywords, which are lexicographically sorted and unique. + //! //! If bin_mode = false, the NULL character is used for the termination of a keyword. //! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters. //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true. + //! + //! The types 'Strings' and 'Strings::value_type' should be random-access iterable containers such as std::vector. + //! Precisely, they should support the following operations: + //! - size() returns the container size. + //! - operator[](i) accesses the i-th element. + //! - begin() returns the iterator to the beginning. + //! - end() returns the iterator to the end. + //! The type 'Strings::value_type::value_type' should be a one-byte integer type such as 'char'. template trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); @@ -124,9 +134,11 @@ class trie { return decoded; } - //! Decode the keyword associated with the ID. + //! 
Decode the keyword associated with the ID and store it in 'decoded'. + //! It can avoid reallocation of memory to store the result. inline void decode(std::uint64_t id, std::string& decoded) const { decoded.clear(); + if (num_keys() <= id) { return; } @@ -147,6 +159,8 @@ class trie { } //! An iterator class for common prefix search. + //! It enumerates all the keywords contained as prefixes of a given string. + //! It should be instantiated via the function 'make_prefix_iterator'. class prefix_iterator { private: const trie_type* m_obj = nullptr; @@ -203,6 +217,8 @@ class trie { } //! An iterator class for predictive search. + //! It enumerates all the keywords starting with a given string. + //! It should be instantiated via the function 'make_predictive_iterator'. class predictive_iterator { public: struct cursor_type { @@ -266,6 +282,8 @@ class trie { } //! An iterator class for enumeration. + //! It enumerates all the keywords stored in the trie. + //! It should be instantiated via the function 'make_enumerative_iterator'. using enumerative_iterator = predictive_iterator; //! Make the enumerator. @@ -281,7 +299,7 @@ class trie { } } - //! Visit the members. + //! Visit the members (commonly used for I/O). template void visit(Visitor& visitor) { visitor.visit(m_num_keys);