From 45682b7a10b1adfa37694b1074e9cf2ff067583a Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Tue, 29 Jun 2021 15:39:26 +0900 Subject: [PATCH] add comments etc --- README.md | 176 +++++++++++++++++++++++++++++- include/xcdat.hpp | 4 + include/xcdat/trie.hpp | 64 ++++++----- tools/xcdat_predictive_search.cpp | 10 +- tools/xcdat_prefix_search.cpp | 8 +- 5 files changed, 219 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index e1cc69e..5f051ad 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,14 @@ The double array is known as the fastest trie representation and has been used i Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage. +### Table of contents + +- [Features](#features) +- [Build instructions](#build-instructions) +- [Command line tools](#command-line-tools) +- [Sample usage](#sample-usage) +- [Interface](#interface) + ## Features - **Fast and memory-efficient:** Xcdat employs the double-array structure, known as the fastest trie data structure, and. @@ -21,7 +29,7 @@ Xcdat can implement trie dictionaries in smaller space compared to the other dou - **Fast search**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression. - **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on. -## Build Instructions +## Build instructions You can download and compile Xcdat as the following commands. 
@@ -37,9 +45,69 @@ $ make install ## Command line tools +### Build +```sh +$ xcdat_build enwiki-latest-all-titles-in-ns0 idx.bin -u 1 +time_in_sec: 13.449 +memory_in_bytes: 1.70618e+08 +memory_in_MiB: 162.714 +number_of_keys: 15955763 +alphabet_size: 198 +max_length: 253 +``` -## Sample +### Lookup + +```sh +$ xcdat_lookup idx.bin +Algorithm +1255938 Algorithm +``` + +### Decode + +```sh +$ xcdat_decode idx.bin +1255938 +1255938 Algorithm +``` + +### Common prefix search + +```sh +$ xcdat_prefix_search idx.bin +Algorithmic +6 found +57 A +798460 Al +1138004 Alg +1253024 Algo +1255938 Algorithm +1255931 Algorithmic +``` + +### Predictive search + +```sh +$ xcdat_predictive_search idx.bin -n 3 +Algorithm +263 found +1255938 Algorithm +1255944 Algorithm's_optimality +1255972 Algorithm_(C++) +``` + +### Enumerate + +```sh +$ xcdat_enumerate idx.bin | head -3 +0 ! +107 !! +138 !!! +``` + +## Sample usage ```c++ #include @@ -168,7 +236,111 @@ Enumerate() = { } ``` +## Interface +### Dictionary class + +```c++ +template <class BcVector> +class trie { + public: + using trie_type = trie; + using bc_vector_type = BcVector; + + static constexpr auto l1_bits = bc_vector_type::l1_bits; + + public: + trie() = default; + virtual ~trie() = default; + trie(const trie&) = delete; + trie& operator=(const trie&) = delete; + trie(trie&&) noexcept = default; + trie& operator=(trie&&) noexcept = default; + + template + explicit trie(const Strings& keys, bool bin_mode = false); + + inline bool bin_mode() const; + + //! Get the number of stored keywords. + inline std::uint64_t num_keys() const; + + //! Get the alphabet size. + inline std::uint64_t alphabet_size() const; + + //! Get the maximum length of keywords. + inline std::uint64_t max_length() const; + + /** + * Search the given keyword in the trie. + * @param[in] key The query keyword. + * @return The associated ID if found. + */ + inline std::optional lookup(std::string_view key) const; + + /** + * Decode the keyword associated with the given ID.
+ * @param[in] id The keyword ID. + * @return The keyword associated with the ID. + */ + inline std::string decode(std::uint64_t id) const; + + /** + * An iterator class for common prefix search. + */ + class prefix_iterator { + public: + prefix_iterator() = default; + + inline bool next(); + inline std::uint64_t id() const; + inline std::string decoded() const; + inline std::string_view decoded_view() const; + }; + + //! Make the iterator for the prefix search + inline prefix_iterator make_prefix_iterator(std::string_view key) const; + + inline void prefix_search(std::string_view key, const std::function& fn) const; + + /** + * An iterator class for predictive search. + */ + class predictive_iterator { + public: + predictive_iterator() = default; + + inline bool next(); + inline std::uint64_t id() const; + inline std::string decoded() const; + inline std::string_view decoded_view() const; + }; + + inline predictive_iterator make_predictive_iterator(std::string_view key) const { + return predictive_iterator(this, key); + } + + inline void predictive_search(std::string_view key, + const std::function& fn) const { + auto itr = make_predictive_iterator(key); + while (itr.next()) { + fn(itr.id(), itr.decoded_view()); + } + } + + using enumerative_iterator = predictive_iterator; + + inline enumerative_iterator make_enumerative_iterator() const; + inline void enumerate(const std::function& fn) const; + + template + void visit(Visitor& visitor); +}; +``` + +### I/O handlers + +## Benchmark ## Licensing diff --git a/include/xcdat.hpp b/include/xcdat.hpp index d59b57f..8ca1313 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -13,6 +13,7 @@ namespace xcdat { using trie_7_type = trie; using trie_8_type = trie; +//! Map the memory to the trie index. template [[maybe_unused]] Trie mmap(const char* address) { mmap_visitor visitor(address); @@ -26,6 +27,7 @@ template return idx; } +//! Load the trie index from the file. 
template [[maybe_unused]] Trie load(std::string_view filepath) { load_visitor visitor(filepath); @@ -39,6 +41,7 @@ template return idx; } +//! Save the trie index into the file, and return the file size in bytes. template [[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) { save_visitor visitor(filepath); @@ -47,6 +50,7 @@ template return visitor.bytes(); } +//! Get the index size in bytes. template [[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) { size_visitor visitor; diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index 1b4e6fe..7e27bfc 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -8,17 +8,7 @@ namespace xcdat { -/** - * A compressed string dictionary based on the XOR-compressed double-array trie. - * - * @par References - * - Shunsuke Kanda, Kazuhiro Morita and Masao Fuketa. Compressed Double-array Tries for String Dictionaries - * Supporting Fast Lookup. Knowledge and Information Systems (KAIS), 51(3): 1023–1042, 2017. - * - * @par Links - * - https://kampersanda.github.io/pdf/KAIS2017.pdf - * - */ +//! A compressed string dictionary based on the XOR-compressed double-array trie. template class trie { public: @@ -53,8 +43,14 @@ class trie { //! Move constructor trie& operator=(trie&&) noexcept = default; + //! Build the trie from the input keywords, which are lexicographically sorted and unique. + //! If bin_mode = false, the NULL character is used for the termination of a keyword. + //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters. + //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
template - explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {} + explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { + static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); + } //! Check the binary mode. inline bool bin_mode() const { @@ -76,11 +72,7 @@ class trie { return m_table.max_length(); } - /** - * Search the given keyword in the trie. - * @param[in] key The query keyword. - * @return The associated ID if found. - */ + //! Lookup the ID of the keyword. inline std::optional lookup(std::string_view key) const { std::uint64_t kpos = 0, npos = 0; while (!m_bcvec.is_leaf(npos)) { @@ -104,11 +96,7 @@ class trie { return npos_to_id(npos); } - /** - * Decode the keyword associated with the given ID. - * @param[in] id The keyword ID. - * @return The keyword associated with the ID. - */ + //! Decode the keyword associated with the ID. inline std::string decode(std::uint64_t id) const { if (num_keys() <= id) { return {}; @@ -133,9 +121,7 @@ class trie { return decoded; } - /** - * An iterator class for common prefix search. - */ + //! An iterator class for common prefix search. class prefix_iterator { private: const trie_type* m_obj = nullptr; @@ -149,16 +135,24 @@ class trie { public: prefix_iterator() = default; + //! Get the next result. + //! If not found, false will be returned. inline bool next() { return m_obj != nullptr && m_obj->next_prefix(this); } + //! Get the ID. inline std::uint64_t id() const { return m_id; } + + //! Get the keyword. inline std::string decoded() const { return std::string(m_key.data(), m_kpos); } + + //! Get the reference to the keyword. + //! Note that the referenced data will be changed in the next step. inline std::string_view decoded_view() const { return std::string_view(m_key.data(), m_kpos); } @@ -169,11 +163,12 @@ class trie { friend class trie; }; - //! 
Make the iterator for the prefix search + //! Make the common prefix searcher for the given keyword. inline prefix_iterator make_prefix_iterator(std::string_view key) const { return prefix_iterator(this, key); } + //! Perform common prefix search for the keyword. inline void prefix_search(std::string_view key, const std::function& fn) const { auto itr = make_prefix_iterator(key); @@ -182,9 +177,7 @@ class trie { } } - /** - * An iterator class for predictive search. - */ + //! An iterator class for predictive search. class predictive_iterator { public: struct cursor_type { @@ -205,16 +198,24 @@ class trie { public: predictive_iterator() = default; + //! Get the next result. + //! If not found, false will be returned. inline bool next() { return m_obj != nullptr && m_obj->next_predictive(this); } + //! Get the ID. inline std::uint64_t id() const { return m_id; } + + //! Get the keyword. inline std::string decoded() const { return m_decoded; } + + //! Get the reference to the keyword. + //! Note that the referenced data will be changed in the next step. inline std::string_view decoded_view() const { return m_decoded; } @@ -225,10 +226,12 @@ class trie { friend class trie; }; + //! Make the predictive searcher for the keyword. inline predictive_iterator make_predictive_iterator(std::string_view key) const { return predictive_iterator(this, key); } + //! Perform predictive search for the keyword. inline void predictive_search(std::string_view key, const std::function& fn) const { auto itr = make_predictive_iterator(key); @@ -237,12 +240,15 @@ class trie { } } + //! An iterator class for enumeration. using enumerative_iterator = predictive_iterator; + //! Make the enumerator. inline enumerative_iterator make_enumerative_iterator() const { return enumerative_iterator(this, ""); } + //! Enumerate all the keywords and their IDs stored in the trie.
inline void enumerate(const std::function& fn) const { auto itr = make_enumerative_iterator(); while (itr.next()) { diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp index 39195fa..efaeda8 100644 --- a/tools/xcdat_predictive_search.cpp +++ b/tools/xcdat_predictive_search.cpp @@ -26,13 +26,11 @@ int predictive_search(const cmd_line_parser::parser& p) { std::vector results; results.reserve(1ULL << 10); - for (std::string str; std::getline(std::cin, str);) { + for (std::string key; std::getline(std::cin, key);) { results.clear(); - - auto itr = trie.make_predictive_iterator(str); - while (itr.next()) { - results.push_back({itr.id(), itr.decoded()}); - } + trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) { + results.push_back({id, std::string(str)}); + }); tfm::printfln("%d found", results.size()); for (std::uint64_t i = 0; i < std::min(results.size(), max_num_results); i++) { diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp index 41141c6..32abcc7 100644 --- a/tools/xcdat_prefix_search.cpp +++ b/tools/xcdat_prefix_search.cpp @@ -25,13 +25,9 @@ int prefix_search(const cmd_line_parser::parser& p) { std::vector results; results.reserve(trie.max_length()); - for (std::string str; std::getline(std::cin, str);) { + for (std::string key; std::getline(std::cin, key);) { results.clear(); - - auto itr = trie.make_prefix_iterator(str); - while (itr.next()) { - results.push_back({itr.id(), itr.decoded_view()}); - } + trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) { results.push_back({id, str}); }); tfm::printfln("%d found", results.size()); for (const auto& r : results) {