diff --git a/CMakeLists.txt b/CMakeLists.txt index 42e5fcd..ab1514f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ set(HEADER_FILES src/Vector.hpp src/xcdat_basics.hpp src/xcdat_config.hpp - ) + src/xcdat.hpp) set(SOURCE_FILES src/BitVector.cpp @@ -61,7 +61,7 @@ set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat) target_link_libraries(xcdat-exe xcdat) enable_testing() -file(GLOB TEST_SOURCES src/test*.cpp) +file(GLOB TEST_SOURCES src/*_test.cpp) foreach(TEST_SOURCE ${TEST_SOURCES}) get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE) add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE}) diff --git a/src/BitVector.cpp b/src/BitVector.cpp index 86f640b..8c86eca 100644 --- a/src/BitVector.cpp +++ b/src/BitVector.cpp @@ -234,13 +234,13 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, } } -id_type BitVector::rank(size_t i) const { +id_type BitVector::rank(id_type i) const { auto& hint = rank_tips_[i / kBitsInR1]; return hint.L1 + hint.L2[i / kBitsInR2 % kR1PerR2] + pop_count(bits_[i / 32] & ((1U << (i % 32)) - 1)); } -id_type BitVector::select(size_t i) const { +id_type BitVector::select(id_type i) const { id_type left = 0, right = static_cast(rank_tips_.size()); if (!select_tips_.is_empty()) { diff --git a/src/BitVector.hpp b/src/BitVector.hpp index 8636bed..e8c9bb0 100644 --- a/src/BitVector.hpp +++ b/src/BitVector.hpp @@ -10,20 +10,19 @@ namespace xcdat { class BitVector { public: BitVector() = default; - ~BitVector() = default; - explicit BitVector(std::istream &is); - explicit BitVector(BitVectorBuilder& builder, - bool rank_flag, bool select_flag); + BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag); + + ~BitVector() = default; bool operator[](size_t i) const { return (bits_[i / 32] & (1U << (i % 32))) != 0; } // the number of 1s in B[0,i). - id_type rank(size_t i) const; + id_type rank(id_type i) const; // the position of the i+1 th occurrence. - id_type select(size_t i) const; + id_type select(id_type i) const; size_t num_1s() const { return num_1s_; diff --git a/src/FastDacBc.cpp b/src/FastDacBc.cpp index c77e24a..f11c88f 100644 --- a/src/FastDacBc.cpp +++ b/src/FastDacBc.cpp @@ -17,7 +17,8 @@ FastDacBc::FastDacBc(std::istream& is) { num_free_nodes_ = read_value(is); } -FastDacBc::FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags) { +FastDacBc::FastDacBc(const std::vector& bc, + BitVectorBuilder& leaf_flags) { if (bc.empty()) { return; } diff --git a/src/FastDacBc.hpp b/src/FastDacBc.hpp index d7b5691..5bde871 100644 --- a/src/FastDacBc.hpp +++ b/src/FastDacBc.hpp @@ -70,16 +70,16 @@ public: FastDacBc& operator=(FastDacBc&&) noexcept = default; private: - Vector values_L1_ {}; - Vector values_L2_ {}; - Vector values_L3_ {}; + Vector values_L1_{}; + Vector values_L2_{}; + Vector values_L3_{}; #ifdef XCDAT_X64 Vector values_L4_ {}; #endif - Vector ranks_[kLayers - 1] {}; - BitVector leaf_flags_ {}; - FitVector links_ {}; - size_t num_free_nodes_ {}; + Vector ranks_[kLayers - 1]{}; + BitVector leaf_flags_{}; + FitVector links_{}; + size_t num_free_nodes_{}; id_type access_(id_type i) const; }; diff --git a/src/TrieBuilder.cpp b/src/TrieBuilder.cpp index 22a4097..2248a52 100644 --- a/src/TrieBuilder.cpp +++ b/src/TrieBuilder.cpp @@ -3,8 +3,10 @@ namespace xcdat { -TrieBuilder::TrieBuilder(const std::vector& keys, id_type width_L1, bool binary_mode) - : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), binary_mode_(binary_mode) { +TrieBuilder::TrieBuilder(const std::vector& keys, + id_type width_L1, bool binary_mode) + : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), + binary_mode_(binary_mode) { if (keys_.empty()) { throw TrieBuilder::Exception("The input data is empty."); } @@ -13,16 +15,16 @@ TrieBuilder::TrieBuilder(const std::vector& keys, id_type width_L1, bool bi } { - size_t init_capacity = 1; - while (init_capacity < keys_.size()) { - init_capacity <<= 1; + size_t init_capa = 1; + while (init_capa < keys_.size()) { + init_capa <<= 1; } - bc_.reserve(init_capacity); - leaf_flags_.reserve(init_capacity); - term_flags_.reserve(init_capacity); - used_flags_.reserve(init_capacity); - heads_.reserve(init_capacity >> width_L1_); + bc_.reserve(init_capa); + leaf_flags_.reserve(init_capa); + term_flags_.reserve(init_capa); + used_flags_.reserve(init_capa); + heads_.reserve(init_capa >> width_L1_); } alphabet_.reserve(256); @@ -94,7 +96,8 @@ void TrieBuilder::build_table_() { } } -void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node_id) { +void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, + id_type node_id) { if (keys_[begin].length == depth) { term_flags_.set_bit(node_id, true); if (++begin == end) { // without link? @@ -117,7 +120,9 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node const auto _label = keys_[str_id].ptr[depth]; if (label != _label) { if (_label < label) { - throw TrieBuilder::Exception("The input data is not in lexicographical order."); + throw TrieBuilder::Exception( + "The input data is not in lexicographical order." + ); } edges_.push_back(label); label = _label; @@ -177,13 +182,16 @@ void TrieBuilder::build_tail_() { } size_t match = 0; - while ((match < cur.length()) && (match < prev->length()) && ((*prev)[match] == cur[match])) { + while ((match < cur.length()) && (match < prev->length()) + && ((*prev)[match] == cur[match])) { ++match; } if ((match == cur.length()) && (prev->length() != 0)) { // sharing bc_[cur.node_id].base = - static_cast(bc_[prev->node_id].base + (prev->length() - match)); + static_cast( + bc_[prev->node_id].base + (prev->length() - match) + ); } else { // append bc_[cur.node_id].base = static_cast(tail_.size()); for (size_t j = 0; j < cur.length(); ++j) { @@ -276,7 +284,8 @@ id_type TrieBuilder::find_base_(id_type block_id) const { } // search in the same block - for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id; i = bc_[i].base) { + for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id; + i = bc_[i].base) { const auto base = i ^ table_[edges_[0]]; if (is_target_(base)) { return base; // base / block_size_ == block_id diff --git a/src/TrieBuilder.hpp b/src/TrieBuilder.hpp index 291a2a1..d0dab68 100644 --- a/src/TrieBuilder.hpp +++ b/src/TrieBuilder.hpp @@ -23,13 +23,13 @@ public: // reported by TrieBuilder::Exception. If the keys include the ASCII zero // code, pass binary_mode = true. template - static Trie build(const std::vector& keys, - bool binary_mode = false) { - TrieBuilder builder(keys, Trie::BcType::kWidthL1, binary_mode); + static Trie + build(const std::vector& keys, bool binary_mode = false) { + TrieBuilder builder(keys, Trie::bc_type::kWidthL1, binary_mode); Trie trie; - trie.bc_ = typename Trie::BcType(builder.bc_, builder.leaf_flags_); + trie.bc_ = typename Trie::bc_type(builder.bc_, builder.leaf_flags_); trie.terminal_flags_ = BitVector(builder.term_flags_, true, true); trie.tail_ = Vector(builder.tail_); trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false); @@ -85,22 +85,22 @@ private: const id_type block_size_; const id_type width_L1_; - bool binary_mode_ {}; + bool binary_mode_{}; - std::vector bc_ {}; - BitVectorBuilder leaf_flags_ {}; - BitVectorBuilder term_flags_ {}; - std::vector tail_ {}; - BitVectorBuilder boundary_flags_ {}; - std::vector alphabet_ {}; - uint8_t table_[512] {}; + std::vector bc_{}; + BitVectorBuilder leaf_flags_{}; + BitVectorBuilder term_flags_{}; + std::vector tail_{}; + BitVectorBuilder boundary_flags_{}; + std::vector alphabet_{}; + uint8_t table_[512]{}; - std::vector used_flags_ {}; - std::vector edges_ {}; - std::vector heads_ {}; - std::vector suffixes_ {}; + std::vector used_flags_{}; + std::vector edges_{}; + std::vector heads_{}; + std::vector suffixes_{}; - size_t max_length_ {}; + size_t max_length_{}; TrieBuilder(const std::vector& keys, id_type width_L1, bool binary_mode); ~TrieBuilder() = default; diff --git a/src/tries_test.cpp b/src/tries_test.cpp index 3bb79bc..939eb73 100644 --- a/src/tries_test.cpp +++ b/src/tries_test.cpp @@ -6,7 +6,7 @@ #include #include -#include "TrieBuilder.hpp" +#include "xcdat.hpp" using namespace xcdat; @@ -75,7 +75,7 @@ void test_basic_operations(const Trie& trie, const std::vector& keys, for (auto& key : keys) { const auto id = trie.lookup(key.ptr, key.length); - assert(id != NOT_FOUND); + assert(id != kNotFound); std::vector ret; trie.access(id, ret); @@ -86,7 +86,7 @@ void test_basic_operations(const Trie& trie, const std::vector& keys, for (auto& other : others) { const auto id = trie.lookup(other.ptr, other.length); - assert(id == NOT_FOUND); + assert(id == kNotFound); } } @@ -96,44 +96,48 @@ void test_prefix_operations(const Trie& trie, const std::vector& keys std::cerr << "Prefix operations -> common_prefix_lookup()" << std::endl; for (auto& key : keys) { - std::vector ids; - auto num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids); + size_t num_results = 0; - assert(1 <= num_ids); - assert(num_ids <= kMaxLength); - assert(num_ids == ids.size()); + auto it = trie.make_prefix_iterator(key.ptr, key.length); + while (it.next()) { + auto id = it.id(); + auto dec = it.key(); - for (auto id : ids) { - std::vector ret; - trie.access(id, ret); - assert(ret.size() <= key.length); + assert(dec.second <= key.length); + + std::vector dec2; + trie.access(id, dec2); + + assert(dec.second == dec2.size()); + assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0); + + ++num_results; } - auto limit = num_ids / 2; - auto new_num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids, limit); - - assert(new_num_ids == limit); - assert(num_ids + new_num_ids == ids.size()); + assert(1 <= num_results); + assert(num_results <= key.length); } for (auto& other : others) { - std::vector ids; - auto num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids); + size_t num_results = 0; - assert(num_ids <= kMaxLength); - assert(num_ids == ids.size()); + auto it = trie.make_prefix_iterator(other.ptr, other.length); + while (it.next()) { + auto id = it.id(); + auto dec = it.key(); - for (auto id : ids) { - std::vector ret; - trie.access(id, ret); - assert(ret.size() < other.length); + assert(dec.second < other.length); + + std::vector dec2; + trie.access(id, dec2); + + assert(dec.second == dec2.size()); + assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0); + + ++num_results; } - auto limit = num_ids / 2; - auto new_num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids, limit); - - assert(new_num_ids == limit); - assert(num_ids + new_num_ids == ids.size()); + assert(num_results < other.length); } } @@ -143,42 +147,63 @@ void test_predictive_operations(const Trie& trie, const std::vector& std::cerr << "Predictive operations -> predictive_lookup()" << std::endl; for (auto& key : keys) { - std::vector ids; - auto num_ids = trie.predictive_lookup(key.ptr, key.length, ids); + size_t num_results = 0; - assert(1 <= num_ids); - assert(num_ids == ids.size()); + auto it = trie.make_predictive_iterator(key.ptr, key.length); + while (it.next()) { + auto id = it.id(); + auto dec = it.key(); - for (auto id : ids) { - std::vector ret; - trie.access(id, ret); - assert(key.length <= ret.size()); + assert(key.length <= dec.second); + + std::vector dec2; + trie.access(id, dec2); + + assert(dec.second == dec2.size()); + assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0); + + ++num_results; } - auto limit = num_ids / 2; - auto new_num_ids = trie.predictive_lookup(key.ptr, key.length, ids, limit); - - assert(new_num_ids == limit); - assert(num_ids + new_num_ids == ids.size()); + assert(1 <= num_results); } for (auto& other : others) { - std::vector ids; - auto num_ids = trie.predictive_lookup(other.ptr, other.length, ids); + auto it = trie.make_predictive_iterator(other.ptr, other.length); + while (it.next()) { + auto id = it.id(); + auto dec = it.key(); - assert(num_ids == ids.size()); + assert(other.length < dec.second); - for (auto id : ids) { - std::vector ret; - trie.access(id, ret); - assert(other.length < ret.size()); + std::vector dec2; + trie.access(id, dec2); + + assert(dec.second == dec2.size()); + assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0); + } + } + + { // all enumeration + size_t num_results = 0; + + auto it = trie.make_predictive_iterator(nullptr, 0); + while (it.next()) { + auto id = it.id(); + auto dec = it.key(); + + assert(0 <= dec.second); + + std::vector dec2; + trie.access(id, dec2); + + assert(dec.second == dec2.size()); + assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0); + + ++num_results; } - auto limit = num_ids / 2; - auto new_num_ids = trie.predictive_lookup(other.ptr, other.length, ids, limit); - - assert(new_num_ids == limit); - assert(num_ids + new_num_ids == ids.size()); + assert(num_results == trie.num_keys()); } } diff --git a/src/xcdat.cpp b/src/xcdat.cpp index 410eca6..553c513 100644 --- a/src/xcdat.cpp +++ b/src/xcdat.cpp @@ -2,7 +2,7 @@ #include #include -#include "TrieBuilder.hpp" +#include "xcdat.hpp" using namespace xcdat; @@ -143,8 +143,8 @@ int query(std::vector& args) { } std::string query; - std::vector ids; - std::vector buf; +// std::vector ids; +// std::vector buf; while (true){ putchar('>'); @@ -165,29 +165,47 @@ int query(std::vector& args) { } std::cout << "common_prefix_lookup()" << std::endl; - ids.clear(); - trie.common_prefix_lookup(key, length, ids); - std::cout << ids.size() << " found" << std::endl; + { + size_t N = 0; + auto it = trie.make_prefix_iterator(key, length); + while (N < limit && it.next()) { + std::cout << it.id() << '\t'; + std::cout.write(reinterpret_cast(it.key().first), it.key().second); + std::cout << std::endl; + ++N; + } - for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { - buf.clear(); - trie.access(ids[i], buf); - std::cout << ids[i] << '\t'; - std::cout.write(reinterpret_cast(buf.data()), buf.size()); - std::cout << std::endl; + size_t M = 0; + while (it.next()) { + ++M; + } + + if (M != 0) { + std::cout << "and more..." << std::endl; + } + std::cout << N + M << " found" << std::endl; } std::cout << "predictive_lookup()" << std::endl; - ids.clear(); - trie.predictive_lookup(key, length, ids); - std::cout << ids.size() << " found" << std::endl; + { + size_t N = 0; + auto it = trie.make_predictive_iterator(key, length); + while (N < limit && it.next()) { + std::cout << it.id() << '\t'; + std::cout.write(reinterpret_cast(it.key().first), it.key().second); + std::cout << std::endl; + ++N; + } - for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { - buf.clear(); - trie.access(ids[i], buf); - std::cout << ids[i] << '\t'; - std::cout.write(reinterpret_cast(buf.data()), buf.size()); - std::cout << std::endl; + size_t M = 0; + while (it.next()) { + ++M; + } + + if (M != 0) { + std::cout << "and more..." << std::endl; + } + std::cout << N + M << " found" << std::endl; } } diff --git a/src/xcdat.hpp b/src/xcdat.hpp index 7a46575..c173ce4 100644 --- a/src/xcdat.hpp +++ b/src/xcdat.hpp @@ -5,4 +5,13 @@ #ifndef XCDAT_XCDAT_HPP #define XCDAT_XCDAT_HPP +#include "TrieBuilder.hpp" + +namespace xcdat { + + + +} + + #endif //XCDAT_XCDAT_HPP