Little modify
This commit is contained in:
parent
f720e3b039
commit
3d103af359
|
@ -44,7 +44,7 @@ set(HEADER_FILES
|
|||
src/Vector.hpp
|
||||
src/xcdat_basics.hpp
|
||||
src/xcdat_config.hpp
|
||||
)
|
||||
src/xcdat.hpp)
|
||||
|
||||
set(SOURCE_FILES
|
||||
src/BitVector.cpp
|
||||
|
@ -61,7 +61,7 @@ set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat)
|
|||
target_link_libraries(xcdat-exe xcdat)
|
||||
|
||||
enable_testing()
|
||||
file(GLOB TEST_SOURCES src/test*.cpp)
|
||||
file(GLOB TEST_SOURCES src/*_test.cpp)
|
||||
foreach(TEST_SOURCE ${TEST_SOURCES})
|
||||
get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE)
|
||||
add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE})
|
||||
|
|
|
@ -234,13 +234,13 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag,
|
|||
}
|
||||
}
|
||||
|
||||
id_type BitVector::rank(size_t i) const {
|
||||
id_type BitVector::rank(id_type i) const {
|
||||
auto& hint = rank_tips_[i / kBitsInR1];
|
||||
return hint.L1 + hint.L2[i / kBitsInR2 % kR1PerR2]
|
||||
+ pop_count(bits_[i / 32] & ((1U << (i % 32)) - 1));
|
||||
}
|
||||
|
||||
id_type BitVector::select(size_t i) const {
|
||||
id_type BitVector::select(id_type i) const {
|
||||
id_type left = 0, right = static_cast<id_type>(rank_tips_.size());
|
||||
|
||||
if (!select_tips_.is_empty()) {
|
||||
|
|
|
@ -10,20 +10,19 @@ namespace xcdat {
|
|||
class BitVector {
|
||||
public:
|
||||
BitVector() = default;
|
||||
~BitVector() = default;
|
||||
|
||||
explicit BitVector(std::istream &is);
|
||||
explicit BitVector(BitVectorBuilder& builder,
|
||||
bool rank_flag, bool select_flag);
|
||||
BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag);
|
||||
|
||||
~BitVector() = default;
|
||||
|
||||
bool operator[](size_t i) const {
|
||||
return (bits_[i / 32] & (1U << (i % 32))) != 0;
|
||||
}
|
||||
|
||||
// the number of 1s in B[0,i).
|
||||
id_type rank(size_t i) const;
|
||||
id_type rank(id_type i) const;
|
||||
// the position of the i+1 th occurrence.
|
||||
id_type select(size_t i) const;
|
||||
id_type select(id_type i) const;
|
||||
|
||||
size_t num_1s() const {
|
||||
return num_1s_;
|
||||
|
|
|
@ -17,7 +17,8 @@ FastDacBc::FastDacBc(std::istream& is) {
|
|||
num_free_nodes_ = read_value<size_t>(is);
|
||||
}
|
||||
|
||||
FastDacBc::FastDacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags) {
|
||||
FastDacBc::FastDacBc(const std::vector<BcPair>& bc,
|
||||
BitVectorBuilder& leaf_flags) {
|
||||
if (bc.empty()) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -70,16 +70,16 @@ public:
|
|||
FastDacBc& operator=(FastDacBc&&) noexcept = default;
|
||||
|
||||
private:
|
||||
Vector<uint8_t> values_L1_ {};
|
||||
Vector<uint16_t> values_L2_ {};
|
||||
Vector<uint32_t> values_L3_ {};
|
||||
Vector <uint8_t> values_L1_{};
|
||||
Vector <uint16_t> values_L2_{};
|
||||
Vector <uint32_t> values_L3_{};
|
||||
#ifdef XCDAT_X64
|
||||
Vector<uint64_t> values_L4_ {};
|
||||
#endif
|
||||
Vector<id_type> ranks_[kLayers - 1] {};
|
||||
BitVector leaf_flags_ {};
|
||||
FitVector links_ {};
|
||||
size_t num_free_nodes_ {};
|
||||
Vector <id_type> ranks_[kLayers - 1]{};
|
||||
BitVector leaf_flags_{};
|
||||
FitVector links_{};
|
||||
size_t num_free_nodes_{};
|
||||
|
||||
id_type access_(id_type i) const;
|
||||
};
|
||||
|
|
|
@ -3,8 +3,10 @@
|
|||
|
||||
namespace xcdat {
|
||||
|
||||
TrieBuilder::TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool binary_mode)
|
||||
: keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), binary_mode_(binary_mode) {
|
||||
TrieBuilder::TrieBuilder(const std::vector<Key>& keys,
|
||||
id_type width_L1, bool binary_mode)
|
||||
: keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
|
||||
binary_mode_(binary_mode) {
|
||||
if (keys_.empty()) {
|
||||
throw TrieBuilder::Exception("The input data is empty.");
|
||||
}
|
||||
|
@ -13,16 +15,16 @@ TrieBuilder::TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool bi
|
|||
}
|
||||
|
||||
{
|
||||
size_t init_capacity = 1;
|
||||
while (init_capacity < keys_.size()) {
|
||||
init_capacity <<= 1;
|
||||
size_t init_capa = 1;
|
||||
while (init_capa < keys_.size()) {
|
||||
init_capa <<= 1;
|
||||
}
|
||||
|
||||
bc_.reserve(init_capacity);
|
||||
leaf_flags_.reserve(init_capacity);
|
||||
term_flags_.reserve(init_capacity);
|
||||
used_flags_.reserve(init_capacity);
|
||||
heads_.reserve(init_capacity >> width_L1_);
|
||||
bc_.reserve(init_capa);
|
||||
leaf_flags_.reserve(init_capa);
|
||||
term_flags_.reserve(init_capa);
|
||||
used_flags_.reserve(init_capa);
|
||||
heads_.reserve(init_capa >> width_L1_);
|
||||
}
|
||||
|
||||
alphabet_.reserve(256);
|
||||
|
@ -94,7 +96,8 @@ void TrieBuilder::build_table_() {
|
|||
}
|
||||
}
|
||||
|
||||
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node_id) {
|
||||
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
|
||||
id_type node_id) {
|
||||
if (keys_[begin].length == depth) {
|
||||
term_flags_.set_bit(node_id, true);
|
||||
if (++begin == end) { // without link?
|
||||
|
@ -117,7 +120,9 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node
|
|||
const auto _label = keys_[str_id].ptr[depth];
|
||||
if (label != _label) {
|
||||
if (_label < label) {
|
||||
throw TrieBuilder::Exception("The input data is not in lexicographical order.");
|
||||
throw TrieBuilder::Exception(
|
||||
"The input data is not in lexicographical order."
|
||||
);
|
||||
}
|
||||
edges_.push_back(label);
|
||||
label = _label;
|
||||
|
@ -177,13 +182,16 @@ void TrieBuilder::build_tail_() {
|
|||
}
|
||||
|
||||
size_t match = 0;
|
||||
while ((match < cur.length()) && (match < prev->length()) && ((*prev)[match] == cur[match])) {
|
||||
while ((match < cur.length()) && (match < prev->length())
|
||||
&& ((*prev)[match] == cur[match])) {
|
||||
++match;
|
||||
}
|
||||
|
||||
if ((match == cur.length()) && (prev->length() != 0)) { // sharing
|
||||
bc_[cur.node_id].base =
|
||||
static_cast<id_type>(bc_[prev->node_id].base + (prev->length() - match));
|
||||
static_cast<id_type>(
|
||||
bc_[prev->node_id].base + (prev->length() - match)
|
||||
);
|
||||
} else { // append
|
||||
bc_[cur.node_id].base = static_cast<id_type>(tail_.size());
|
||||
for (size_t j = 0; j < cur.length(); ++j) {
|
||||
|
@ -276,7 +284,8 @@ id_type TrieBuilder::find_base_(id_type block_id) const {
|
|||
}
|
||||
|
||||
// search in the same block
|
||||
for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id; i = bc_[i].base) {
|
||||
for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id;
|
||||
i = bc_[i].base) {
|
||||
const auto base = i ^ table_[edges_[0]];
|
||||
if (is_target_(base)) {
|
||||
return base; // base / block_size_ == block_id
|
||||
|
|
|
@ -23,13 +23,13 @@ public:
|
|||
// reported by TrieBuilder::Exception. If the keys include the ASCII zero
|
||||
// code, pass binary_mode = true.
|
||||
template<bool Fast>
|
||||
static Trie<Fast> build(const std::vector<Key>& keys,
|
||||
bool binary_mode = false) {
|
||||
TrieBuilder builder(keys, Trie<Fast>::BcType::kWidthL1, binary_mode);
|
||||
static Trie<Fast>
|
||||
build(const std::vector<Key>& keys, bool binary_mode = false) {
|
||||
TrieBuilder builder(keys, Trie<Fast>::bc_type::kWidthL1, binary_mode);
|
||||
|
||||
Trie<Fast> trie;
|
||||
|
||||
trie.bc_ = typename Trie<Fast>::BcType(builder.bc_, builder.leaf_flags_);
|
||||
trie.bc_ = typename Trie<Fast>::bc_type(builder.bc_, builder.leaf_flags_);
|
||||
trie.terminal_flags_ = BitVector(builder.term_flags_, true, true);
|
||||
trie.tail_ = Vector<uint8_t>(builder.tail_);
|
||||
trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false);
|
||||
|
@ -85,22 +85,22 @@ private:
|
|||
const id_type block_size_;
|
||||
const id_type width_L1_;
|
||||
|
||||
bool binary_mode_ {};
|
||||
bool binary_mode_{};
|
||||
|
||||
std::vector<BcPair> bc_ {};
|
||||
BitVectorBuilder leaf_flags_ {};
|
||||
BitVectorBuilder term_flags_ {};
|
||||
std::vector<uint8_t> tail_ {};
|
||||
BitVectorBuilder boundary_flags_ {};
|
||||
std::vector<uint8_t> alphabet_ {};
|
||||
uint8_t table_[512] {};
|
||||
std::vector<BcPair> bc_{};
|
||||
BitVectorBuilder leaf_flags_{};
|
||||
BitVectorBuilder term_flags_{};
|
||||
std::vector<uint8_t> tail_{};
|
||||
BitVectorBuilder boundary_flags_{};
|
||||
std::vector<uint8_t> alphabet_{};
|
||||
uint8_t table_[512]{};
|
||||
|
||||
std::vector<bool> used_flags_ {};
|
||||
std::vector<uint8_t> edges_ {};
|
||||
std::vector<id_type> heads_ {};
|
||||
std::vector<Suffix> suffixes_ {};
|
||||
std::vector<bool> used_flags_{};
|
||||
std::vector<uint8_t> edges_{};
|
||||
std::vector<id_type> heads_{};
|
||||
std::vector<Suffix> suffixes_{};
|
||||
|
||||
size_t max_length_ {};
|
||||
size_t max_length_{};
|
||||
|
||||
TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool binary_mode);
|
||||
~TrieBuilder() = default;
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include <random>
|
||||
#include <cstring>
|
||||
|
||||
#include "TrieBuilder.hpp"
|
||||
#include "xcdat.hpp"
|
||||
|
||||
using namespace xcdat;
|
||||
|
||||
|
@ -75,7 +75,7 @@ void test_basic_operations(const Trie<Fast>& trie, const std::vector<Key>& keys,
|
|||
|
||||
for (auto& key : keys) {
|
||||
const auto id = trie.lookup(key.ptr, key.length);
|
||||
assert(id != NOT_FOUND);
|
||||
assert(id != kNotFound);
|
||||
|
||||
std::vector<uint8_t> ret;
|
||||
trie.access(id, ret);
|
||||
|
@ -86,7 +86,7 @@ void test_basic_operations(const Trie<Fast>& trie, const std::vector<Key>& keys,
|
|||
|
||||
for (auto& other : others) {
|
||||
const auto id = trie.lookup(other.ptr, other.length);
|
||||
assert(id == NOT_FOUND);
|
||||
assert(id == kNotFound);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,44 +96,48 @@ void test_prefix_operations(const Trie<Fast>& trie, const std::vector<Key>& keys
|
|||
std::cerr << "Prefix operations -> common_prefix_lookup()" << std::endl;
|
||||
|
||||
for (auto& key : keys) {
|
||||
std::vector<id_type> ids;
|
||||
auto num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids);
|
||||
size_t num_results = 0;
|
||||
|
||||
assert(1 <= num_ids);
|
||||
assert(num_ids <= kMaxLength);
|
||||
assert(num_ids == ids.size());
|
||||
auto it = trie.make_prefix_iterator(key.ptr, key.length);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
for (auto id : ids) {
|
||||
std::vector<uint8_t> ret;
|
||||
trie.access(id, ret);
|
||||
assert(ret.size() <= key.length);
|
||||
assert(dec.second <= key.length);
|
||||
|
||||
std::vector<uint8_t> dec2;
|
||||
trie.access(id, dec2);
|
||||
|
||||
assert(dec.second == dec2.size());
|
||||
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
auto limit = num_ids / 2;
|
||||
auto new_num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids, limit);
|
||||
|
||||
assert(new_num_ids == limit);
|
||||
assert(num_ids + new_num_ids == ids.size());
|
||||
assert(1 <= num_results);
|
||||
assert(num_results <= key.length);
|
||||
}
|
||||
|
||||
for (auto& other : others) {
|
||||
std::vector<id_type> ids;
|
||||
auto num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids);
|
||||
size_t num_results = 0;
|
||||
|
||||
assert(num_ids <= kMaxLength);
|
||||
assert(num_ids == ids.size());
|
||||
auto it = trie.make_prefix_iterator(other.ptr, other.length);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
for (auto id : ids) {
|
||||
std::vector<uint8_t> ret;
|
||||
trie.access(id, ret);
|
||||
assert(ret.size() < other.length);
|
||||
assert(dec.second < other.length);
|
||||
|
||||
std::vector<uint8_t> dec2;
|
||||
trie.access(id, dec2);
|
||||
|
||||
assert(dec.second == dec2.size());
|
||||
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
auto limit = num_ids / 2;
|
||||
auto new_num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids, limit);
|
||||
|
||||
assert(new_num_ids == limit);
|
||||
assert(num_ids + new_num_ids == ids.size());
|
||||
assert(num_results < other.length);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -143,42 +147,63 @@ void test_predictive_operations(const Trie<Fast>& trie, const std::vector<Key>&
|
|||
std::cerr << "Predictive operations -> predictive_lookup()" << std::endl;
|
||||
|
||||
for (auto& key : keys) {
|
||||
std::vector<id_type> ids;
|
||||
auto num_ids = trie.predictive_lookup(key.ptr, key.length, ids);
|
||||
size_t num_results = 0;
|
||||
|
||||
assert(1 <= num_ids);
|
||||
assert(num_ids == ids.size());
|
||||
auto it = trie.make_predictive_iterator(key.ptr, key.length);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
for (auto id : ids) {
|
||||
std::vector<uint8_t> ret;
|
||||
trie.access(id, ret);
|
||||
assert(key.length <= ret.size());
|
||||
assert(key.length <= dec.second);
|
||||
|
||||
std::vector<uint8_t> dec2;
|
||||
trie.access(id, dec2);
|
||||
|
||||
assert(dec.second == dec2.size());
|
||||
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
auto limit = num_ids / 2;
|
||||
auto new_num_ids = trie.predictive_lookup(key.ptr, key.length, ids, limit);
|
||||
|
||||
assert(new_num_ids == limit);
|
||||
assert(num_ids + new_num_ids == ids.size());
|
||||
assert(1 <= num_results);
|
||||
}
|
||||
|
||||
for (auto& other : others) {
|
||||
std::vector<id_type> ids;
|
||||
auto num_ids = trie.predictive_lookup(other.ptr, other.length, ids);
|
||||
auto it = trie.make_predictive_iterator(other.ptr, other.length);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(num_ids == ids.size());
|
||||
assert(other.length < dec.second);
|
||||
|
||||
for (auto id : ids) {
|
||||
std::vector<uint8_t> ret;
|
||||
trie.access(id, ret);
|
||||
assert(other.length < ret.size());
|
||||
std::vector<uint8_t> dec2;
|
||||
trie.access(id, dec2);
|
||||
|
||||
assert(dec.second == dec2.size());
|
||||
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
|
||||
}
|
||||
}
|
||||
|
||||
auto limit = num_ids / 2;
|
||||
auto new_num_ids = trie.predictive_lookup(other.ptr, other.length, ids, limit);
|
||||
{ // all enumeration
|
||||
size_t num_results = 0;
|
||||
|
||||
assert(new_num_ids == limit);
|
||||
assert(num_ids + new_num_ids == ids.size());
|
||||
auto it = trie.make_predictive_iterator(nullptr, 0);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(0 <= dec.second);
|
||||
|
||||
std::vector<uint8_t> dec2;
|
||||
trie.access(id, dec2);
|
||||
|
||||
assert(dec.second == dec2.size());
|
||||
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
assert(num_results == trie.num_keys());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
#include "TrieBuilder.hpp"
|
||||
#include "xcdat.hpp"
|
||||
|
||||
using namespace xcdat;
|
||||
|
||||
|
@ -143,8 +143,8 @@ int query(std::vector<std::string>& args) {
|
|||
}
|
||||
|
||||
std::string query;
|
||||
std::vector<id_type> ids;
|
||||
std::vector<uint8_t> buf;
|
||||
// std::vector<id_type> ids;
|
||||
// std::vector<uint8_t> buf;
|
||||
|
||||
while (true){
|
||||
putchar('>');
|
||||
|
@ -165,29 +165,47 @@ int query(std::vector<std::string>& args) {
|
|||
}
|
||||
|
||||
std::cout << "common_prefix_lookup()" << std::endl;
|
||||
ids.clear();
|
||||
trie.common_prefix_lookup(key, length, ids);
|
||||
std::cout << ids.size() << " found" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
|
||||
buf.clear();
|
||||
trie.access(ids[i], buf);
|
||||
std::cout << ids[i] << '\t';
|
||||
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
|
||||
{
|
||||
size_t N = 0;
|
||||
auto it = trie.make_prefix_iterator(key, length);
|
||||
while (N < limit && it.next()) {
|
||||
std::cout << it.id() << '\t';
|
||||
std::cout.write(reinterpret_cast<const char*>(it.key().first), it.key().second);
|
||||
std::cout << std::endl;
|
||||
++N;
|
||||
}
|
||||
|
||||
size_t M = 0;
|
||||
while (it.next()) {
|
||||
++M;
|
||||
}
|
||||
|
||||
if (M != 0) {
|
||||
std::cout << "and more..." << std::endl;
|
||||
}
|
||||
std::cout << N + M << " found" << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "predictive_lookup()" << std::endl;
|
||||
ids.clear();
|
||||
trie.predictive_lookup(key, length, ids);
|
||||
std::cout << ids.size() << " found" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
|
||||
buf.clear();
|
||||
trie.access(ids[i], buf);
|
||||
std::cout << ids[i] << '\t';
|
||||
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
|
||||
{
|
||||
size_t N = 0;
|
||||
auto it = trie.make_predictive_iterator(key, length);
|
||||
while (N < limit && it.next()) {
|
||||
std::cout << it.id() << '\t';
|
||||
std::cout.write(reinterpret_cast<const char*>(it.key().first), it.key().second);
|
||||
std::cout << std::endl;
|
||||
++N;
|
||||
}
|
||||
|
||||
size_t M = 0;
|
||||
while (it.next()) {
|
||||
++M;
|
||||
}
|
||||
|
||||
if (M != 0) {
|
||||
std::cout << "and more..." << std::endl;
|
||||
}
|
||||
std::cout << N + M << " found" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,4 +5,13 @@
|
|||
#ifndef XCDAT_XCDAT_HPP
|
||||
#define XCDAT_XCDAT_HPP
|
||||
|
||||
#include "TrieBuilder.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif //XCDAT_XCDAT_HPP
|
||||
|
|
Loading…
Reference in a new issue