Minor modifications

This commit is contained in:
kampersanda 2017-11-11 21:01:10 +09:00
parent f720e3b039
commit 3d103af359
10 changed files with 187 additions and 126 deletions

View file

@ -44,7 +44,7 @@ set(HEADER_FILES
src/Vector.hpp
src/xcdat_basics.hpp
src/xcdat_config.hpp
)
src/xcdat.hpp)
set(SOURCE_FILES
src/BitVector.cpp
@ -61,7 +61,7 @@ set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat)
target_link_libraries(xcdat-exe xcdat)
enable_testing()
file(GLOB TEST_SOURCES src/test*.cpp)
file(GLOB TEST_SOURCES src/*_test.cpp)
foreach(TEST_SOURCE ${TEST_SOURCES})
get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE)
add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE})

View file

@ -234,13 +234,13 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag,
}
}
id_type BitVector::rank(size_t i) const {
id_type BitVector::rank(id_type i) const {
auto& hint = rank_tips_[i / kBitsInR1];
return hint.L1 + hint.L2[i / kBitsInR2 % kR1PerR2]
+ pop_count(bits_[i / 32] & ((1U << (i % 32)) - 1));
}
id_type BitVector::select(size_t i) const {
id_type BitVector::select(id_type i) const {
id_type left = 0, right = static_cast<id_type>(rank_tips_.size());
if (!select_tips_.is_empty()) {

View file

@ -10,20 +10,19 @@ namespace xcdat {
class BitVector {
public:
BitVector() = default;
~BitVector() = default;
explicit BitVector(std::istream &is);
explicit BitVector(BitVectorBuilder& builder,
bool rank_flag, bool select_flag);
BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag);
~BitVector() = default;
bool operator[](size_t i) const {
return (bits_[i / 32] & (1U << (i % 32))) != 0;
}
// the number of 1s in B[0,i).
id_type rank(size_t i) const;
id_type rank(id_type i) const;
// the position of the (i+1)-th occurrence.
id_type select(size_t i) const;
id_type select(id_type i) const;
size_t num_1s() const {
return num_1s_;

View file

@ -17,7 +17,8 @@ FastDacBc::FastDacBc(std::istream& is) {
num_free_nodes_ = read_value<size_t>(is);
}
FastDacBc::FastDacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags) {
FastDacBc::FastDacBc(const std::vector<BcPair>& bc,
BitVectorBuilder& leaf_flags) {
if (bc.empty()) {
return;
}

View file

@ -70,16 +70,16 @@ public:
FastDacBc& operator=(FastDacBc&&) noexcept = default;
private:
Vector<uint8_t> values_L1_ {};
Vector<uint16_t> values_L2_ {};
Vector<uint32_t> values_L3_ {};
Vector <uint8_t> values_L1_{};
Vector <uint16_t> values_L2_{};
Vector <uint32_t> values_L3_{};
#ifdef XCDAT_X64
Vector<uint64_t> values_L4_ {};
#endif
Vector<id_type> ranks_[kLayers - 1] {};
BitVector leaf_flags_ {};
FitVector links_ {};
size_t num_free_nodes_ {};
Vector <id_type> ranks_[kLayers - 1]{};
BitVector leaf_flags_{};
FitVector links_{};
size_t num_free_nodes_{};
id_type access_(id_type i) const;
};

View file

@ -3,8 +3,10 @@
namespace xcdat {
TrieBuilder::TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool binary_mode)
: keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), binary_mode_(binary_mode) {
TrieBuilder::TrieBuilder(const std::vector<Key>& keys,
id_type width_L1, bool binary_mode)
: keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
binary_mode_(binary_mode) {
if (keys_.empty()) {
throw TrieBuilder::Exception("The input data is empty.");
}
@ -13,16 +15,16 @@ TrieBuilder::TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool bi
}
{
size_t init_capacity = 1;
while (init_capacity < keys_.size()) {
init_capacity <<= 1;
size_t init_capa = 1;
while (init_capa < keys_.size()) {
init_capa <<= 1;
}
bc_.reserve(init_capacity);
leaf_flags_.reserve(init_capacity);
term_flags_.reserve(init_capacity);
used_flags_.reserve(init_capacity);
heads_.reserve(init_capacity >> width_L1_);
bc_.reserve(init_capa);
leaf_flags_.reserve(init_capa);
term_flags_.reserve(init_capa);
used_flags_.reserve(init_capa);
heads_.reserve(init_capa >> width_L1_);
}
alphabet_.reserve(256);
@ -94,7 +96,8 @@ void TrieBuilder::build_table_() {
}
}
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node_id) {
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
id_type node_id) {
if (keys_[begin].length == depth) {
term_flags_.set_bit(node_id, true);
if (++begin == end) { // without link?
@ -117,7 +120,9 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node
const auto _label = keys_[str_id].ptr[depth];
if (label != _label) {
if (_label < label) {
throw TrieBuilder::Exception("The input data is not in lexicographical order.");
throw TrieBuilder::Exception(
"The input data is not in lexicographical order."
);
}
edges_.push_back(label);
label = _label;
@ -177,13 +182,16 @@ void TrieBuilder::build_tail_() {
}
size_t match = 0;
while ((match < cur.length()) && (match < prev->length()) && ((*prev)[match] == cur[match])) {
while ((match < cur.length()) && (match < prev->length())
&& ((*prev)[match] == cur[match])) {
++match;
}
if ((match == cur.length()) && (prev->length() != 0)) { // sharing
bc_[cur.node_id].base =
static_cast<id_type>(bc_[prev->node_id].base + (prev->length() - match));
static_cast<id_type>(
bc_[prev->node_id].base + (prev->length() - match)
);
} else { // append
bc_[cur.node_id].base = static_cast<id_type>(tail_.size());
for (size_t j = 0; j < cur.length(); ++j) {
@ -276,7 +284,8 @@ id_type TrieBuilder::find_base_(id_type block_id) const {
}
// search in the same block
for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id; i = bc_[i].base) {
for (auto i = heads_[block_id]; i != kTabooId && i >> width_L1_ == block_id;
i = bc_[i].base) {
const auto base = i ^ table_[edges_[0]];
if (is_target_(base)) {
return base; // base / block_size_ == block_id

View file

@ -23,13 +23,13 @@ public:
// reported by TrieBuilder::Exception. If the keys include the ASCII zero
// code, pass binary_mode = true.
template<bool Fast>
static Trie<Fast> build(const std::vector<Key>& keys,
bool binary_mode = false) {
TrieBuilder builder(keys, Trie<Fast>::BcType::kWidthL1, binary_mode);
static Trie<Fast>
build(const std::vector<Key>& keys, bool binary_mode = false) {
TrieBuilder builder(keys, Trie<Fast>::bc_type::kWidthL1, binary_mode);
Trie<Fast> trie;
trie.bc_ = typename Trie<Fast>::BcType(builder.bc_, builder.leaf_flags_);
trie.bc_ = typename Trie<Fast>::bc_type(builder.bc_, builder.leaf_flags_);
trie.terminal_flags_ = BitVector(builder.term_flags_, true, true);
trie.tail_ = Vector<uint8_t>(builder.tail_);
trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false);
@ -85,22 +85,22 @@ private:
const id_type block_size_;
const id_type width_L1_;
bool binary_mode_ {};
bool binary_mode_{};
std::vector<BcPair> bc_ {};
BitVectorBuilder leaf_flags_ {};
BitVectorBuilder term_flags_ {};
std::vector<uint8_t> tail_ {};
BitVectorBuilder boundary_flags_ {};
std::vector<uint8_t> alphabet_ {};
uint8_t table_[512] {};
std::vector<BcPair> bc_{};
BitVectorBuilder leaf_flags_{};
BitVectorBuilder term_flags_{};
std::vector<uint8_t> tail_{};
BitVectorBuilder boundary_flags_{};
std::vector<uint8_t> alphabet_{};
uint8_t table_[512]{};
std::vector<bool> used_flags_ {};
std::vector<uint8_t> edges_ {};
std::vector<id_type> heads_ {};
std::vector<Suffix> suffixes_ {};
std::vector<bool> used_flags_{};
std::vector<uint8_t> edges_{};
std::vector<id_type> heads_{};
std::vector<Suffix> suffixes_{};
size_t max_length_ {};
size_t max_length_{};
TrieBuilder(const std::vector<Key>& keys, id_type width_L1, bool binary_mode);
~TrieBuilder() = default;

View file

@ -6,7 +6,7 @@
#include <random>
#include <cstring>
#include "TrieBuilder.hpp"
#include "xcdat.hpp"
using namespace xcdat;
@ -75,7 +75,7 @@ void test_basic_operations(const Trie<Fast>& trie, const std::vector<Key>& keys,
for (auto& key : keys) {
const auto id = trie.lookup(key.ptr, key.length);
assert(id != NOT_FOUND);
assert(id != kNotFound);
std::vector<uint8_t> ret;
trie.access(id, ret);
@ -86,7 +86,7 @@ void test_basic_operations(const Trie<Fast>& trie, const std::vector<Key>& keys,
for (auto& other : others) {
const auto id = trie.lookup(other.ptr, other.length);
assert(id == NOT_FOUND);
assert(id == kNotFound);
}
}
@ -96,44 +96,48 @@ void test_prefix_operations(const Trie<Fast>& trie, const std::vector<Key>& keys
std::cerr << "Prefix operations -> common_prefix_lookup()" << std::endl;
for (auto& key : keys) {
std::vector<id_type> ids;
auto num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids);
size_t num_results = 0;
assert(1 <= num_ids);
assert(num_ids <= kMaxLength);
assert(num_ids == ids.size());
auto it = trie.make_prefix_iterator(key.ptr, key.length);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
for (auto id : ids) {
std::vector<uint8_t> ret;
trie.access(id, ret);
assert(ret.size() <= key.length);
assert(dec.second <= key.length);
std::vector<uint8_t> dec2;
trie.access(id, dec2);
assert(dec.second == dec2.size());
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
++num_results;
}
auto limit = num_ids / 2;
auto new_num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids, limit);
assert(new_num_ids == limit);
assert(num_ids + new_num_ids == ids.size());
assert(1 <= num_results);
assert(num_results <= key.length);
}
for (auto& other : others) {
std::vector<id_type> ids;
auto num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids);
size_t num_results = 0;
assert(num_ids <= kMaxLength);
assert(num_ids == ids.size());
auto it = trie.make_prefix_iterator(other.ptr, other.length);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
for (auto id : ids) {
std::vector<uint8_t> ret;
trie.access(id, ret);
assert(ret.size() < other.length);
assert(dec.second < other.length);
std::vector<uint8_t> dec2;
trie.access(id, dec2);
assert(dec.second == dec2.size());
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
++num_results;
}
auto limit = num_ids / 2;
auto new_num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids, limit);
assert(new_num_ids == limit);
assert(num_ids + new_num_ids == ids.size());
assert(num_results < other.length);
}
}
@ -143,42 +147,63 @@ void test_predictive_operations(const Trie<Fast>& trie, const std::vector<Key>&
std::cerr << "Predictive operations -> predictive_lookup()" << std::endl;
for (auto& key : keys) {
std::vector<id_type> ids;
auto num_ids = trie.predictive_lookup(key.ptr, key.length, ids);
size_t num_results = 0;
assert(1 <= num_ids);
assert(num_ids == ids.size());
auto it = trie.make_predictive_iterator(key.ptr, key.length);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
for (auto id : ids) {
std::vector<uint8_t> ret;
trie.access(id, ret);
assert(key.length <= ret.size());
assert(key.length <= dec.second);
std::vector<uint8_t> dec2;
trie.access(id, dec2);
assert(dec.second == dec2.size());
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
++num_results;
}
auto limit = num_ids / 2;
auto new_num_ids = trie.predictive_lookup(key.ptr, key.length, ids, limit);
assert(new_num_ids == limit);
assert(num_ids + new_num_ids == ids.size());
assert(1 <= num_results);
}
for (auto& other : others) {
std::vector<id_type> ids;
auto num_ids = trie.predictive_lookup(other.ptr, other.length, ids);
auto it = trie.make_predictive_iterator(other.ptr, other.length);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(num_ids == ids.size());
assert(other.length < dec.second);
for (auto id : ids) {
std::vector<uint8_t> ret;
trie.access(id, ret);
assert(other.length < ret.size());
std::vector<uint8_t> dec2;
trie.access(id, dec2);
assert(dec.second == dec2.size());
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
}
}
auto limit = num_ids / 2;
auto new_num_ids = trie.predictive_lookup(other.ptr, other.length, ids, limit);
{ // all enumeration
size_t num_results = 0;
assert(new_num_ids == limit);
assert(num_ids + new_num_ids == ids.size());
auto it = trie.make_predictive_iterator(nullptr, 0);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(0 <= dec.second);
std::vector<uint8_t> dec2;
trie.access(id, dec2);
assert(dec.second == dec2.size());
assert(std::memcmp(dec.first, dec2.data(), dec.second) == 0);
++num_results;
}
assert(num_results == trie.num_keys());
}
}

View file

@ -2,7 +2,7 @@
#include <iostream>
#include <random>
#include "TrieBuilder.hpp"
#include "xcdat.hpp"
using namespace xcdat;
@ -143,8 +143,8 @@ int query(std::vector<std::string>& args) {
}
std::string query;
std::vector<id_type> ids;
std::vector<uint8_t> buf;
// std::vector<id_type> ids;
// std::vector<uint8_t> buf;
while (true){
putchar('>');
@ -165,29 +165,47 @@ int query(std::vector<std::string>& args) {
}
std::cout << "common_prefix_lookup()" << std::endl;
ids.clear();
trie.common_prefix_lookup(key, length, ids);
std::cout << ids.size() << " found" << std::endl;
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
buf.clear();
trie.access(ids[i], buf);
std::cout << ids[i] << '\t';
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
{
size_t N = 0;
auto it = trie.make_prefix_iterator(key, length);
while (N < limit && it.next()) {
std::cout << it.id() << '\t';
std::cout.write(reinterpret_cast<const char*>(it.key().first), it.key().second);
std::cout << std::endl;
++N;
}
size_t M = 0;
while (it.next()) {
++M;
}
if (M != 0) {
std::cout << "and more..." << std::endl;
}
std::cout << N + M << " found" << std::endl;
}
std::cout << "predictive_lookup()" << std::endl;
ids.clear();
trie.predictive_lookup(key, length, ids);
std::cout << ids.size() << " found" << std::endl;
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
buf.clear();
trie.access(ids[i], buf);
std::cout << ids[i] << '\t';
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
{
size_t N = 0;
auto it = trie.make_predictive_iterator(key, length);
while (N < limit && it.next()) {
std::cout << it.id() << '\t';
std::cout.write(reinterpret_cast<const char*>(it.key().first), it.key().second);
std::cout << std::endl;
++N;
}
size_t M = 0;
while (it.next()) {
++M;
}
if (M != 0) {
std::cout << "and more..." << std::endl;
}
std::cout << N + M << " found" << std::endl;
}
}

View file

@ -5,4 +5,13 @@
#ifndef XCDAT_XCDAT_HPP
#define XCDAT_XCDAT_HPP
#include "TrieBuilder.hpp"
namespace xcdat {
}
#endif //XCDAT_XCDAT_HPP