2017-03-29 06:01:06 +00:00
|
|
|
#include <iostream>
|
2016-12-03 07:51:00 +00:00
|
|
|
#include "TrieBuilder.hpp"
|
|
|
|
|
|
|
|
namespace xcdat {
|
|
|
|
|
2017-11-11 12:01:10 +00:00
|
|
|
// Builds the complete dictionary from `keys`.
//
// keys        - input keys; an empty container is rejected, as is one holding
//               more than kIdMax entries.
// width_L1    - log2 of the block size used by the heads_ index
//               (block_size_ is 1 << width_L1).
// binary_mode - initial value of binary_mode_ (build_table_ may still turn it
//               on when a key contains a zero byte).
//
// Throws TrieBuilder::Exception when `keys` is empty or too large; exceptions
// raised by the individual build steps are not caught here.
TrieBuilder::TrieBuilder(const std::vector<Key>& keys, id_type width_L1,
                         bool binary_mode)
  : keys_(keys),
    block_size_(1U << width_L1),
    width_L1_(width_L1),
    binary_mode_(binary_mode) {
  const auto num_keys = keys_.size();

  if (num_keys == 0) {
    throw TrieBuilder::Exception("The input data is empty.");
  }
  if (kIdMax < num_keys) {
    throw TrieBuilder::Exception("Key ID range error.");
  }

  // Pre-size the node arrays to the smallest power of two that is not below
  // the number of keys.
  size_t capacity = 1;
  for (; capacity < num_keys; capacity <<= 1) {
  }
  bc_.reserve(capacity);
  leaf_flags_.reserve(capacity);
  term_flags_.reserve(capacity);
  used_flags_.reserve(capacity);
  heads_.reserve(capacity >> width_L1_);

  alphabet_.reserve(256);
  edges_.reserve(256);
  suffixes_.reserve(num_keys);

  // Create the first 256 elements, all unused, each one chained to its
  // neighbours (i + 1 and i - 1; presumably stored as {base, check}).
  for (id_type id = 0; id < 256; ++id) {
    bc_.push_back({id + 1, id - 1});
    leaf_flags_.push_back(false);
    term_flags_.push_back(false);
    used_flags_.push_back(false);
  }
  // Close the chain into a ring: the last element points forward to the
  // first, and the first points back to the last.
  bc_[255].base = 0;
  bc_[0].check = 255;

  // One head entry per run of block_size_ elements.
  for (id_type id = 0; id < 256; id += block_size_) {
    heads_.push_back(id);
  }

  // Element 0 is handed to build_bc_ below as the starting node, so it is
  // taken out of the chain first.
  use_(0);
  bc_[0].check = kTabooId;
  // kTabooId is only flagged as used; it is not unlinked by use_, and the head
  // of its block is moved on to whatever follows it in the chain.
  used_flags_[kTabooId] = true;
  heads_[kTabooId >> width_L1_] = bc_[kTabooId].base;

  build_table_();
  build_bc_(0, num_keys, 0, 0);
  build_tail_();
}
|
|
|
|
|
|
|
|
void TrieBuilder::build_table_() {
|
2017-03-29 06:01:06 +00:00
|
|
|
using tb_type = std::pair<uint8_t, size_t>;
|
2017-07-12 06:48:49 +00:00
|
|
|
tb_type table_builder[256];
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
for (uint32_t i = 0; i < 256; ++i) {
|
|
|
|
table_builder[i] = {static_cast<uint8_t>(i), 0};
|
|
|
|
}
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
max_length_ = 0;
|
|
|
|
for (size_t i = 0; i < keys_.size(); ++i) {
|
|
|
|
for (size_t j = 0; j < keys_[i].length; ++j) {
|
|
|
|
++table_builder[keys_[i].ptr[j]].second;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
2017-07-12 06:48:49 +00:00
|
|
|
max_length_ = std::max(max_length_, keys_[i].length);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
if (table_builder[0].second) { // including '\0'
|
|
|
|
binary_mode_ = true;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& item : table_builder) {
|
|
|
|
if (item.second != 0) {
|
|
|
|
alphabet_.push_back(item.first);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
alphabet_.shrink_to_fit();
|
|
|
|
|
|
|
|
std::sort(std::begin(table_builder), std::end(table_builder),
|
2017-07-12 06:48:49 +00:00
|
|
|
[](const tb_type& lhs, const tb_type& rhs) {
|
|
|
|
return lhs.second > rhs.second;
|
|
|
|
});
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
for (uint32_t i = 0; i < 256; ++i) {
|
|
|
|
table_[table_builder[i].first] = static_cast<uint8_t>(i);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < 256; ++i) {
|
|
|
|
table_[table_[i] + 256] = static_cast<uint8_t>(i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-11 12:01:10 +00:00
|
|
|
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
|
|
|
|
id_type node_id) {
|
2017-07-12 06:48:49 +00:00
|
|
|
if (keys_[begin].length == depth) {
|
2017-03-29 06:01:06 +00:00
|
|
|
term_flags_.set_bit(node_id, true);
|
2016-12-04 12:19:51 +00:00
|
|
|
if (++begin == end) { // without link?
|
2017-03-29 06:01:06 +00:00
|
|
|
bc_[node_id].base = 0; // with an empty suffix
|
|
|
|
leaf_flags_.set_bit(node_id, true);
|
2016-12-03 07:51:00 +00:00
|
|
|
return;
|
|
|
|
}
|
2016-12-04 12:19:51 +00:00
|
|
|
} else if (begin + 1 == end) { // leaf?
|
2017-03-29 06:01:06 +00:00
|
|
|
term_flags_.set_bit(node_id, true);
|
|
|
|
leaf_flags_.set_bit(node_id, true);
|
|
|
|
auto& key = keys_[begin];
|
2017-07-12 06:48:49 +00:00
|
|
|
suffixes_.push_back({{key.ptr + depth, key.length - depth}, node_id});
|
2016-12-03 07:51:00 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-12-04 12:19:51 +00:00
|
|
|
{ // fetching edges
|
2016-12-03 07:51:00 +00:00
|
|
|
edges_.clear();
|
2017-07-12 06:48:49 +00:00
|
|
|
auto label = keys_[begin].ptr[depth];
|
2016-12-03 07:51:00 +00:00
|
|
|
for (auto str_id = begin + 1; str_id < end; ++str_id) {
|
2017-07-12 06:48:49 +00:00
|
|
|
const auto _label = keys_[str_id].ptr[depth];
|
2016-12-03 07:51:00 +00:00
|
|
|
if (label != _label) {
|
2017-03-29 06:01:06 +00:00
|
|
|
if (_label < label) {
|
2017-11-11 12:01:10 +00:00
|
|
|
throw TrieBuilder::Exception(
|
|
|
|
"The input data is not in lexicographical order."
|
|
|
|
);
|
2017-03-29 06:01:06 +00:00
|
|
|
}
|
2016-12-03 07:51:00 +00:00
|
|
|
edges_.push_back(label);
|
|
|
|
label = _label;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
edges_.push_back(label);
|
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
const auto base = find_base_(node_id >> width_L1_);
|
2016-12-03 07:51:00 +00:00
|
|
|
if (bc_.size() <= base) {
|
|
|
|
expand_();
|
|
|
|
}
|
|
|
|
|
2016-12-04 12:19:51 +00:00
|
|
|
// defining new edges
|
2016-12-03 07:51:00 +00:00
|
|
|
bc_[node_id].base = base;
|
|
|
|
for (const auto label : edges_) {
|
|
|
|
const auto child_id = base ^ table_[label];
|
|
|
|
use_(child_id);
|
|
|
|
bc_[child_id].check = node_id;
|
|
|
|
}
|
|
|
|
|
2016-12-04 12:19:51 +00:00
|
|
|
// following the children
|
2016-12-03 07:51:00 +00:00
|
|
|
auto _begin = begin;
|
2017-07-12 06:48:49 +00:00
|
|
|
auto label = keys_[begin].ptr[depth];
|
2016-12-03 07:51:00 +00:00
|
|
|
for (auto _end = begin + 1; _end < end; ++_end) {
|
2017-07-12 06:48:49 +00:00
|
|
|
const auto _label = keys_[_end].ptr[depth];
|
2016-12-03 07:51:00 +00:00
|
|
|
if (label != _label) {
|
|
|
|
build_bc_(_begin, _end, depth + 1, base ^ table_[label]);
|
|
|
|
label = _label;
|
|
|
|
_begin = _end;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
build_bc_(_begin, end, depth + 1, base ^ table_[label]);
|
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// The algorithm is inspired by marisa-trie
|
2016-12-03 07:51:00 +00:00
|
|
|
void TrieBuilder::build_tail_() {
|
2017-07-12 06:48:49 +00:00
|
|
|
std::sort(std::begin(suffixes_), std::end(suffixes_),
|
|
|
|
[](const Suffix& lhs, const Suffix& rhs) {
|
|
|
|
return std::lexicographical_compare(lhs.rbegin(), lhs.rend(),
|
|
|
|
rhs.rbegin(), rhs.rend());
|
|
|
|
});
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// For empty suffixes
|
|
|
|
tail_.emplace_back('\0');
|
|
|
|
if (binary_mode_) {
|
|
|
|
boundary_flags_.push_back(false);
|
|
|
|
}
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
const Suffix dummy = {{nullptr, 0}, 0};
|
|
|
|
const Suffix* prev = &dummy;
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
for (size_t i = suffixes_.size(); i > 0; --i) {
|
|
|
|
const Suffix& cur = suffixes_[i - 1];
|
|
|
|
if (cur.length() == 0) {
|
|
|
|
throw TrieBuilder::Exception("A suffix is empty.");
|
|
|
|
}
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
size_t match = 0;
|
2017-11-11 12:01:10 +00:00
|
|
|
while ((match < cur.length()) && (match < prev->length())
|
|
|
|
&& ((*prev)[match] == cur[match])) {
|
2017-03-29 06:01:06 +00:00
|
|
|
++match;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
if ((match == cur.length()) && (prev->length() != 0)) { // sharing
|
|
|
|
bc_[cur.node_id].base =
|
2017-11-11 12:01:10 +00:00
|
|
|
static_cast<id_type>(
|
|
|
|
bc_[prev->node_id].base + (prev->length() - match)
|
|
|
|
);
|
2017-03-29 06:01:06 +00:00
|
|
|
} else { // append
|
|
|
|
bc_[cur.node_id].base = static_cast<id_type>(tail_.size());
|
|
|
|
for (size_t j = 0; j < cur.length(); ++j) {
|
2017-07-12 06:48:49 +00:00
|
|
|
tail_.push_back(cur.str.ptr[j]);
|
2017-03-29 06:01:06 +00:00
|
|
|
}
|
|
|
|
if (binary_mode_) {
|
|
|
|
for (size_t j = 1; j < cur.length(); ++j) {
|
|
|
|
boundary_flags_.push_back(false);
|
|
|
|
}
|
|
|
|
boundary_flags_.push_back(true);
|
|
|
|
} else {
|
|
|
|
tail_.emplace_back('\0');
|
|
|
|
}
|
2017-07-12 06:48:49 +00:00
|
|
|
if (kIdMax < tail_.size()) {
|
2017-03-29 06:01:06 +00:00
|
|
|
throw TrieBuilder::Exception("TAIL address range error.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prev = &cur;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void TrieBuilder::expand_() {
|
2017-07-12 06:48:49 +00:00
|
|
|
if (kIdMax < bc_.size() + 256) {
|
2017-03-29 06:01:06 +00:00
|
|
|
throw TrieBuilder::Exception("Node ID range error.");
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
const auto old_size = static_cast<id_type>(bc_.size());
|
2016-12-03 07:51:00 +00:00
|
|
|
const auto new_size = old_size + 256;
|
|
|
|
|
|
|
|
for (auto i = old_size; i < new_size; ++i) {
|
2017-03-29 06:01:06 +00:00
|
|
|
bc_.push_back({i + 1, i - 1});
|
|
|
|
leaf_flags_.push_back(false);
|
|
|
|
term_flags_.push_back(false);
|
|
|
|
used_flags_.push_back(false);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
const auto last = bc_[kTabooId].check;
|
|
|
|
bc_[old_size].check = last;
|
|
|
|
bc_[last].base = old_size;
|
|
|
|
bc_[new_size - 1].base = kTabooId;
|
|
|
|
bc_[kTabooId].check = new_size - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto i = old_size; i < new_size; i += block_size_) {
|
2017-07-12 06:48:49 +00:00
|
|
|
heads_.push_back(i);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
const auto block_id = old_size / 256;
|
|
|
|
if (kFreeBlocks <= block_id) {
|
|
|
|
close_block_(block_id - kFreeBlocks);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// Marks `node_id` as used and unlinks it from the chain of unused elements
// (base holds the following element, check the preceding one). If the element
// was the head of its block, the head moves to the following element when
// that one lies in the same block, and becomes kTabooId otherwise.
void TrieBuilder::use_(id_type node_id) {
  used_flags_[node_id] = true;

  // Bridge the neighbours over the removed element.
  const auto successor = bc_[node_id].base;
  const auto predecessor = bc_[node_id].check;
  bc_[predecessor].base = successor;
  bc_[successor].check = predecessor;

  const auto block = node_id >> width_L1_;
  if (heads_[block] != node_id) {
    return;  // the block head is some other element; nothing to update
  }

  if (block == (successor >> width_L1_)) {
    heads_[block] = successor;
  } else {
    heads_[block] = kTabooId;
  }
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// Retires the 256-element block `block_id`: every element that is still
// unused is unlinked from the chain of unused elements and made to point at
// itself, and all head entries covering the block are set to kTabooId.
void TrieBuilder::close_block_(id_type block_id) {
  const auto first = block_id * 256;
  const auto past_last = first + 256;

  for (auto id = first; id < past_last; ++id) {
    if (used_flags_[id]) {
      continue;
    }
    // use_ performs the unlinking but also sets the used flag; the element is
    // not really occupied, so the flag is cleared again afterwards.
    use_(id);
    bc_[id].base = id;
    bc_[id].check = id;
    used_flags_[id] = false;
  }

  // use_ may have moved heads around while unlinking; overwrite them all.
  for (auto id = first; id < past_last; id += block_size_) {
    heads_[id >> width_L1_] = kTabooId;
  }
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// Chooses a base value under which every label currently in edges_ maps to
// an unused element.
//
// block_id - block that is searched first, before the full chain is tried.
//
// Returns a base found among the unused elements or, when none fits (or no
// unused element is left), a base derived from bc_.size(), i.e. beyond the
// current end of the array.
id_type TrieBuilder::find_base_(id_type block_id) const {
  // Candidates are built so that the first edge lands on the visited element.
  const auto first_code = table_[edges_[0]];

  if (bc_[kTabooId].base == kTabooId) { // Full?
    return static_cast<id_type>(bc_.size()) ^ first_code;
  }

  // search in the same block
  auto slot = heads_[block_id];
  while (slot != kTabooId && (slot >> width_L1_) == block_id) {
    const auto candidate = slot ^ first_code;
    if (is_target_(candidate)) {
      return candidate; // candidate / block_size_ == block_id
    }
    slot = bc_[slot].base;
  }

  // Fall back to walking the whole chain, starting after kTabooId.
  slot = bc_[kTabooId].base;
  while (slot != kTabooId) {
    const auto candidate = slot ^ first_code;
    if (is_target_(candidate)) {
      return candidate; // candidate / block_size_ != block_id
    }
    slot = bc_[slot].base;
  }

  return static_cast<id_type>(bc_.size()) ^ first_code;
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// Returns true when `base` is usable for the labels in edges_, that is, when
// none of the elements `base ^ table_[label]` is flagged as used.
bool TrieBuilder::is_target_(id_type base) const {
  for (size_t k = 0; k < edges_.size(); ++k) {
    const auto slot = base ^ table_[edges_[k]];
    if (used_flags_[slot]) {
      return false;  // one collision is enough to reject this base
    }
  }
  return true;
}
|
|
|
|
|
|
|
|
} //namespace - xcdat
|