xcdat/src/TrieBuilder.cpp

315 lines
8.2 KiB
C++
Raw Normal View History

2017-03-29 06:01:06 +00:00
#include <iostream>
2016-12-03 07:51:00 +00:00
#include "TrieBuilder.hpp"
namespace xcdat {
2017-11-11 12:01:10 +00:00
// Builds the complete trie from `keys`.
//
// Steps, in order: validate the input, pre-allocate the working arrays, lay
// out the first 256 slots as a circular doubly linked list of free slots
// (`base` = next, `check` = previous), reserve slot 0 and slot kTabooId, and
// finally run the three build phases (code table, BASE/CHECK, TAIL).
//
//   keys        - the input keys (initializes keys_).
//   width_L1    - log2 of the block size used to index heads_.
//   binary_mode - initial value of binary_mode_.
//
// Throws TrieBuilder::Exception when `keys` is empty or holds more than
// kIdMax entries; the build phases can throw the same exception type.
TrieBuilder::TrieBuilder(const std::vector<Key>& keys,
                         id_type width_L1, bool binary_mode)
    : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
      binary_mode_(binary_mode) {
  if (keys_.empty()) {
    throw TrieBuilder::Exception("The input data is empty.");
  }
  if (keys_.size() > kIdMax) {
    throw TrieBuilder::Exception("Key ID range error.");
  }

  // Pre-allocate: the smallest power of two that is >= the number of keys.
  {
    size_t capacity = 1;
    while (capacity < keys_.size()) {
      capacity <<= 1;
    }

    bc_.reserve(capacity);
    leaf_flags_.reserve(capacity);
    term_flags_.reserve(capacity);
    used_flags_.reserve(capacity);
    heads_.reserve(capacity >> width_L1_);
  }

  alphabet_.reserve(256);
  edges_.reserve(256);
  suffixes_.reserve(keys_.size());

  // Create the first 256 slots, each linked to its neighbours. The ends are
  // patched right after the loop so that the list becomes circular.
  for (id_type slot = 0; slot < 256; ++slot) {
    bc_.push_back({slot + 1, slot - 1});
    leaf_flags_.push_back(false);
    term_flags_.push_back(false);
    used_flags_.push_back(false);
  }
  bc_[255].base = 0;
  bc_[0].check = 255;

  // One list head per block of block_size_ slots.
  for (id_type first = 0; first < 256; first += block_size_) {
    heads_.push_back(first);
  }

  // Slot 0 is taken out of the free list first (it is the node_id passed to
  // build_bc_ below).
  use_(0);
  bc_[0].check = kTabooId;

  // kTabooId is flagged as used without being unlinked, so it stays in the
  // ring; its block's head is moved on to the slot that follows it.
  used_flags_[kTabooId] = true;
  heads_[kTabooId >> width_L1_] = bc_[kTabooId].base;

  build_table_();
  build_bc_(0, keys_.size(), 0, 0);
  build_tail_();
}
void TrieBuilder::build_table_() {
2017-03-29 06:01:06 +00:00
using tb_type = std::pair<uint8_t, size_t>;
2017-07-12 06:48:49 +00:00
tb_type table_builder[256];
2016-12-03 07:51:00 +00:00
for (uint32_t i = 0; i < 256; ++i) {
table_builder[i] = {static_cast<uint8_t>(i), 0};
}
2017-07-12 06:48:49 +00:00
max_length_ = 0;
for (size_t i = 0; i < keys_.size(); ++i) {
for (size_t j = 0; j < keys_[i].length; ++j) {
++table_builder[keys_[i].ptr[j]].second;
2016-12-03 07:51:00 +00:00
}
2017-07-12 06:48:49 +00:00
max_length_ = std::max(max_length_, keys_[i].length);
2016-12-03 07:51:00 +00:00
}
2017-03-29 06:01:06 +00:00
if (table_builder[0].second) { // including '\0'
binary_mode_ = true;
2016-12-03 07:51:00 +00:00
}
for (const auto& item : table_builder) {
if (item.second != 0) {
alphabet_.push_back(item.first);
}
}
alphabet_.shrink_to_fit();
std::sort(std::begin(table_builder), std::end(table_builder),
2017-07-12 06:48:49 +00:00
[](const tb_type& lhs, const tb_type& rhs) {
return lhs.second > rhs.second;
});
2016-12-03 07:51:00 +00:00
for (uint32_t i = 0; i < 256; ++i) {
table_[table_builder[i].first] = static_cast<uint8_t>(i);
}
for (uint32_t i = 0; i < 256; ++i) {
table_[table_[i] + 256] = static_cast<uint8_t>(i);
}
}
2017-11-11 12:01:10 +00:00
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
id_type node_id) {
2017-07-12 06:48:49 +00:00
if (keys_[begin].length == depth) {
2017-03-29 06:01:06 +00:00
term_flags_.set_bit(node_id, true);
2016-12-04 12:19:51 +00:00
if (++begin == end) { // without link?
2017-03-29 06:01:06 +00:00
bc_[node_id].base = 0; // with an empty suffix
leaf_flags_.set_bit(node_id, true);
2016-12-03 07:51:00 +00:00
return;
}
2016-12-04 12:19:51 +00:00
} else if (begin + 1 == end) { // leaf?
2017-03-29 06:01:06 +00:00
term_flags_.set_bit(node_id, true);
leaf_flags_.set_bit(node_id, true);
auto& key = keys_[begin];
2017-07-12 06:48:49 +00:00
suffixes_.push_back({{key.ptr + depth, key.length - depth}, node_id});
2016-12-03 07:51:00 +00:00
return;
}
2016-12-04 12:19:51 +00:00
{ // fetching edges
2016-12-03 07:51:00 +00:00
edges_.clear();
2017-07-12 06:48:49 +00:00
auto label = keys_[begin].ptr[depth];
2016-12-03 07:51:00 +00:00
for (auto str_id = begin + 1; str_id < end; ++str_id) {
2017-07-12 06:48:49 +00:00
const auto _label = keys_[str_id].ptr[depth];
2016-12-03 07:51:00 +00:00
if (label != _label) {
2017-03-29 06:01:06 +00:00
if (_label < label) {
2017-11-11 12:01:10 +00:00
throw TrieBuilder::Exception(
"The input data is not in lexicographical order."
);
2017-03-29 06:01:06 +00:00
}
2016-12-03 07:51:00 +00:00
edges_.push_back(label);
label = _label;
}
}
edges_.push_back(label);
}
2017-03-29 06:01:06 +00:00
const auto base = find_base_(node_id >> width_L1_);
2016-12-03 07:51:00 +00:00
if (bc_.size() <= base) {
expand_();
}
2016-12-04 12:19:51 +00:00
// defining new edges
2016-12-03 07:51:00 +00:00
bc_[node_id].base = base;
for (const auto label : edges_) {
const auto child_id = base ^ table_[label];
use_(child_id);
bc_[child_id].check = node_id;
}
2016-12-04 12:19:51 +00:00
// following the children
2016-12-03 07:51:00 +00:00
auto _begin = begin;
2017-07-12 06:48:49 +00:00
auto label = keys_[begin].ptr[depth];
2016-12-03 07:51:00 +00:00
for (auto _end = begin + 1; _end < end; ++_end) {
2017-07-12 06:48:49 +00:00
const auto _label = keys_[_end].ptr[depth];
2016-12-03 07:51:00 +00:00
if (label != _label) {
build_bc_(_begin, _end, depth + 1, base ^ table_[label]);
label = _label;
_begin = _end;
}
}
build_bc_(_begin, end, depth + 1, base ^ table_[label]);
}
2017-03-29 06:01:06 +00:00
// The algorithm is inspired by marisa-trie
2016-12-03 07:51:00 +00:00
void TrieBuilder::build_tail_() {
2017-07-12 06:48:49 +00:00
std::sort(std::begin(suffixes_), std::end(suffixes_),
[](const Suffix& lhs, const Suffix& rhs) {
return std::lexicographical_compare(lhs.rbegin(), lhs.rend(),
rhs.rbegin(), rhs.rend());
});
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
// For empty suffixes
tail_.emplace_back('\0');
if (binary_mode_) {
boundary_flags_.push_back(false);
}
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
const Suffix dummy = {{nullptr, 0}, 0};
const Suffix* prev = &dummy;
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
for (size_t i = suffixes_.size(); i > 0; --i) {
const Suffix& cur = suffixes_[i - 1];
if (cur.length() == 0) {
throw TrieBuilder::Exception("A suffix is empty.");
}
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
size_t match = 0;
2017-11-11 12:01:10 +00:00
while ((match < cur.length()) && (match < prev->length())
&& ((*prev)[match] == cur[match])) {
2017-03-29 06:01:06 +00:00
++match;
2016-12-03 07:51:00 +00:00
}
2017-03-29 06:01:06 +00:00
if ((match == cur.length()) && (prev->length() != 0)) { // sharing
bc_[cur.node_id].base =
2017-11-11 12:01:10 +00:00
static_cast<id_type>(
bc_[prev->node_id].base + (prev->length() - match)
);
2017-03-29 06:01:06 +00:00
} else { // append
bc_[cur.node_id].base = static_cast<id_type>(tail_.size());
for (size_t j = 0; j < cur.length(); ++j) {
2017-07-12 06:48:49 +00:00
tail_.push_back(cur.str.ptr[j]);
2017-03-29 06:01:06 +00:00
}
if (binary_mode_) {
for (size_t j = 1; j < cur.length(); ++j) {
boundary_flags_.push_back(false);
}
boundary_flags_.push_back(true);
} else {
tail_.emplace_back('\0');
}
2017-07-12 06:48:49 +00:00
if (kIdMax < tail_.size()) {
2017-03-29 06:01:06 +00:00
throw TrieBuilder::Exception("TAIL address range error.");
}
}
prev = &cur;
2016-12-03 07:51:00 +00:00
}
}
void TrieBuilder::expand_() {
2017-07-12 06:48:49 +00:00
if (kIdMax < bc_.size() + 256) {
2017-03-29 06:01:06 +00:00
throw TrieBuilder::Exception("Node ID range error.");
2016-12-03 07:51:00 +00:00
}
2017-03-29 06:01:06 +00:00
const auto old_size = static_cast<id_type>(bc_.size());
2016-12-03 07:51:00 +00:00
const auto new_size = old_size + 256;
for (auto i = old_size; i < new_size; ++i) {
2017-03-29 06:01:06 +00:00
bc_.push_back({i + 1, i - 1});
leaf_flags_.push_back(false);
term_flags_.push_back(false);
used_flags_.push_back(false);
2016-12-03 07:51:00 +00:00
}
{
const auto last = bc_[kTabooId].check;
bc_[old_size].check = last;
bc_[last].base = old_size;
bc_[new_size - 1].base = kTabooId;
bc_[kTabooId].check = new_size - 1;
}
for (auto i = old_size; i < new_size; i += block_size_) {
2017-07-12 06:48:49 +00:00
heads_.push_back(i);
2016-12-03 07:51:00 +00:00
}
const auto block_id = old_size / 256;
if (kFreeBlocks <= block_id) {
close_block_(block_id - kFreeBlocks);
}
}
2017-03-29 06:01:06 +00:00
// Marks slot `node_id` as used and unlinks it from the free list.
//
// For a free slot, bc_[id].base is the next free slot and bc_[id].check the
// previous one. If the slot was the head of its block, the head moves to the
// following free slot when that slot lies in the same block, and becomes
// kTabooId otherwise.
void TrieBuilder::use_(id_type node_id) {
  used_flags_[node_id] = true;

  const auto successor = bc_[node_id].base;
  const auto predecessor = bc_[node_id].check;

  // Bypass node_id in both directions.
  bc_[predecessor].base = successor;
  bc_[successor].check = predecessor;

  const auto block_id = node_id >> width_L1_;
  if (heads_[block_id] != node_id) {
    return;
  }

  if (block_id == successor >> width_L1_) {
    heads_[block_id] = successor;
  } else {
    heads_[block_id] = kTabooId;
  }
}
2017-03-29 06:01:06 +00:00
// Retires the group of 256 slots with index `block_id`, so that no further
// nodes are placed in it.
//
// Every slot that is still free is unlinked from the free list and made to
// point at itself in both base and check; its used flag is cleared again
// afterwards, so the slot keeps reading as unused. All heads_ entries that
// cover the group are set to kTabooId.
void TrieBuilder::close_block_(id_type block_id) {
  const auto first = block_id * 256;
  const auto limit = first + 256;

  for (auto slot = first; slot < limit; ++slot) {
    if (used_flags_[slot]) {
      continue;
    }
    use_(slot);  // unlinks the slot (and sets its used flag)
    bc_[slot].base = slot;
    bc_[slot].check = slot;
    used_flags_[slot] = false;  // undo the flag set by use_()
  }

  for (auto slot = first; slot < limit; slot += block_size_) {
    heads_[slot >> width_L1_] = kTabooId;
  }
}
2017-03-29 06:01:06 +00:00
// Finds a BASE value for the labels currently held in edges_, i.e. a value
// for which every slot `base ^ table_[label]` is unused.
//
// Free slots inside block `block_id` are tried first, then the whole free
// list starting after kTabooId. When the free list is empty, or no candidate
// fits, the returned value is derived from bc_.size(), which lies outside
// the current array.
id_type TrieBuilder::find_base_(id_type block_id) const {
  // Code of the first edge label; a free slot `s` is tested as the home of
  // that first child, which corresponds to base = s ^ first_code.
  const auto first_code = table_[edges_[0]];

  if (bc_[kTabooId].base == kTabooId) {  // Full?
    return static_cast<id_type>(bc_.size()) ^ first_code;
  }

  // search in the same block
  {
    auto slot = heads_[block_id];
    while (slot != kTabooId && slot >> width_L1_ == block_id) {
      const auto base = slot ^ first_code;
      if (is_target_(base)) {
        return base;  // base / block_size_ == block_id
      }
      slot = bc_[slot].base;
    }
  }

  // search through the whole free list
  {
    auto slot = bc_[kTabooId].base;
    while (slot != kTabooId) {
      const auto base = slot ^ first_code;
      if (is_target_(base)) {
        return base;  // base / block_size_ != block_id
      }
      slot = bc_[slot].base;
    }
  }

  return static_cast<id_type>(bc_.size()) ^ first_code;
}
2017-03-29 06:01:06 +00:00
// Returns true when `base` can host all labels currently in edges_, i.e.
// when none of the slots `base ^ table_[label]` is flagged as used.
bool TrieBuilder::is_target_(id_type base) const {
  for (const auto edge : edges_) {
    const auto child_slot = base ^ table_[edge];
    if (used_flags_[child_slot]) {
      return false;  // collision with an occupied slot
    }
  }
  return true;
}
} //namespace - xcdat