xcdat/lib/TrieBuilder.cpp

318 lines
8.3 KiB
C++
Raw Normal View History

2017-03-29 06:01:06 +00:00
#include <iostream>
2017-11-12 11:49:13 +00:00
#include "xcdat/TrieBuilder.hpp"
2016-12-03 07:51:00 +00:00
namespace xcdat {
2017-11-12 11:49:13 +00:00
/**
 * Builds the whole dictionary from `keys` in one go.
 *
 * Validates the key set, pre-sizes the working containers, creates the first
 * 256 node slots as a circular doubly-linked free list (`base` acts as the
 * "next" link and `check` as the "previous" link, see use_()), and then runs
 * build_table_(), build_bc_() and build_tail_() in that order.
 *
 * @param keys      Keys to register. build_bc_() rejects key sets that are
 *                  not in lexicographical order.
 * @param width_L1  log2 of the block size used for the `heads_` index
 *                  (`block_size_` is `1 << width_L1`).
 * @param bin_mode  Initial value of `bin_mode_`; build_table_() forces it to
 *                  true when any key contains a '\0' byte.
 * @throws TrieBuilder::Exception if `keys` is empty, if it holds more than
 *         ID_MAX keys, or if any of the build steps fails.
 */
TrieBuilder::TrieBuilder(const std::vector<std::string_view>& keys,
                         id_type width_L1, bool bin_mode)
    : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
      bin_mode_(bin_mode) {
  if (keys_.empty()) {
    throw TrieBuilder::Exception("The input data is empty.");
  }
  if (keys_.size() > ID_MAX) {
    throw TrieBuilder::Exception("Key ID range error.");
  }

  // Initial capacity: the smallest power of two that is >= the key count.
  size_t init_capa = 1;
  for (; init_capa < keys_.size(); init_capa <<= 1) {
  }

  bc_.reserve(init_capa);
  leaf_flags_.reserve(init_capa);
  term_flags_.reserve(init_capa);
  used_flags_.reserve(init_capa);
  heads_.reserve(init_capa >> width_L1_);

  alphabet_.reserve(256);
  edges_.reserve(256);
  suffixes_.reserve(keys_.size());

  // Create the first 256 slots, each linked to its neighbours. For slot 0 the
  // expression `slot - 1` is only a placeholder; it is overwritten just below.
  for (id_type slot = 0; slot < 256; ++slot) {
    bc_.push_back({slot + 1, slot - 1});
    leaf_flags_.push_back(false);
    term_flags_.push_back(false);
    used_flags_.push_back(false);
  }
  // Close the ring: the last slot points forward to 0, slot 0 back to 255.
  bc_[255].base = 0;
  bc_[0].check = 255;

  // Every block initially starts its in-block free list at its first slot.
  for (id_type first = 0; first < 256; first += block_size_) {
    heads_.push_back(first);
  }

  // Slot 0 is taken out of the free list (build_bc_() is started on it below).
  use_(0);

  // TABOO_ID stays linked in the ring, where find_base_() uses it as the
  // start/stop sentinel, but it is flagged as used so that is_target_() never
  // accepts it as a child slot.
  bc_[0].check = TABOO_ID;
  used_flags_[TABOO_ID] = true;
  heads_[TABOO_ID >> width_L1_] = bc_[TABOO_ID].base;

  build_table_();
  build_bc_(0, keys_.size(), 0, 0);
  build_tail_();
}
void TrieBuilder::build_table_() {
2017-03-29 06:01:06 +00:00
using tb_type = std::pair<uint8_t, size_t>;
2017-07-12 06:48:49 +00:00
tb_type table_builder[256];
2016-12-03 07:51:00 +00:00
for (uint32_t i = 0; i < 256; ++i) {
table_builder[i] = {static_cast<uint8_t>(i), 0};
}
2017-07-12 06:48:49 +00:00
max_length_ = 0;
2017-11-12 11:49:13 +00:00
for (auto& key : keys_) {
for (char c : key) {
++table_builder[static_cast<uint8_t>(c)].second;
2016-12-03 07:51:00 +00:00
}
2017-11-12 11:49:13 +00:00
max_length_ = std::max(max_length_, key.length());
2016-12-03 07:51:00 +00:00
}
2017-11-12 11:49:13 +00:00
if (table_builder[0].second != 0) { // including '\0'
bin_mode_ = true;
2016-12-03 07:51:00 +00:00
}
for (const auto& item : table_builder) {
if (item.second != 0) {
alphabet_.push_back(item.first);
}
}
alphabet_.shrink_to_fit();
std::sort(std::begin(table_builder), std::end(table_builder),
2017-07-12 06:48:49 +00:00
[](const tb_type& lhs, const tb_type& rhs) {
return lhs.second > rhs.second;
});
2016-12-03 07:51:00 +00:00
for (uint32_t i = 0; i < 256; ++i) {
table_[table_builder[i].first] = static_cast<uint8_t>(i);
}
for (uint32_t i = 0; i < 256; ++i) {
table_[table_[i] + 256] = static_cast<uint8_t>(i);
}
}
2017-11-11 12:01:10 +00:00
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
id_type node_id) {
2017-11-12 11:49:13 +00:00
if (keys_[begin].length() == depth) {
2017-03-29 06:01:06 +00:00
term_flags_.set_bit(node_id, true);
2016-12-04 12:19:51 +00:00
if (++begin == end) { // without link?
2017-03-29 06:01:06 +00:00
bc_[node_id].base = 0; // with an empty suffix
leaf_flags_.set_bit(node_id, true);
2016-12-03 07:51:00 +00:00
return;
}
2016-12-04 12:19:51 +00:00
} else if (begin + 1 == end) { // leaf?
2017-03-29 06:01:06 +00:00
term_flags_.set_bit(node_id, true);
leaf_flags_.set_bit(node_id, true);
auto& key = keys_[begin];
2017-11-12 11:49:13 +00:00
suffixes_.push_back(
{{key.data() + depth, key.length() - depth}, node_id}
);
2016-12-03 07:51:00 +00:00
return;
}
2016-12-04 12:19:51 +00:00
{ // fetching edges
2016-12-03 07:51:00 +00:00
edges_.clear();
2017-11-12 11:49:13 +00:00
auto label = static_cast<uint8_t>(keys_[begin][depth]);
2016-12-03 07:51:00 +00:00
for (auto str_id = begin + 1; str_id < end; ++str_id) {
2017-11-12 11:49:13 +00:00
const auto _label = static_cast<uint8_t>(keys_[str_id][depth]);
2016-12-03 07:51:00 +00:00
if (label != _label) {
2017-03-29 06:01:06 +00:00
if (_label < label) {
2017-11-11 12:01:10 +00:00
throw TrieBuilder::Exception(
"The input data is not in lexicographical order."
);
2017-03-29 06:01:06 +00:00
}
2016-12-03 07:51:00 +00:00
edges_.push_back(label);
label = _label;
}
}
edges_.push_back(label);
}
2017-03-29 06:01:06 +00:00
const auto base = find_base_(node_id >> width_L1_);
2016-12-03 07:51:00 +00:00
if (bc_.size() <= base) {
expand_();
}
2016-12-04 12:19:51 +00:00
// defining new edges
2016-12-03 07:51:00 +00:00
bc_[node_id].base = base;
for (const auto label : edges_) {
const auto child_id = base ^ table_[label];
use_(child_id);
bc_[child_id].check = node_id;
}
2016-12-04 12:19:51 +00:00
// following the children
2016-12-03 07:51:00 +00:00
auto _begin = begin;
2017-11-12 11:49:13 +00:00
auto label = static_cast<uint8_t>(keys_[begin][depth]);
2016-12-03 07:51:00 +00:00
for (auto _end = begin + 1; _end < end; ++_end) {
2017-11-12 11:49:13 +00:00
const auto _label = static_cast<uint8_t>(keys_[_end][depth]);
2016-12-03 07:51:00 +00:00
if (label != _label) {
build_bc_(_begin, _end, depth + 1, base ^ table_[label]);
label = _label;
_begin = _end;
}
}
build_bc_(_begin, end, depth + 1, base ^ table_[label]);
}
2017-03-29 06:01:06 +00:00
// The algorithm is inspired by marisa-trie
2016-12-03 07:51:00 +00:00
void TrieBuilder::build_tail_() {
2017-07-12 06:48:49 +00:00
std::sort(std::begin(suffixes_), std::end(suffixes_),
[](const Suffix& lhs, const Suffix& rhs) {
2017-11-12 11:49:13 +00:00
return std::lexicographical_compare(
std::rbegin(lhs), std::rend(lhs),
std::rbegin(rhs), std::rend(rhs));
2017-07-12 06:48:49 +00:00
});
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
// For empty suffixes
tail_.emplace_back('\0');
2017-11-12 11:49:13 +00:00
if (bin_mode_) {
2017-03-29 06:01:06 +00:00
boundary_flags_.push_back(false);
}
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
const Suffix dummy = {{nullptr, 0}, 0};
2017-11-12 11:49:13 +00:00
const Suffix* prev_suf = &dummy;
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
for (size_t i = suffixes_.size(); i > 0; --i) {
2017-11-12 11:49:13 +00:00
const auto& cur_suf = suffixes_[i - 1];
if (cur_suf.length() == 0) {
2017-03-29 06:01:06 +00:00
throw TrieBuilder::Exception("A suffix is empty.");
}
2016-12-03 07:51:00 +00:00
2017-03-29 06:01:06 +00:00
size_t match = 0;
2017-11-12 11:49:13 +00:00
while ((match < cur_suf.length()) && (match < prev_suf->length())
&& ((*prev_suf)[match] == cur_suf[match])) {
2017-03-29 06:01:06 +00:00
++match;
2016-12-03 07:51:00 +00:00
}
2017-11-12 11:49:13 +00:00
if ((match == cur_suf.length()) && (prev_suf->length() != 0)) { // sharing
bc_[cur_suf.node_id].base = static_cast<id_type>(
bc_[prev_suf->node_id].base + (prev_suf->length() - match)
);
2017-03-29 06:01:06 +00:00
} else { // append
2017-11-12 11:49:13 +00:00
bc_[cur_suf.node_id].base = static_cast<id_type>(tail_.size());
std::copy(std::begin(cur_suf.str), std::end(cur_suf.str),
std::back_inserter(tail_));
if (bin_mode_) {
for (size_t j = 1; j < cur_suf.length(); ++j) {
2017-03-29 06:01:06 +00:00
boundary_flags_.push_back(false);
}
boundary_flags_.push_back(true);
} else {
tail_.emplace_back('\0');
}
2017-11-12 11:49:13 +00:00
if (ID_MAX < tail_.size()) {
2017-03-29 06:01:06 +00:00
throw TrieBuilder::Exception("TAIL address range error.");
}
}
2017-11-12 11:49:13 +00:00
prev_suf = &cur_suf;
2016-12-03 07:51:00 +00:00
}
}
void TrieBuilder::expand_() {
2017-11-12 11:49:13 +00:00
if (ID_MAX < bc_.size() + 256) {
2017-03-29 06:01:06 +00:00
throw TrieBuilder::Exception("Node ID range error.");
2016-12-03 07:51:00 +00:00
}
2017-03-29 06:01:06 +00:00
const auto old_size = static_cast<id_type>(bc_.size());
2016-12-03 07:51:00 +00:00
const auto new_size = old_size + 256;
for (auto i = old_size; i < new_size; ++i) {
2017-03-29 06:01:06 +00:00
bc_.push_back({i + 1, i - 1});
leaf_flags_.push_back(false);
term_flags_.push_back(false);
used_flags_.push_back(false);
2016-12-03 07:51:00 +00:00
}
{
2017-11-12 11:49:13 +00:00
const auto last = bc_[TABOO_ID].check;
2016-12-03 07:51:00 +00:00
bc_[old_size].check = last;
bc_[last].base = old_size;
2017-11-12 11:49:13 +00:00
bc_[new_size - 1].base = TABOO_ID;
bc_[TABOO_ID].check = new_size - 1;
2016-12-03 07:51:00 +00:00
}
for (auto i = old_size; i < new_size; i += block_size_) {
2017-07-12 06:48:49 +00:00
heads_.push_back(i);
2016-12-03 07:51:00 +00:00
}
const auto block_id = old_size / 256;
2017-11-12 11:49:13 +00:00
if (FREE_BLOCKS <= block_id) {
close_block_(block_id - FREE_BLOCKS);
2016-12-03 07:51:00 +00:00
}
}
2017-03-29 06:01:06 +00:00
/**
 * Marks slot `node_id` as used and unlinks it from the free list.
 *
 * For a free slot, `base` holds the next free slot and `check` the previous
 * one. If the slot was the head of its block's free list, the head moves to
 * the successor when that lies in the same block and to TABOO_ID otherwise.
 *
 * @param node_id  Slot to take out of the free list.
 */
void TrieBuilder::use_(id_type node_id) {
  used_flags_[node_id] = true;

  // Bypass the slot in both directions of the doubly-linked list.
  const auto successor = bc_[node_id].base;
  const auto predecessor = bc_[node_id].check;
  bc_[predecessor].base = successor;
  bc_[successor].check = predecessor;

  const auto block_id = node_id >> width_L1_;
  if (heads_[block_id] != node_id) {
    return;
  }

  if ((successor >> width_L1_) == block_id) {
    heads_[block_id] = successor;
  } else {
    // No further free slot is known inside this block.
    heads_[block_id] = TABOO_ID;
  }
}
2017-03-29 06:01:06 +00:00
/**
 * Removes every still-free slot of a 256-slot group from the free list.
 *
 * Each such slot is unlinked through use_(), gets `base` and `check` set to
 * its own index, and has its used flag cleared again. All `heads_` entries
 * covering the group are then set to TABOO_ID.
 *
 * @param block_id  Index of the 256-slot group (slots block_id * 256 up to
 *                  block_id * 256 + 255).
 */
void TrieBuilder::close_block_(id_type block_id) {
  const auto first = block_id * 256;
  const auto last = first + 256;

  for (auto slot = first; slot < last; ++slot) {
    if (used_flags_[slot]) {
      continue;
    }
    use_(slot);  // unlink from the free list
    bc_[slot].base = slot;
    bc_[slot].check = slot;
    used_flags_[slot] = false;  // use_() set the flag; the slot holds no node
  }

  for (auto slot = first; slot < last; slot += block_size_) {
    heads_[slot >> width_L1_] = TABOO_ID;
  }
}
2017-03-29 06:01:06 +00:00
/**
 * Finds a BASE value under which every label in `edges_` maps to an unused
 * slot (child slot = base XOR table_[label]).
 *
 * Free slots of block `block_id` are tried first, then the whole free list.
 * If nothing fits, a base placing the first edge at index bc_.size() is
 * returned; the caller checks for that and expands the array.
 *
 * @param block_id  Block whose free slots are tried first.
 * @return A base for which is_target_() holds, or the fallback base beyond
 *         the current array.
 */
id_type TrieBuilder::find_base_(id_type block_id) const {
  const auto first_code = table_[edges_[0]];

  if (bc_[TABOO_ID].base == TABOO_ID) {  // Full?
    // The sentinel links to itself: there is no free slot at all.
    return static_cast<id_type>(bc_.size()) ^ first_code;
  }

  // search in the same block
  for (auto slot = heads_[block_id];
       slot != TABOO_ID && (slot >> width_L1_) == block_id;
       slot = bc_[slot].base) {
    const auto candidate = slot ^ first_code;
    if (is_target_(candidate)) {
      return candidate;  // base / block_size_ == block_id
    }
  }

  // Fall back to walking the complete free list from the sentinel.
  for (auto slot = bc_[TABOO_ID].base; slot != TABOO_ID;
       slot = bc_[slot].base) {
    const auto candidate = slot ^ first_code;
    if (is_target_(candidate)) {
      return candidate;  // base / block_size_ != block_id
    }
  }

  return static_cast<id_type>(bc_.size()) ^ first_code;
}
2017-03-29 06:01:06 +00:00
/**
 * Tells whether `base` can host all current edges.
 *
 * @param base  Candidate BASE value.
 * @return true if, for every label in `edges_`, the slot
 *         base XOR table_[label] is not flagged as used.
 */
bool TrieBuilder::is_target_(id_type base) const {
  for (const auto label : edges_) {
    const auto child_slot = base ^ table_[label];
    if (used_flags_[child_slot]) {
      return false;  // this child would collide with an occupied slot
    }
  }
  return true;
}
} //namespace - xcdat