add 15/16-bit version
This commit is contained in:
parent
da5097baee
commit
2955cab72c
|
@ -1,5 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "xcdat/bc_vector_15.hpp"
|
||||
#include "xcdat/bc_vector_16.hpp"
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
#include "xcdat/load_visitor.hpp"
|
||||
|
@ -11,7 +13,10 @@
|
|||
namespace xcdat {
|
||||
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
using trie_16_type = trie<bc_vector_16>;
|
||||
|
||||
using trie_7_type = trie<bc_vector_7>;
|
||||
using trie_15_type = trie<bc_vector_15>;
|
||||
|
||||
//! Set the contiguous memory block to a new trie instance (for a memory-mapped file).
|
||||
template <class Trie>
|
||||
|
|
173
include/xcdat/bc_vector_15.hpp
Normal file
173
include/xcdat/bc_vector_15.hpp
Normal file
|
@ -0,0 +1,173 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "compact_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class bc_vector_15 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = 15;
|
||||
static constexpr std::uint32_t max_levels = 3;
|
||||
|
||||
static constexpr std::uint64_t block_size_l1 = 1ULL << 15;
|
||||
static constexpr std::uint64_t block_size_l2 = 1ULL << 31;
|
||||
|
||||
private:
|
||||
std::uint64_t m_num_frees = 0;
|
||||
immutable_vector<std::uint16_t> m_ints_l1;
|
||||
immutable_vector<std::uint32_t> m_ints_l2;
|
||||
immutable_vector<std::uint64_t> m_ints_l3;
|
||||
std::array<immutable_vector<std::uint64_t>, max_levels - 1> m_ranks;
|
||||
compact_vector m_links;
|
||||
bit_vector m_leaves;
|
||||
|
||||
public:
|
||||
bc_vector_15() = default;
|
||||
virtual ~bc_vector_15() = default;
|
||||
|
||||
bc_vector_15(const bc_vector_15&) = delete;
|
||||
bc_vector_15& operator=(const bc_vector_15&) = delete;
|
||||
|
||||
bc_vector_15(bc_vector_15&&) noexcept = default;
|
||||
bc_vector_15& operator=(bc_vector_15&&) noexcept = default;
|
||||
|
||||
template <class BcUnits>
|
||||
explicit bc_vector_15(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
std::vector<std::uint16_t> ints_l1;
|
||||
std::vector<std::uint32_t> ints_l2;
|
||||
std::vector<std::uint64_t> ints_l3;
|
||||
std::array<std::vector<std::uint64_t>, max_levels - 1> ranks;
|
||||
std::vector<std::uint64_t> links;
|
||||
|
||||
ints_l1.reserve(bc_units.size() * 2);
|
||||
ranks[0].reserve((bc_units.size() * 2) >> l1_bits);
|
||||
links.reserve(bc_units.size());
|
||||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
if ((x / block_size_l1) == 0) {
|
||||
ints_l1.push_back(static_cast<std::uint16_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l2.size() - ranks[0].back();
|
||||
ints_l1.push_back(static_cast<std::uint16_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
if ((ints_l2.size() % block_size_l2) == 0) {
|
||||
ranks[1].push_back(static_cast<std::uint64_t>(ints_l3.size()));
|
||||
}
|
||||
if ((x / block_size_l2) == 0) {
|
||||
ints_l2.push_back(static_cast<std::uint32_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l3.size() - ranks[1].back();
|
||||
ints_l2.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
ints_l3.push_back(x);
|
||||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
ints_l1.push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
|
||||
links.push_back(x >> 16);
|
||||
};
|
||||
|
||||
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
|
||||
if (leaves[i]) {
|
||||
append_leaf(bc_units[i].base);
|
||||
} else {
|
||||
append_unit(bc_units[i].base ^ i);
|
||||
}
|
||||
append_unit(bc_units[i].check ^ i);
|
||||
if (bc_units[i].check == i) {
|
||||
m_num_frees += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// release
|
||||
m_ints_l1.build(ints_l1);
|
||||
m_ints_l2.build(ints_l2);
|
||||
m_ints_l3.build(ints_l3);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
|
||||
m_ranks[j].build(ranks[j]);
|
||||
}
|
||||
m_links = compact_vector(links);
|
||||
m_leaves = bit_vector(leaves, true, false);
|
||||
}
|
||||
|
||||
inline std::uint64_t base(std::uint64_t i) const {
|
||||
return access(i * 2) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t check(std::uint64_t i) const {
|
||||
return access(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t link(std::uint64_t i) const {
|
||||
return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 16);
|
||||
}
|
||||
|
||||
inline bool is_leaf(std::uint64_t i) const {
|
||||
return m_leaves[i];
|
||||
}
|
||||
|
||||
inline bool is_used(std::uint64_t i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_ints_l1.size() / 2;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_free_units() const {
|
||||
return m_num_frees;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_nodes() const {
|
||||
return num_units() - num_free_units();
|
||||
}
|
||||
|
||||
inline std::uint64_t num_leaves() const {
|
||||
return m_leaves.num_ones();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_frees);
|
||||
visitor.visit(m_ints_l1);
|
||||
visitor.visit(m_ints_l2);
|
||||
visitor.visit(m_ints_l3);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); j++) {
|
||||
visitor.visit(m_ranks[j]);
|
||||
}
|
||||
visitor.visit(m_links);
|
||||
visitor.visit(m_leaves);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t access(std::uint64_t i) const {
|
||||
std::uint64_t x = m_ints_l1[i] >> 1;
|
||||
if ((m_ints_l1[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[0][i / block_size_l1] + x;
|
||||
|
||||
x = m_ints_l2[i] >> 1;
|
||||
if ((m_ints_l2[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[1][i / block_size_l2] + x;
|
||||
|
||||
return m_ints_l3[i];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
150
include/xcdat/bc_vector_16.hpp
Normal file
150
include/xcdat/bc_vector_16.hpp
Normal file
|
@ -0,0 +1,150 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "compact_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class bc_vector_16 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = sizeof(std::uint16_t) * 8;
|
||||
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t) / sizeof(std::uint16_t);
|
||||
|
||||
private:
|
||||
std::uint32_t m_num_levels = 0;
|
||||
std::uint64_t m_num_frees = 0;
|
||||
std::array<immutable_vector<std::uint16_t>, max_levels> m_shorts;
|
||||
std::array<bit_vector, max_levels - 1> m_nexts;
|
||||
compact_vector m_links;
|
||||
bit_vector m_leaves;
|
||||
|
||||
public:
|
||||
bc_vector_16() = default;
|
||||
virtual ~bc_vector_16() = default;
|
||||
|
||||
bc_vector_16(const bc_vector_16&) = delete;
|
||||
bc_vector_16& operator=(const bc_vector_16&) = delete;
|
||||
|
||||
bc_vector_16(bc_vector_16&&) noexcept = default;
|
||||
bc_vector_16& operator=(bc_vector_16&&) noexcept = default;
|
||||
|
||||
template <class BcUnits>
|
||||
explicit bc_vector_16(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
std::array<std::vector<std::uint16_t>, max_levels> shorts;
|
||||
std::array<bit_vector::builder, max_levels> next_flags; // The last will not be released
|
||||
std::vector<std::uint64_t> links;
|
||||
|
||||
shorts[0].reserve(bc_units.size() * 2);
|
||||
next_flags[0].reserve(bc_units.size() * 2);
|
||||
links.reserve(bc_units.size());
|
||||
|
||||
m_num_levels = 0;
|
||||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
std::uint32_t j = 0;
|
||||
shorts[j].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 16;
|
||||
while (x) {
|
||||
++j;
|
||||
shorts[j].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 16;
|
||||
}
|
||||
next_flags[j].set_bit(next_flags[j].size() - 1, false);
|
||||
m_num_levels = std::max(m_num_levels, j);
|
||||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
shorts[0].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
|
||||
next_flags[0].push_back(false);
|
||||
links.push_back(x >> 16);
|
||||
};
|
||||
|
||||
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
|
||||
if (leaves[i]) {
|
||||
append_leaf(bc_units[i].base);
|
||||
} else {
|
||||
append_unit(bc_units[i].base ^ i);
|
||||
}
|
||||
append_unit(bc_units[i].check ^ i);
|
||||
if (bc_units[i].check == i) {
|
||||
m_num_frees += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// release
|
||||
for (std::uint32_t i = 0; i < m_num_levels; ++i) {
|
||||
m_shorts[i].build(shorts[i]);
|
||||
m_nexts[i] = bit_vector(next_flags[i], true, false);
|
||||
}
|
||||
m_shorts[m_num_levels].build(shorts[m_num_levels]);
|
||||
m_links = compact_vector(links);
|
||||
m_leaves = bit_vector(leaves, true, false);
|
||||
}
|
||||
|
||||
inline std::uint64_t base(std::uint64_t i) const {
|
||||
return access(i * 2) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t check(std::uint64_t i) const {
|
||||
return access(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t link(std::uint64_t i) const {
|
||||
return m_shorts[0][i * 2] | (m_links[m_leaves.rank(i)] << 16);
|
||||
}
|
||||
|
||||
inline bool is_leaf(std::uint64_t i) const {
|
||||
return m_leaves[i];
|
||||
}
|
||||
|
||||
inline bool is_used(std::uint64_t i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_shorts[0].size() / 2;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_free_units() const {
|
||||
return m_num_frees;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_nodes() const {
|
||||
return num_units() - num_free_units();
|
||||
}
|
||||
|
||||
inline std::uint64_t num_leaves() const {
|
||||
return m_leaves.num_ones();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_levels);
|
||||
visitor.visit(m_num_frees);
|
||||
for (std::uint32_t j = 0; j < m_shorts.size(); j++) {
|
||||
visitor.visit(m_shorts[j]);
|
||||
}
|
||||
for (std::uint32_t j = 0; j < m_nexts.size(); j++) {
|
||||
visitor.visit(m_nexts[j]);
|
||||
}
|
||||
visitor.visit(m_links);
|
||||
visitor.visit(m_leaves);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t access(std::uint64_t i) const {
|
||||
std::uint32_t j = 0;
|
||||
std::uint64_t x = m_shorts[j][i];
|
||||
while (j < m_num_levels and m_nexts[j][i]) {
|
||||
i = m_nexts[j++].rank(i);
|
||||
x |= static_cast<std::uint64_t>(m_shorts[j][i]) << (j * 16);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
|
@ -82,6 +82,7 @@ class bc_vector_7 {
|
|||
const auto i = ints_l4.size() - ranks[2].back();
|
||||
ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
ints_l4.push_back(x);
|
||||
};
|
||||
|
||||
|
@ -89,7 +90,7 @@ class bc_vector_7 {
|
|||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFFU));
|
||||
links.push_back(x >> 8);
|
||||
};
|
||||
|
||||
|
|
|
@ -9,8 +9,8 @@ namespace xcdat {
|
|||
|
||||
class bc_vector_8 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = 8;
|
||||
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t);
|
||||
static constexpr std::uint32_t l1_bits = sizeof(std::uint8_t) * 8;
|
||||
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t) / sizeof(std::uint8_t);
|
||||
|
||||
private:
|
||||
std::uint32_t m_num_levels = 0;
|
||||
|
@ -44,12 +44,12 @@ class bc_vector_8 {
|
|||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
std::uint32_t j = 0;
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFFU));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 8;
|
||||
while (x) {
|
||||
++j;
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFFU));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 8;
|
||||
}
|
||||
|
@ -58,7 +58,7 @@ class bc_vector_8 {
|
|||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFFU));
|
||||
next_flags[0].push_back(false);
|
||||
links.push_back(x >> 8);
|
||||
};
|
||||
|
|
|
@ -7,7 +7,7 @@ add_test(test_compact_vector test_compact_vector)
|
|||
add_executable(test_tail_vector test_tail_vector.cpp)
|
||||
add_test(test_tail_vector test_tail_vector)
|
||||
|
||||
set(BC_OPTIONS "7" "8")
|
||||
set(BC_OPTIONS "7" "8" "15" "16")
|
||||
|
||||
foreach(BC_OPTION ${BC_OPTIONS})
|
||||
set(TEST_SRC_NAME test_bc_vector_${BC_OPTION})
|
||||
|
|
|
@ -5,6 +5,8 @@
|
|||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/bc_vector_15.hpp"
|
||||
#include "xcdat/bc_vector_16.hpp"
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
|
||||
|
@ -12,6 +14,10 @@
|
|||
using bc_vector_type = xcdat::bc_vector_7;
|
||||
#elif BC_VECTOR_8
|
||||
using bc_vector_type = xcdat::bc_vector_8;
|
||||
#elif BC_VECTOR_15
|
||||
using bc_vector_type = xcdat::bc_vector_15;
|
||||
#elif BC_VECTOR_16
|
||||
using bc_vector_type = xcdat::bc_vector_16;
|
||||
#endif
|
||||
|
||||
struct bc_unit {
|
||||
|
|
|
@ -14,6 +14,10 @@
|
|||
using trie_type = xcdat::trie_7_type;
|
||||
#elif TRIE_8
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
#elif TRIE_15
|
||||
using trie_type = xcdat::trie_15_type;
|
||||
#elif TRIE_16
|
||||
using trie_type = xcdat::trie_16_type;
|
||||
#endif
|
||||
|
||||
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
|
|
|
@ -144,5 +144,11 @@ int main(int argc, char** argv) {
|
|||
tfm::printfln("** xcdat::trie_8_type **");
|
||||
benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
|
||||
|
||||
tfm::printfln("** xcdat::trie_15_type **");
|
||||
benchmark<xcdat::trie_15_type>(keys, query_keys, binary_mode);
|
||||
|
||||
tfm::printfln("** xcdat::trie_16_type **");
|
||||
benchmark<xcdat::trie_16_type>(keys, query_keys, binary_mode);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -7,7 +7,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
|||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_keys", "Input filepath of keywords");
|
||||
p.add("output_dic", "Output filepath of trie dictionary");
|
||||
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
|
||||
p.add("trie_type", "Trie type: [7|8|15|16] (default=8)", "-t", false);
|
||||
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
|
||||
return p;
|
||||
}
|
||||
|
@ -51,13 +51,17 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto trie_type = p.get<int>("trie_type", 7);
|
||||
const auto trie_type = p.get<int>("trie_type", 8);
|
||||
|
||||
switch (trie_type) {
|
||||
case 7:
|
||||
return build<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return build<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return build<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return build<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue