add 15/16-bit version

This commit is contained in:
Shunsuke Kanda 2021-07-03 08:12:35 +09:00
parent da5097baee
commit 2955cab72c
10 changed files with 358 additions and 9 deletions

View file

@ -1,5 +1,7 @@
#pragma once
#include "xcdat/bc_vector_15.hpp"
#include "xcdat/bc_vector_16.hpp"
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
#include "xcdat/load_visitor.hpp"
@ -11,7 +13,10 @@
namespace xcdat {
using trie_8_type = trie<bc_vector_8>;
using trie_16_type = trie<bc_vector_16>;
using trie_7_type = trie<bc_vector_7>;
using trie_15_type = trie<bc_vector_15>;
//! Set the continuous memory block to a new trie instance (for a memory-mapped file).
template <class Trie>

View file

@ -0,0 +1,173 @@
#pragma once
#include <array>
#include "bit_vector.hpp"
#include "compact_vector.hpp"
namespace xcdat {
//! Compressed BASE/CHECK storage using up to three levels of fixed-width words.
//!
//! Every unit i occupies two consecutive slots in the level-1 array: slot 2*i for
//! BASE and slot 2*i+1 for CHECK. A non-leaf slot keeps the value XORed with i.
//!  - Level 1 holds 16-bit words and level 2 holds 32-bit words. Bit 0 of such a
//!    word is a flag: when it is 0 the remaining bits are the value itself; when it
//!    is 1 the remaining bits are an offset into the next level, relative to the
//!    start position recorded in m_ranks for the block that contains the word.
//!  - Level 3 holds the plain 64-bit value.
//! The BASE slot of a leaf unit is stored differently: its low 16 bits go to
//! level 1 as they are and the remaining high bits go to m_links.
class bc_vector_15 {
  public:
    static constexpr std::uint32_t l1_bits = 15;
    static constexpr std::uint32_t max_levels = 3;

    //! Number of level-1 (resp. level-2) words that share one entry of m_ranks.
    static constexpr std::uint64_t block_size_l1 = 1ULL << 15;
    static constexpr std::uint64_t block_size_l2 = 1ULL << 31;

  private:
    std::uint64_t m_num_frees = 0;  // number of units whose check equals their own index
    immutable_vector<std::uint16_t> m_ints_l1;
    immutable_vector<std::uint32_t> m_ints_l2;
    immutable_vector<std::uint64_t> m_ints_l3;
    // m_ranks[k][b]: size of the level-(k+2) array at the moment block b of level k+1 was opened.
    std::array<immutable_vector<std::uint64_t>, max_levels - 1> m_ranks;
    compact_vector m_links;  // high bits (above the low 16) of each leaf's base
    bit_vector m_leaves;  // leaf flags, one per unit

  public:
    bc_vector_15() = default;
    virtual ~bc_vector_15() = default;

    bc_vector_15(const bc_vector_15&) = delete;
    bc_vector_15& operator=(const bc_vector_15&) = delete;

    bc_vector_15(bc_vector_15&&) noexcept = default;
    bc_vector_15& operator=(bc_vector_15&&) noexcept = default;

    //! Encode the given units.
    //!  @param bc_units Indexable sequence whose elements expose `base` and `check`.
    //!  @param leaves   Builder of leaf flags; leaves[i] tells whether unit i is a leaf.
    template <class BcUnits>
    explicit bc_vector_15(const BcUnits& bc_units, bit_vector::builder&& leaves) {
        const auto num_input_units = bc_units.size();

        std::vector<std::uint16_t> l1_words;
        std::vector<std::uint32_t> l2_words;
        std::vector<std::uint64_t> l3_words;
        std::array<std::vector<std::uint64_t>, max_levels - 1> block_starts;
        std::vector<std::uint64_t> leaf_links;

        l1_words.reserve(num_input_units * 2);
        block_starts[0].reserve((num_input_units * 2) >> l1_bits);
        leaf_links.reserve(num_input_units);

        // Record where level 2 currently ends whenever a new level-1 block begins.
        const auto open_l1_block_if_needed = [&]() {
            if (l1_words.size() % block_size_l1 == 0) {
                block_starts[0].push_back(static_cast<std::uint64_t>(l2_words.size()));
            }
        };

        // Append one XOR-ed value, spilling to deeper levels only when it does not fit.
        const auto append_unit = [&](std::uint64_t x) {
            open_l1_block_if_needed();

            if (x < block_size_l1) {
                // Fits in 15 bits: flag bit 0 stays clear.
                l1_words.push_back(static_cast<std::uint16_t>(x << 1));
                return;
            }
            const auto l2_offset = l2_words.size() - block_starts[0].back();
            l1_words.push_back(static_cast<std::uint16_t>((l2_offset << 1) | 1));

            if (l2_words.size() % block_size_l2 == 0) {
                block_starts[1].push_back(static_cast<std::uint64_t>(l3_words.size()));
            }

            if (x < block_size_l2) {
                // Fits in 31 bits: flag bit 0 stays clear.
                l2_words.push_back(static_cast<std::uint32_t>(x << 1));
                return;
            }
            const auto l3_offset = l3_words.size() - block_starts[1].back();
            l2_words.push_back(static_cast<std::uint32_t>((l3_offset << 1) | 1));

            l3_words.push_back(x);
        };

        // Append a leaf's base: low 16 bits in level 1 (no flag bit), the rest in the link table.
        const auto append_leaf = [&](std::uint64_t x) {
            open_l1_block_if_needed();
            l1_words.push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
            leaf_links.push_back(x >> 16);
        };

        for (std::uint64_t i = 0; i < num_input_units; ++i) {
            const auto& unit = bc_units[i];

            if (leaves[i]) {
                append_leaf(unit.base);
            } else {
                append_unit(unit.base ^ i);
            }
            append_unit(unit.check ^ i);

            if (unit.check == i) {
                ++m_num_frees;
            }
        }

        // Hand the temporary buffers over to the member containers.
        m_ints_l1.build(l1_words);
        m_ints_l2.build(l2_words);
        m_ints_l3.build(l3_words);
        for (std::uint32_t level = 0; level < m_ranks.size(); ++level) {
            m_ranks[level].build(block_starts[level]);
        }
        m_links = compact_vector(leaf_links);
        m_leaves = bit_vector(leaves, true, false);
    }

    //! BASE value of unit i (decoded from slot 2*i and XORed back with i).
    inline std::uint64_t base(std::uint64_t i) const {
        const std::uint64_t stored = access(i * 2);
        return stored ^ i;
    }

    //! CHECK value of unit i (decoded from slot 2*i+1 and XORed back with i).
    inline std::uint64_t check(std::uint64_t i) const {
        const std::uint64_t stored = access(i * 2 + 1);
        return stored ^ i;
    }

    //! Value stored for leaf unit i: level-1 word as the low part, link entry shifted by 16 as the high part.
    inline std::uint64_t link(std::uint64_t i) const {
        const auto high_part = m_links[m_leaves.rank(i)] << 16;
        return m_ints_l1[i * 2] | high_part;
    }

    //! Whether unit i was flagged as a leaf at construction.
    inline bool is_leaf(std::uint64_t i) const {
        return m_leaves[i];
    }

    //! A unit counts as used when its CHECK value differs from its own index.
    inline bool is_used(std::uint64_t i) const {
        const std::uint64_t check_value = check(i);
        return check_value != i;
    }

    //! Number of units, i.e. half the number of level-1 words.
    inline std::uint64_t num_units() const {
        return m_ints_l1.size() / 2;
    }

    //! Number of units counted as free during construction.
    inline std::uint64_t num_free_units() const {
        return m_num_frees;
    }

    //! Number of units that are not free.
    inline std::uint64_t num_nodes() const {
        const std::uint64_t total = num_units();
        const std::uint64_t unused = num_free_units();
        return total - unused;
    }

    //! Number of set bits in the leaf flags.
    inline std::uint64_t num_leaves() const {
        return m_leaves.num_ones();
    }

    //! Pass every member to the visitor, in declaration order.
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_num_frees);
        visitor.visit(m_ints_l1);
        visitor.visit(m_ints_l2);
        visitor.visit(m_ints_l3);
        for (auto& rank_table : m_ranks) {
            visitor.visit(rank_table);
        }
        visitor.visit(m_links);
        visitor.visit(m_leaves);
    }

  private:
    //! Decode the value stored at level-1 slot i, following flagged words down to deeper levels.
    inline std::uint64_t access(std::uint64_t i) const {
        const auto word_l1 = m_ints_l1[i];
        const std::uint64_t payload_l1 = word_l1 >> 1;
        if ((word_l1 & 1U) == 0) {
            return payload_l1;
        }

        // payload_l1 is an offset from the start recorded for this level-1 block.
        const std::uint64_t pos_l2 = m_ranks[0][i / block_size_l1] + payload_l1;
        const auto word_l2 = m_ints_l2[pos_l2];
        const std::uint64_t payload_l2 = word_l2 >> 1;
        if ((word_l2 & 1U) == 0) {
            return payload_l2;
        }

        // payload_l2 is an offset from the start recorded for this level-2 block.
        const std::uint64_t pos_l3 = m_ranks[1][pos_l2 / block_size_l2] + payload_l2;
        return m_ints_l3[pos_l3];
    }
};
} // namespace xcdat

View file

@ -0,0 +1,150 @@
#pragma once
#include <array>
#include "bit_vector.hpp"
#include "compact_vector.hpp"
namespace xcdat {
class bc_vector_16 {
public:
static constexpr std::uint32_t l1_bits = sizeof(std::uint16_t) * 8;
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t) / sizeof(std::uint16_t);
private:
std::uint32_t m_num_levels = 0;
std::uint64_t m_num_frees = 0;
std::array<immutable_vector<std::uint16_t>, max_levels> m_shorts;
std::array<bit_vector, max_levels - 1> m_nexts;
compact_vector m_links;
bit_vector m_leaves;
public:
bc_vector_16() = default;
virtual ~bc_vector_16() = default;
bc_vector_16(const bc_vector_16&) = delete;
bc_vector_16& operator=(const bc_vector_16&) = delete;
bc_vector_16(bc_vector_16&&) noexcept = default;
bc_vector_16& operator=(bc_vector_16&&) noexcept = default;
template <class BcUnits>
explicit bc_vector_16(const BcUnits& bc_units, bit_vector::builder&& leaves) {
std::array<std::vector<std::uint16_t>, max_levels> shorts;
std::array<bit_vector::builder, max_levels> next_flags; // The last will not be released
std::vector<std::uint64_t> links;
shorts[0].reserve(bc_units.size() * 2);
next_flags[0].reserve(bc_units.size() * 2);
links.reserve(bc_units.size());
m_num_levels = 0;
auto append_unit = [&](std::uint64_t x) {
std::uint32_t j = 0;
shorts[j].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
next_flags[j].push_back(true);
x >>= 16;
while (x) {
++j;
shorts[j].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
next_flags[j].push_back(true);
x >>= 16;
}
next_flags[j].set_bit(next_flags[j].size() - 1, false);
m_num_levels = std::max(m_num_levels, j);
};
auto append_leaf = [&](std::uint64_t x) {
shorts[0].push_back(static_cast<std::uint16_t>(x & 0xFFFFU));
next_flags[0].push_back(false);
links.push_back(x >> 16);
};
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
if (leaves[i]) {
append_leaf(bc_units[i].base);
} else {
append_unit(bc_units[i].base ^ i);
}
append_unit(bc_units[i].check ^ i);
if (bc_units[i].check == i) {
m_num_frees += 1;
}
}
// release
for (std::uint32_t i = 0; i < m_num_levels; ++i) {
m_shorts[i].build(shorts[i]);
m_nexts[i] = bit_vector(next_flags[i], true, false);
}
m_shorts[m_num_levels].build(shorts[m_num_levels]);
m_links = compact_vector(links);
m_leaves = bit_vector(leaves, true, false);
}
inline std::uint64_t base(std::uint64_t i) const {
return access(i * 2) ^ i;
}
inline std::uint64_t check(std::uint64_t i) const {
return access(i * 2 + 1) ^ i;
}
inline std::uint64_t link(std::uint64_t i) const {
return m_shorts[0][i * 2] | (m_links[m_leaves.rank(i)] << 16);
}
inline bool is_leaf(std::uint64_t i) const {
return m_leaves[i];
}
inline bool is_used(std::uint64_t i) const {
return check(i) != i;
}
inline std::uint64_t num_units() const {
return m_shorts[0].size() / 2;
}
inline std::uint64_t num_free_units() const {
return m_num_frees;
}
inline std::uint64_t num_nodes() const {
return num_units() - num_free_units();
}
inline std::uint64_t num_leaves() const {
return m_leaves.num_ones();
}
template <class Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_num_levels);
visitor.visit(m_num_frees);
for (std::uint32_t j = 0; j < m_shorts.size(); j++) {
visitor.visit(m_shorts[j]);
}
for (std::uint32_t j = 0; j < m_nexts.size(); j++) {
visitor.visit(m_nexts[j]);
}
visitor.visit(m_links);
visitor.visit(m_leaves);
}
private:
inline std::uint64_t access(std::uint64_t i) const {
std::uint32_t j = 0;
std::uint64_t x = m_shorts[j][i];
while (j < m_num_levels and m_nexts[j][i]) {
i = m_nexts[j++].rank(i);
x |= static_cast<std::uint64_t>(m_shorts[j][i]) << (j * 16);
}
return x;
}
};
} // namespace xcdat

View file

@ -82,6 +82,7 @@ class bc_vector_7 {
const auto i = ints_l4.size() - ranks[2].back();
ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
}
ints_l4.push_back(x);
};
@ -89,7 +90,7 @@ class bc_vector_7 {
if ((ints_l1.size() % block_size_l1) == 0) {
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
}
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFFU));
links.push_back(x >> 8);
};

View file

@ -9,8 +9,8 @@ namespace xcdat {
class bc_vector_8 {
public:
static constexpr std::uint32_t l1_bits = 8;
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t);
static constexpr std::uint32_t l1_bits = sizeof(std::uint8_t) * 8;
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t) / sizeof(std::uint8_t);
private:
std::uint32_t m_num_levels = 0;
@ -44,12 +44,12 @@ class bc_vector_8 {
auto append_unit = [&](std::uint64_t x) {
std::uint32_t j = 0;
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFFU));
next_flags[j].push_back(true);
x >>= 8;
while (x) {
++j;
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFFU));
next_flags[j].push_back(true);
x >>= 8;
}
@ -58,7 +58,7 @@ class bc_vector_8 {
};
auto append_leaf = [&](std::uint64_t x) {
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFF));
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFFU));
next_flags[0].push_back(false);
links.push_back(x >> 8);
};

View file

@ -7,7 +7,7 @@ add_test(test_compact_vector test_compact_vector)
add_executable(test_tail_vector test_tail_vector.cpp)
add_test(test_tail_vector test_tail_vector)
set(BC_OPTIONS "7" "8")
set(BC_OPTIONS "7" "8" "15" "16")
foreach(BC_OPTION ${BC_OPTIONS})
set(TEST_SRC_NAME test_bc_vector_${BC_OPTION})

View file

@ -5,6 +5,8 @@
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/bc_vector_15.hpp"
#include "xcdat/bc_vector_16.hpp"
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
@ -12,6 +14,10 @@
using bc_vector_type = xcdat::bc_vector_7;
#elif BC_VECTOR_8
using bc_vector_type = xcdat::bc_vector_8;
#elif BC_VECTOR_15
using bc_vector_type = xcdat::bc_vector_15;
#elif BC_VECTOR_16
using bc_vector_type = xcdat::bc_vector_16;
#endif
struct bc_unit {

View file

@ -14,6 +14,10 @@
using trie_type = xcdat::trie_7_type;
#elif TRIE_8
using trie_type = xcdat::trie_8_type;
#elif TRIE_15
using trie_type = xcdat::trie_15_type;
#elif TRIE_16
using trie_type = xcdat::trie_16_type;
#endif
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,

View file

@ -144,5 +144,11 @@ int main(int argc, char** argv) {
tfm::printfln("** xcdat::trie_8_type **");
benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
tfm::printfln("** xcdat::trie_15_type **");
benchmark<xcdat::trie_15_type>(keys, query_keys, binary_mode);
tfm::printfln("** xcdat::trie_16_type **");
benchmark<xcdat::trie_16_type>(keys, query_keys, binary_mode);
return 0;
}

View file

@ -7,7 +7,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_keys", "Input filepath of keywords");
p.add("output_dic", "Output filepath of trie dictionary");
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
p.add("trie_type", "Trie type: [7|8|15|16] (default=8)", "-t", false);
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p;
}
@ -51,13 +51,17 @@ int main(int argc, char** argv) {
return 1;
}
const auto trie_type = p.get<int>("trie_type", 7);
const auto trie_type = p.get<int>("trie_type", 8);
switch (trie_type) {
case 7:
return build<xcdat::trie_7_type>(p);
case 8:
return build<xcdat::trie_8_type>(p);
case 15:
return build<xcdat::trie_15_type>(p);
case 16:
return build<xcdat::trie_16_type>(p);
default:
break;
}