add 7-bit version
This commit is contained in:
parent
1f1080e605
commit
dd1860e792
|
@ -1,11 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
|
||||
#include "xcdat/io.hpp"
|
||||
#include "xcdat/trie.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
using trie_7_type = trie<bc_vector_7>;
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
|
||||
}
|
||||
} // namespace xcdat
|
||||
|
|
190
include/xcdat/bc_vector_7.hpp
Normal file
190
include/xcdat/bc_vector_7.hpp
Normal file
|
@ -0,0 +1,190 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "compact_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class bc_vector_7 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = 7;
|
||||
static constexpr std::uint32_t max_levels = 4;
|
||||
static constexpr std::uint64_t block_size_l1 = 1ULL << 7;
|
||||
static constexpr std::uint64_t block_size_l2 = 1ULL << 15;
|
||||
static constexpr std::uint64_t block_size_l3 = 1ULL << 31;
|
||||
|
||||
private:
|
||||
std::uint64_t m_num_frees = 0;
|
||||
mm_vector<std::uint8_t> m_ints_l1;
|
||||
mm_vector<std::uint16_t> m_ints_l2;
|
||||
mm_vector<std::uint32_t> m_ints_l3;
|
||||
mm_vector<std::uint64_t> m_ints_l4;
|
||||
std::array<mm_vector<std::uint64_t>, max_levels - 1> m_ranks;
|
||||
compact_vector m_links;
|
||||
bit_vector m_leaves;
|
||||
|
||||
public:
|
||||
bc_vector_7() = default;
|
||||
virtual ~bc_vector_7() = default;
|
||||
|
||||
bc_vector_7(const bc_vector_7&) = delete;
|
||||
bc_vector_7& operator=(const bc_vector_7&) = delete;
|
||||
|
||||
bc_vector_7(bc_vector_7&&) noexcept = default;
|
||||
bc_vector_7& operator=(bc_vector_7&&) noexcept = default;
|
||||
|
||||
template <class BcUnits>
|
||||
explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
build(bc_units, std::move(leaves));
|
||||
}
|
||||
|
||||
template <class BcUnits>
|
||||
void build(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
std::vector<std::uint8_t> ints_l1;
|
||||
std::vector<std::uint16_t> ints_l2;
|
||||
std::vector<std::uint32_t> ints_l3;
|
||||
std::vector<std::uint64_t> ints_l4;
|
||||
std::array<std::vector<std::uint64_t>, max_levels - 1> ranks;
|
||||
std::vector<std::uint64_t> links;
|
||||
|
||||
ints_l1.reserve(bc_units.size() * 2);
|
||||
ranks[0].reserve((bc_units.size() * 2) >> l1_bits);
|
||||
links.reserve(bc_units.size());
|
||||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
if ((x / block_size_l1) == 0) {
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l2.size() - ranks[0].back();
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
if ((ints_l2.size() % block_size_l2) == 0) {
|
||||
ranks[1].push_back(static_cast<std::uint64_t>(ints_l3.size()));
|
||||
}
|
||||
if ((x / block_size_l2) == 0) {
|
||||
ints_l2.push_back(static_cast<std::uint16_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l3.size() - ranks[1].back();
|
||||
ints_l2.push_back(static_cast<std::uint16_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
if ((ints_l3.size() % block_size_l3) == 0) {
|
||||
ranks[2].push_back(static_cast<std::uint64_t>(ints_l4.size()));
|
||||
}
|
||||
if ((x / block_size_l3) == 0) {
|
||||
ints_l3.push_back(static_cast<std::uint32_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l4.size() - ranks[2].back();
|
||||
ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
|
||||
}
|
||||
ints_l4.push_back(x);
|
||||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
links.push_back(x >> 8);
|
||||
};
|
||||
|
||||
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
|
||||
if (leaves[i]) {
|
||||
append_leaf(bc_units[i].base);
|
||||
} else {
|
||||
append_unit(bc_units[i].base ^ i);
|
||||
}
|
||||
append_unit(bc_units[i].check ^ i);
|
||||
if (bc_units[i].check == i) {
|
||||
m_num_frees += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// release
|
||||
m_ints_l1.steal(ints_l1);
|
||||
m_ints_l2.steal(ints_l2);
|
||||
m_ints_l3.steal(ints_l3);
|
||||
m_ints_l4.steal(ints_l4);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
|
||||
m_ranks[j].steal(ranks[j]);
|
||||
}
|
||||
m_links.build(links);
|
||||
m_leaves.build(leaves, true, false);
|
||||
}
|
||||
|
||||
inline std::uint64_t base(std::uint64_t i) const {
|
||||
return access(i * 2) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t check(std::uint64_t i) const {
|
||||
return access(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t link(std::uint64_t i) const {
|
||||
return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 8);
|
||||
}
|
||||
|
||||
inline bool is_leaf(std::uint64_t i) const {
|
||||
return m_leaves[i];
|
||||
}
|
||||
|
||||
inline bool is_used(std::uint64_t i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_ints_l1.size() / 2;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_leaves() const {
|
||||
return m_leaves.num_ones();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_frees);
|
||||
visitor.visit(m_ints_l1);
|
||||
visitor.visit(m_ints_l2);
|
||||
visitor.visit(m_ints_l3);
|
||||
visitor.visit(m_ints_l4);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); j++) {
|
||||
visitor.visit(m_ranks[j]);
|
||||
}
|
||||
visitor.visit(m_links);
|
||||
visitor.visit(m_leaves);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t access(std::uint64_t i) const {
|
||||
std::uint64_t x = m_ints_l1[i] >> 1;
|
||||
if ((m_ints_l1[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[0][i / block_size_l1] + x;
|
||||
|
||||
x = m_ints_l2[i] >> 1;
|
||||
if ((m_ints_l2[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[1][i / block_size_l2] + x;
|
||||
|
||||
x = m_ints_l3[i] >> 1;
|
||||
if ((m_ints_l3[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[2][i / block_size_l3] + x;
|
||||
|
||||
return m_ints_l4[i];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
|
@ -5,18 +5,20 @@
|
|||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
|
||||
using bc_vector_type = xcdat::bc_vector_7;
|
||||
// using bc_vector_type = xcdat::bc_vector_8;
|
||||
|
||||
struct bc_unit {
|
||||
std::uint64_t base;
|
||||
std::uint64_t check;
|
||||
};
|
||||
|
||||
std::vector<bc_unit> make_random_units(std::uint64_t n) {
|
||||
static constexpr std::uint64_t seed = 13;
|
||||
|
||||
std::vector<bc_unit> make_random_units(std::uint64_t n, std::uint64_t maxv, std::uint64_t seed = 13) {
|
||||
std::mt19937_64 engine(seed);
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, n - 1);
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, maxv);
|
||||
|
||||
std::vector<bc_unit> bc_units(n);
|
||||
for (std::uint64_t i = 0; i < n; i++) {
|
||||
|
@ -38,18 +40,15 @@ std::uint64_t get_num_ones(const std::vector<bool>& bits) {
|
|||
return std::accumulate(bits.begin(), bits.end(), 0ULL);
|
||||
}
|
||||
|
||||
TEST_CASE("Test xcdat::bc_vector_8") {
|
||||
auto bc_units = make_random_units(10000);
|
||||
auto leaf_flags = xcdat::test::make_random_bits(10000, 0.2);
|
||||
|
||||
xcdat::bc_vector_8 bc(bc_units, to_bit_vector_builder(leaf_flags));
|
||||
void test_bc_vector(const std::vector<bc_unit>& bc_units, const std::vector<bool>& leaves) {
|
||||
bc_vector_type bc(bc_units, to_bit_vector_builder(leaves));
|
||||
|
||||
REQUIRE_EQ(bc.num_units(), bc_units.size());
|
||||
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaf_flags));
|
||||
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaves));
|
||||
|
||||
for (std::uint64_t i = 0; i < bc.num_units(); i++) {
|
||||
REQUIRE_EQ(bc.is_leaf(i), leaf_flags[i]);
|
||||
if (leaf_flags[i]) {
|
||||
REQUIRE_EQ(bc.is_leaf(i), leaves[i]);
|
||||
if (leaves[i]) {
|
||||
REQUIRE_EQ(bc.link(i), bc_units[i].base);
|
||||
} else {
|
||||
REQUIRE_EQ(bc.base(i), bc_units[i].base);
|
||||
|
@ -57,3 +56,17 @@ TEST_CASE("Test xcdat::bc_vector_8") {
|
|||
REQUIRE_EQ(bc.check(i), bc_units[i].check);
|
||||
}
|
||||
}
|
||||
|
||||
// Round-trips 10000 random units through the bc_vector under test, with
// 9999 passed as the upper bound to make_random_units and a leaf density
// of 0.2.
TEST_CASE("Test bc_vector 10K in [0,10K)") {
    constexpr std::uint64_t num_units = 10000;

    const auto units = make_random_units(num_units, num_units - 1);
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);

    test_bc_vector(units, leaf_flags);
}
|
||||
|
||||
// Same round-trip as the small-range case, but UINT64_MAX is passed as the
// upper bound to make_random_units, so values needing the widest storage
// level can be generated.
TEST_CASE("Test bc_vector 10K in [0,UINT64_MAX)") {
    constexpr std::uint64_t num_units = 10000;

    const auto units = make_random_units(num_units, UINT64_MAX);
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);

    test_bc_vector(units, leaf_flags);
}
|
||||
|
|
|
@ -15,6 +15,14 @@ std::vector<T> to_unique_vec(std::vector<T>&& vec) {
|
|||
return std::move(vec);
|
||||
}
|
||||
|
||||
// Returns the size of the longest string in `keys`, or 0 when `keys` is empty.
std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t longest = 0;
    for (auto it = keys.begin(); it != keys.end(); ++it) {
        const std::uint64_t len = it->size();
        if (longest < len) {
            longest = len;
        }
    }
    return longest;
}
|
||||
|
||||
std::vector<bool> make_random_bits(std::uint64_t n, double dens = 0.5, std::uint64_t seed = 13) {
|
||||
std::mt19937_64 engine(seed);
|
||||
std::uniform_real_distribution<double> dist(0.0, 1.0);
|
||||
|
@ -73,12 +81,4 @@ std::vector<std::string> extract_keys(std::vector<std::string>& keys, double rat
|
|||
return keys2;
|
||||
}
|
||||
|
||||
// Scans `keys` once and reports the largest string size seen (0 for no keys).
std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t best = 0;
    for (std::uint64_t pos = 0; pos < keys.size(); ++pos) {
        best = std::max<std::uint64_t>(best, keys[pos].size());
    }
    return best;
}
|
||||
|
||||
} // namespace xcdat::test
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#include "test_common.hpp"
|
||||
#include "xcdat/compact_vector.hpp"
|
||||
|
||||
TEST_CASE("Test xcdat::compact_vector (zero)") {
|
||||
TEST_CASE("Test compact_vector (zero)") {
|
||||
std::vector<std::uint64_t> ints = {0, 0, 0, 0, 0};
|
||||
xcdat::compact_vector cv(ints);
|
||||
|
||||
|
@ -18,7 +18,7 @@ TEST_CASE("Test xcdat::compact_vector (zero)") {
|
|||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Test xcdat::compact_vector (tiny)") {
|
||||
TEST_CASE("Test compact_vector (tiny)") {
|
||||
std::vector<std::uint64_t> ints = {2, 0, 14, 456, 32, 5544, 23};
|
||||
xcdat::compact_vector cv(ints);
|
||||
|
||||
|
@ -29,7 +29,7 @@ TEST_CASE("Test xcdat::compact_vector (tiny)") {
|
|||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Test xcdat::compact_vector (random)") {
|
||||
TEST_CASE("Test compact_vector (random)") {
|
||||
std::vector<std::uint64_t> ints = xcdat::test::make_random_ints(10000, 0, UINT16_MAX);
|
||||
xcdat::compact_vector cv(ints);
|
||||
|
||||
|
|
|
@ -9,7 +9,8 @@
|
|||
#include "test_common.hpp"
|
||||
#include "xcdat.hpp"
|
||||
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
using trie_type = xcdat::trie_7_type;
|
||||
// using trie_type = xcdat::trie_8_type;
|
||||
|
||||
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
const std::vector<std::string>& others) {
|
||||
|
|
Loading…
Reference in a new issue