add 7-bit version

This commit is contained in:
Shunsuke Kanda 2021-06-27 14:06:08 +09:00
parent 1f1080e605
commit dd1860e792
6 changed files with 233 additions and 26 deletions

View file

@ -1,11 +1,14 @@
#pragma once
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
#include "xcdat/io.hpp"
#include "xcdat/trie.hpp"
namespace xcdat {
using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>;
}
} // namespace xcdat

View file

@ -0,0 +1,190 @@
#pragma once
#include <array>
#include "bit_vector.hpp"
#include "compact_vector.hpp"
namespace xcdat {
/// Compressed storage for the BASE/CHECK units of a double-array trie, using a
/// byte-sized first level (7 payload bits + 1 flag bit).
///
/// Layout, as established by build() and access():
///  - Unit i owns two consecutive level-1 slots: slot 2i (BASE, or the low byte
///    of the link for a leaf) and slot 2i+1 (CHECK).
///  - Non-leaf values are stored XORed with the unit index i.
///  - A slot at level k holds either the value itself (flag bit 0, value in the
///    upper bits) or, when the value does not fit, an offset into level k+1
///    (flag bit 1). Offsets are relative to the start position recorded in
///    m_ranks[k-1] for the slot's block, so they always fit in the payload.
///  - Level 4 stores the full 64-bit value with no flag bit.
///  - For leaf units, the low 8 bits of the link are stored raw in the level-1
///    BASE slot and the remaining bits go to m_links.
class bc_vector_7 {
  public:
    static constexpr std::uint32_t l1_bits = 7;
    static constexpr std::uint32_t max_levels = 4;

    // Number of slots per rank block at each level. The same constants are the
    // exclusive upper bound of values that can be stored directly at that level.
    static constexpr std::uint64_t block_size_l1 = 1ULL << 7;
    static constexpr std::uint64_t block_size_l2 = 1ULL << 15;
    static constexpr std::uint64_t block_size_l3 = 1ULL << 31;

  private:
    std::uint64_t m_num_frees = 0;  // units whose check equals their own index
    mm_vector<std::uint8_t> m_ints_l1;
    mm_vector<std::uint16_t> m_ints_l2;
    mm_vector<std::uint32_t> m_ints_l3;
    mm_vector<std::uint64_t> m_ints_l4;
    // m_ranks[k][b]: size of level k+2 at the moment block b of level k+1 began.
    std::array<mm_vector<std::uint64_t>, max_levels - 1> m_ranks;
    compact_vector m_links;  // link bits above the low byte, one entry per leaf
    bit_vector m_leaves;     // leaf flag per unit

  public:
    bc_vector_7() = default;
    virtual ~bc_vector_7() = default;

    bc_vector_7(const bc_vector_7&) = delete;
    bc_vector_7& operator=(const bc_vector_7&) = delete;

    bc_vector_7(bc_vector_7&&) noexcept = default;
    bc_vector_7& operator=(bc_vector_7&&) noexcept = default;

    /// Builds the structure from bc_units and leaves; see build().
    template <class BcUnits>
    explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) {
        build(bc_units, std::move(leaves));
    }

    /// (Re)builds the structure.
    ///
    /// @param bc_units Indexable sequence providing size() and elements with
    ///                 `base` and `check` members.
    /// @param leaves   Per-unit leaf flags; for a leaf unit, `base` is treated
    ///                 as a link value rather than a BASE value.
    ///
    /// The free-unit counter is computed from scratch on every call, so calling
    /// build() again on an already-built object does not accumulate counts from
    /// the previous build.
    template <class BcUnits>
    void build(const BcUnits& bc_units, bit_vector::builder&& leaves) {
        std::vector<std::uint8_t> ints_l1;
        std::vector<std::uint16_t> ints_l2;
        std::vector<std::uint32_t> ints_l3;
        std::vector<std::uint64_t> ints_l4;
        std::array<std::vector<std::uint64_t>, max_levels - 1> ranks;
        std::vector<std::uint64_t> links;

        // Counted locally and committed at the end together with the arrays.
        std::uint64_t num_frees = 0;

        ints_l1.reserve(bc_units.size() * 2);
        ranks[0].reserve((bc_units.size() * 2) >> l1_bits);
        links.reserve(bc_units.size());

        // Appends one value, spilling to deeper levels while it does not fit.
        auto append_unit = [&](std::uint64_t x) {
            // A new level-1 block starts: remember where level 2 currently ends.
            if ((ints_l1.size() % block_size_l1) == 0) {
                ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
            }
            if ((x / block_size_l1) == 0) {
                ints_l1.push_back(static_cast<std::uint8_t>(0 | (x << 1)));
                return;
            } else {
                // Block-relative offset of the level-2 slot about to be written.
                const auto i = ints_l2.size() - ranks[0].back();
                ints_l1.push_back(static_cast<std::uint8_t>(1 | (i << 1)));
            }

            if ((ints_l2.size() % block_size_l2) == 0) {
                ranks[1].push_back(static_cast<std::uint64_t>(ints_l3.size()));
            }
            if ((x / block_size_l2) == 0) {
                ints_l2.push_back(static_cast<std::uint16_t>(0 | (x << 1)));
                return;
            } else {
                const auto i = ints_l3.size() - ranks[1].back();
                ints_l2.push_back(static_cast<std::uint16_t>(1 | (i << 1)));
            }

            if ((ints_l3.size() % block_size_l3) == 0) {
                ranks[2].push_back(static_cast<std::uint64_t>(ints_l4.size()));
            }
            if ((x / block_size_l3) == 0) {
                ints_l3.push_back(static_cast<std::uint32_t>(0 | (x << 1)));
                return;
            } else {
                const auto i = ints_l4.size() - ranks[2].back();
                ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
            }

            ints_l4.push_back(x);
        };

        // Appends a leaf link: low byte raw at level 1, the rest into links.
        auto append_leaf = [&](std::uint64_t x) {
            // Block bookkeeping must stay in step with append_unit.
            if ((ints_l1.size() % block_size_l1) == 0) {
                ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
            }
            ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
            links.push_back(x >> 8);
        };

        for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
            if (leaves[i]) {
                append_leaf(bc_units[i].base);
            } else {
                append_unit(bc_units[i].base ^ i);
            }
            append_unit(bc_units[i].check ^ i);
            if (bc_units[i].check == i) {
                num_frees += 1;
            }
        }

        // release
        m_num_frees = num_frees;
        m_ints_l1.steal(ints_l1);
        m_ints_l2.steal(ints_l2);
        m_ints_l3.steal(ints_l3);
        m_ints_l4.steal(ints_l4);
        for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
            m_ranks[j].steal(ranks[j]);
        }
        m_links.build(links);
        m_leaves.build(leaves, true, false);
    }

    /// Returns the BASE value of unit i. Only meaningful for non-leaf units:
    /// a leaf's slot holds a raw link byte, not a flag-encoded value.
    inline std::uint64_t base(std::uint64_t i) const {
        return access(i * 2) ^ i;
    }

    /// Returns the CHECK value of unit i.
    inline std::uint64_t check(std::uint64_t i) const {
        return access(i * 2 + 1) ^ i;
    }

    /// Returns the link value of leaf unit i, reassembled from the raw low
    /// byte at level 1 and the upper bits kept in m_links.
    inline std::uint64_t link(std::uint64_t i) const {
        return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 8);
    }

    /// Returns whether unit i was flagged as a leaf at build time.
    inline bool is_leaf(std::uint64_t i) const {
        return m_leaves[i];
    }

    /// Returns whether unit i is in use, i.e. its CHECK differs from i.
    inline bool is_used(std::uint64_t i) const {
        return check(i) != i;
    }

    /// Returns the number of units (two level-1 slots per unit).
    inline std::uint64_t num_units() const {
        return m_ints_l1.size() / 2;
    }

    /// Returns the number of units flagged as leaves.
    inline std::uint64_t num_leaves() const {
        return m_leaves.num_ones();
    }

    /// Passes every data member to visitor.visit(), in declaration order.
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_num_frees);
        visitor.visit(m_ints_l1);
        visitor.visit(m_ints_l2);
        visitor.visit(m_ints_l3);
        visitor.visit(m_ints_l4);
        for (std::uint32_t j = 0; j < m_ranks.size(); j++) {
            visitor.visit(m_ranks[j]);
        }
        visitor.visit(m_links);
        visitor.visit(m_leaves);
    }

  private:
    /// Decodes the value stored for level-1 slot i, following offsets into
    /// deeper levels for as long as the flag bit is set.
    inline std::uint64_t access(std::uint64_t i) const {
        std::uint64_t x = m_ints_l1[i] >> 1;
        if ((m_ints_l1[i] & 1U) == 0) {
            return x;
        }

        // x is a block-relative offset; add the block's recorded start.
        i = m_ranks[0][i / block_size_l1] + x;
        x = m_ints_l2[i] >> 1;
        if ((m_ints_l2[i] & 1U) == 0) {
            return x;
        }

        i = m_ranks[1][i / block_size_l2] + x;
        x = m_ints_l3[i] >> 1;
        if ((m_ints_l3[i] & 1U) == 0) {
            return x;
        }

        i = m_ranks[2][i / block_size_l3] + x;
        return m_ints_l4[i];
    }
};
} // namespace xcdat

View file

@ -5,18 +5,20 @@
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
using bc_vector_type = xcdat::bc_vector_7;
// using bc_vector_type = xcdat::bc_vector_8;
// Test-side stand-in for one BASE/CHECK unit; a vector of these is what the
// tests hand to the bc_vector constructor.
struct bc_unit {
// Expected result of base(i) for non-leaf units and of link(i) for leaf units.
std::uint64_t base;
// Expected result of check(i).
std::uint64_t check;
};
std::vector<bc_unit> make_random_units(std::uint64_t n) {
static constexpr std::uint64_t seed = 13;
std::vector<bc_unit> make_random_units(std::uint64_t n, std::uint64_t maxv, std::uint64_t seed = 13) {
std::mt19937_64 engine(seed);
std::uniform_int_distribution<std::uint64_t> dist(0, n - 1);
std::uniform_int_distribution<std::uint64_t> dist(0, maxv);
std::vector<bc_unit> bc_units(n);
for (std::uint64_t i = 0; i < n; i++) {
@ -38,18 +40,15 @@ std::uint64_t get_num_ones(const std::vector<bool>& bits) {
return std::accumulate(bits.begin(), bits.end(), 0ULL);
}
TEST_CASE("Test xcdat::bc_vector_8") {
auto bc_units = make_random_units(10000);
auto leaf_flags = xcdat::test::make_random_bits(10000, 0.2);
xcdat::bc_vector_8 bc(bc_units, to_bit_vector_builder(leaf_flags));
void test_bc_vector(const std::vector<bc_unit>& bc_units, const std::vector<bool>& leaves) {
bc_vector_type bc(bc_units, to_bit_vector_builder(leaves));
REQUIRE_EQ(bc.num_units(), bc_units.size());
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaf_flags));
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaves));
for (std::uint64_t i = 0; i < bc.num_units(); i++) {
REQUIRE_EQ(bc.is_leaf(i), leaf_flags[i]);
if (leaf_flags[i]) {
REQUIRE_EQ(bc.is_leaf(i), leaves[i]);
if (leaves[i]) {
REQUIRE_EQ(bc.link(i), bc_units[i].base);
} else {
REQUIRE_EQ(bc.base(i), bc_units[i].base);
@ -57,3 +56,17 @@ TEST_CASE("Test xcdat::bc_vector_8") {
REQUIRE_EQ(bc.check(i), bc_units[i].check);
}
}
// Round-trip check on 10,000 units, passing size - 1 as the maximum value so
// the generated values stay in the range named by the test title.
TEST_CASE("Test bc_vector 10K in [0,10K)") {
    constexpr std::uint64_t num_units = 10000;
    const auto units = make_random_units(num_units, num_units - 1);
    // 0.2 is forwarded as the density argument of make_random_bits.
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);
    test_bc_vector(units, leaf_flags);
}
// Round-trip check on 10,000 units, passing UINT64_MAX as the maximum value so
// that values too large for the small levels can be generated.
TEST_CASE("Test bc_vector 10K in [0,UINT64_MAX)") {
    constexpr std::uint64_t num_units = 10000;
    const auto units = make_random_units(num_units, UINT64_MAX);
    // 0.2 is forwarded as the density argument of make_random_bits.
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);
    test_bc_vector(units, leaf_flags);
}

View file

@ -15,6 +15,14 @@ std::vector<T> to_unique_vec(std::vector<T>&& vec) {
return std::move(vec);
}
// Returns the size of the longest string in keys, or 0 when keys is empty.
std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t longest = 0;
    for (const std::string& candidate : keys) {
        const auto len = static_cast<std::uint64_t>(candidate.size());
        if (longest < len) {
            longest = len;
        }
    }
    return longest;
}
std::vector<bool> make_random_bits(std::uint64_t n, double dens = 0.5, std::uint64_t seed = 13) {
std::mt19937_64 engine(seed);
std::uniform_real_distribution<double> dist(0.0, 1.0);
@ -73,12 +81,4 @@ std::vector<std::string> extract_keys(std::vector<std::string>& keys, double rat
return keys2;
}
// Returns the size of the longest string in keys, or 0 when keys is empty.
std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t longest = 0;
    for (const std::string& candidate : keys) {
        const auto len = static_cast<std::uint64_t>(candidate.size());
        if (longest < len) {
            longest = len;
        }
    }
    return longest;
}
} // namespace xcdat::test

View file

@ -7,7 +7,7 @@
#include "test_common.hpp"
#include "xcdat/compact_vector.hpp"
TEST_CASE("Test xcdat::compact_vector (zero)") {
TEST_CASE("Test compact_vector (zero)") {
std::vector<std::uint64_t> ints = {0, 0, 0, 0, 0};
xcdat::compact_vector cv(ints);
@ -18,7 +18,7 @@ TEST_CASE("Test xcdat::compact_vector (zero)") {
}
}
TEST_CASE("Test xcdat::compact_vector (tiny)") {
TEST_CASE("Test compact_vector (tiny)") {
std::vector<std::uint64_t> ints = {2, 0, 14, 456, 32, 5544, 23};
xcdat::compact_vector cv(ints);
@ -29,7 +29,7 @@ TEST_CASE("Test xcdat::compact_vector (tiny)") {
}
}
TEST_CASE("Test xcdat::compact_vector (random)") {
TEST_CASE("Test compact_vector (random)") {
std::vector<std::uint64_t> ints = xcdat::test::make_random_ints(10000, 0, UINT16_MAX);
xcdat::compact_vector cv(ints);

View file

@ -9,7 +9,8 @@
#include "test_common.hpp"
#include "xcdat.hpp"
using trie_type = xcdat::trie_8_type;
using trie_type = xcdat::trie_7_type;
// using trie_type = xcdat::trie_8_type;
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
const std::vector<std::string>& others) {