From dd1860e792df52ee9609b9ce20304dea11ad71c6 Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Sun, 27 Jun 2021 14:06:08 +0900 Subject: [PATCH] add 7-bit version --- include/xcdat.hpp | 5 +- include/xcdat/bc_vector_7.hpp | 190 ++++++++++++++++++++++++++++++++++ test/test_bc_vector.cpp | 39 ++++--- test/test_common.hpp | 16 +-- test/test_compact_vector.cpp | 6 +- test/test_trie.cpp | 3 +- 6 files changed, 233 insertions(+), 26 deletions(-) create mode 100644 include/xcdat/bc_vector_7.hpp diff --git a/include/xcdat.hpp b/include/xcdat.hpp index 0066339..2ee025c 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -1,11 +1,14 @@ #pragma once +#include "xcdat/bc_vector_7.hpp" #include "xcdat/bc_vector_8.hpp" + #include "xcdat/io.hpp" #include "xcdat/trie.hpp" namespace xcdat { +using trie_7_type = trie; using trie_8_type = trie; -} +} // namespace xcdat diff --git a/include/xcdat/bc_vector_7.hpp b/include/xcdat/bc_vector_7.hpp new file mode 100644 index 0000000..c3736e0 --- /dev/null +++ b/include/xcdat/bc_vector_7.hpp @@ -0,0 +1,190 @@ +#pragma once + +#include + +#include "bit_vector.hpp" +#include "compact_vector.hpp" + +namespace xcdat { + +class bc_vector_7 { + public: + static constexpr std::uint32_t l1_bits = 7; + static constexpr std::uint32_t max_levels = 4; + static constexpr std::uint64_t block_size_l1 = 1ULL << 7; + static constexpr std::uint64_t block_size_l2 = 1ULL << 15; + static constexpr std::uint64_t block_size_l3 = 1ULL << 31; + + private: + std::uint64_t m_num_frees = 0; + mm_vector m_ints_l1; + mm_vector m_ints_l2; + mm_vector m_ints_l3; + mm_vector m_ints_l4; + std::array, max_levels - 1> m_ranks; + compact_vector m_links; + bit_vector m_leaves; + + public: + bc_vector_7() = default; + virtual ~bc_vector_7() = default; + + bc_vector_7(const bc_vector_7&) = delete; + bc_vector_7& operator=(const bc_vector_7&) = delete; + + bc_vector_7(bc_vector_7&&) noexcept = default; + bc_vector_7& operator=(bc_vector_7&&) noexcept = default; + + template + explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) { + build(bc_units, std::move(leaves)); + } + + template + void build(const BcUnits& bc_units, bit_vector::builder&& leaves) { + std::vector ints_l1; + std::vector ints_l2; + std::vector ints_l3; + std::vector ints_l4; + std::array, max_levels - 1> ranks; + std::vector links; + + ints_l1.reserve(bc_units.size() * 2); + ranks[0].reserve((bc_units.size() * 2) >> l1_bits); + links.reserve(bc_units.size()); + + auto append_unit = [&](std::uint64_t x) { + if ((ints_l1.size() % block_size_l1) == 0) { + ranks[0].push_back(static_cast(ints_l2.size())); + } + if ((x / block_size_l1) == 0) { + ints_l1.push_back(static_cast(0 | (x << 1))); + return; + } else { + const auto i = ints_l2.size() - ranks[0].back(); + ints_l1.push_back(static_cast(1 | (i << 1))); + } + + if ((ints_l2.size() % block_size_l2) == 0) { + ranks[1].push_back(static_cast(ints_l3.size())); + } + if ((x / block_size_l2) == 0) { + ints_l2.push_back(static_cast(0 | (x << 1))); + return; + } else { + const auto i = ints_l3.size() - ranks[1].back(); + ints_l2.push_back(static_cast(1 | (i << 1))); + } + + if ((ints_l3.size() % block_size_l3) == 0) { + ranks[2].push_back(static_cast(ints_l4.size())); + } + if ((x / block_size_l3) == 0) { + ints_l3.push_back(static_cast(0 | (x << 1))); + return; + } else { + const auto i = ints_l4.size() - ranks[2].back(); + ints_l3.push_back(static_cast(1 | (i << 1))); + } + ints_l4.push_back(x); + }; + + auto append_leaf = [&](std::uint64_t x) { + if ((ints_l1.size() % block_size_l1) == 0) { + ranks[0].push_back(static_cast(ints_l2.size())); + } + ints_l1.push_back(static_cast(x & 0xFF)); + links.push_back(x >> 8); + }; + + for (std::uint64_t i = 0; i < bc_units.size(); ++i) { + if (leaves[i]) { + append_leaf(bc_units[i].base); + } else { + append_unit(bc_units[i].base ^ i); + } + append_unit(bc_units[i].check ^ i); + if (bc_units[i].check == i) { + m_num_frees += 1; + } + } + + // release + m_ints_l1.steal(ints_l1); + m_ints_l2.steal(ints_l2); + m_ints_l3.steal(ints_l3); + m_ints_l4.steal(ints_l4); + for (std::uint32_t j = 0; j < m_ranks.size(); ++j) { + m_ranks[j].steal(ranks[j]); + } + m_links.build(links); + m_leaves.build(leaves, true, false); + } + + inline std::uint64_t base(std::uint64_t i) const { + return access(i * 2) ^ i; + } + + inline std::uint64_t check(std::uint64_t i) const { + return access(i * 2 + 1) ^ i; + } + + inline std::uint64_t link(std::uint64_t i) const { + return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 8); + } + + inline bool is_leaf(std::uint64_t i) const { + return m_leaves[i]; + } + + inline bool is_used(std::uint64_t i) const { + return check(i) != i; + } + + inline std::uint64_t num_units() const { + return m_ints_l1.size() / 2; + } + + inline std::uint64_t num_leaves() const { + return m_leaves.num_ones(); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_num_frees); + visitor.visit(m_ints_l1); + visitor.visit(m_ints_l2); + visitor.visit(m_ints_l3); + visitor.visit(m_ints_l4); + for (std::uint32_t j = 0; j < m_ranks.size(); j++) { + visitor.visit(m_ranks[j]); + } + visitor.visit(m_links); + visitor.visit(m_leaves); + } + + private: + inline std::uint64_t access(std::uint64_t i) const { + std::uint64_t x = m_ints_l1[i] >> 1; + if ((m_ints_l1[i] & 1U) == 0) { + return x; + } + i = m_ranks[0][i / block_size_l1] + x; + + x = m_ints_l2[i] >> 1; + if ((m_ints_l2[i] & 1U) == 0) { + return x; + } + i = m_ranks[1][i / block_size_l2] + x; + + x = m_ints_l3[i] >> 1; + if ((m_ints_l3[i] & 1U) == 0) { + return x; + } + i = m_ranks[2][i / block_size_l3] + x; + + return m_ints_l4[i]; + } +}; + +} // namespace xcdat \ No newline at end of file diff --git a/test/test_bc_vector.cpp b/test/test_bc_vector.cpp index d54969a..397c06b 100644 --- a/test/test_bc_vector.cpp +++ b/test/test_bc_vector.cpp @@ -5,18 +5,20 @@ #include "doctest/doctest.h" #include "test_common.hpp" +#include "xcdat/bc_vector_7.hpp" #include "xcdat/bc_vector_8.hpp" +using bc_vector_type = xcdat::bc_vector_7; +// using bc_vector_type = xcdat::bc_vector_8; + struct bc_unit { std::uint64_t base; std::uint64_t check; }; -std::vector make_random_units(std::uint64_t n) { - static constexpr std::uint64_t seed = 13; - +std::vector make_random_units(std::uint64_t n, std::uint64_t maxv, std::uint64_t seed = 13) { std::mt19937_64 engine(seed); - std::uniform_int_distribution dist(0, n - 1); + std::uniform_int_distribution dist(0, maxv); std::vector bc_units(n); for (std::uint64_t i = 0; i < n; i++) { @@ -38,22 +40,33 @@ std::uint64_t get_num_ones(const std::vector& bits) { return std::accumulate(bits.begin(), bits.end(), 0ULL); } -TEST_CASE("Test xcdat::bc_vector_8") { - auto bc_units = make_random_units(10000); - auto leaf_flags = xcdat::test::make_random_bits(10000, 0.2); - - xcdat::bc_vector_8 bc(bc_units, to_bit_vector_builder(leaf_flags)); +void test_bc_vector(const std::vector& bc_units, const std::vector& leaves) { + bc_vector_type bc(bc_units, to_bit_vector_builder(leaves)); REQUIRE_EQ(bc.num_units(), bc_units.size()); - REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaf_flags)); + REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaves)); for (std::uint64_t i = 0; i < bc.num_units(); i++) { - REQUIRE_EQ(bc.is_leaf(i), leaf_flags[i]); - if (leaf_flags[i]) { + REQUIRE_EQ(bc.is_leaf(i), leaves[i]); + if (leaves[i]) { REQUIRE_EQ(bc.link(i), bc_units[i].base); } else { REQUIRE_EQ(bc.base(i), bc_units[i].base); } REQUIRE_EQ(bc.check(i), bc_units[i].check); } -} \ No newline at end of file +} + +TEST_CASE("Test bc_vector 10K in [0,10K)") { + const std::uint64_t size = 10000; + auto bc_units = make_random_units(size, size - 1); + auto leaves = xcdat::test::make_random_bits(size, 0.2); + test_bc_vector(bc_units, leaves); +} + +TEST_CASE("Test bc_vector 10K in [0,UINT64_MAX)") { + const std::uint64_t size = 10000; + auto bc_units = make_random_units(size, UINT64_MAX); + auto leaves = xcdat::test::make_random_bits(size, 0.2); + test_bc_vector(bc_units, leaves); +} diff --git a/test/test_common.hpp b/test/test_common.hpp index f537f02..ff7d12c 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -15,6 +15,14 @@ std::vector to_unique_vec(std::vector&& vec) { return std::move(vec); } +std::uint64_t max_length(const std::vector& keys) { + std::uint64_t n = 0; + for (auto& key : keys) { + n = std::max(n, key.size()); + } + return n; +} + std::vector make_random_bits(std::uint64_t n, double dens = 0.5, std::uint64_t seed = 13) { std::mt19937_64 engine(seed); std::uniform_real_distribution dist(0.0, 1.0); @@ -73,12 +81,4 @@ std::vector extract_keys(std::vector& keys, double rat return keys2; } -std::uint64_t max_length(const std::vector& keys) { - std::uint64_t n = 0; - for (auto& key : keys) { - n = std::max(n, key.size()); - } - return n; -} - } // namespace xcdat::test diff --git a/test/test_compact_vector.cpp b/test/test_compact_vector.cpp index 436f31b..bf19a42 100644 --- a/test/test_compact_vector.cpp +++ b/test/test_compact_vector.cpp @@ -7,7 +7,7 @@ #include "test_common.hpp" #include "xcdat/compact_vector.hpp" -TEST_CASE("Test xcdat::compact_vector (zero)") { +TEST_CASE("Test compact_vector (zero)") { std::vector ints = {0, 0, 0, 0, 0}; xcdat::compact_vector cv(ints); @@ -18,7 +18,7 @@ TEST_CASE("Test xcdat::compact_vector (zero)") { } } -TEST_CASE("Test xcdat::compact_vector (tiny)") { +TEST_CASE("Test compact_vector (tiny)") { std::vector ints = {2, 0, 14, 456, 32, 5544, 23}; xcdat::compact_vector cv(ints); @@ -29,7 +29,7 @@ TEST_CASE("Test xcdat::compact_vector (tiny)") { } } -TEST_CASE("Test xcdat::compact_vector (random)") { +TEST_CASE("Test compact_vector (random)") { std::vector ints = xcdat::test::make_random_ints(10000, 0, UINT16_MAX); xcdat::compact_vector cv(ints); diff --git a/test/test_trie.cpp b/test/test_trie.cpp index 30dbc86..79cdc1e 100644 --- a/test/test_trie.cpp +++ b/test/test_trie.cpp @@ -9,7 +9,8 @@ #include "test_common.hpp" #include "xcdat.hpp" -using trie_type = xcdat::trie_8_type; +using trie_type = xcdat::trie_7_type; +// using trie_type = xcdat::trie_8_type; void test_basic_operations(const trie_type& trie, const std::vector& keys, const std::vector& others) {