From e216688f2fcc95fb674bf110c1b0cab162bd5257 Mon Sep 17 00:00:00 2001 From: Shunsuke Kanda Date: Tue, 29 Jun 2021 09:06:40 +0900 Subject: [PATCH] fix the interface --- CMakeLists.txt | 1 + include/xcdat.hpp | 35 ++ include/xcdat/bc_vector_7.hpp | 20 +- include/xcdat/bc_vector_8.hpp | 17 +- include/xcdat/bit_vector.hpp | 27 +- include/xcdat/code_table.hpp | 9 +- include/xcdat/compact_vector.hpp | 22 +- include/xcdat/essentials/essentials.hpp | 679 ------------------------ include/xcdat/immutable_vector.hpp | 85 ++- include/xcdat/io.hpp | 28 + include/xcdat/load_visitor.hpp | 43 ++ include/xcdat/mmap_visitor.hpp | 39 ++ include/xcdat/save_visitor.hpp | 43 ++ include/xcdat/size_visitor.hpp | 39 ++ include/xcdat/tail_vector.hpp | 10 +- include/xcdat/trie.hpp | 29 +- include/xcdat/trie_builder.hpp | 2 +- sample/sample.cpp | 9 +- tests/test_bit_vector.cpp | 2 +- tests/test_trie.cpp | 51 +- tools/xcdat_build.cpp | 19 +- tools/xcdat_decode.cpp | 2 +- tools/xcdat_enumerate.cpp | 2 +- tools/xcdat_lookup.cpp | 2 +- tools/xcdat_predictive_search.cpp | 3 +- tools/xcdat_prefix_search.cpp | 3 +- 26 files changed, 397 insertions(+), 824 deletions(-) delete mode 100644 include/xcdat/essentials/essentials.hpp create mode 100644 include/xcdat/load_visitor.hpp create mode 100644 include/xcdat/mmap_visitor.hpp create mode 100644 include/xcdat/save_visitor.hpp create mode 100644 include/xcdat/size_visitor.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c643265..a864acd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}") include_directories(include) add_subdirectory(sample) + add_subdirectory(tools) enable_testing() diff --git a/include/xcdat.hpp b/include/xcdat.hpp index 2ee025c..040604b 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -6,9 +6,44 @@ #include "xcdat/io.hpp" #include "xcdat/trie.hpp" +#include "xcdat/load_visitor.hpp" +#include "xcdat/mmap_visitor.hpp" +#include "xcdat/save_visitor.hpp" +#include "xcdat/size_visitor.hpp" + namespace xcdat { using trie_7_type = trie; using trie_8_type = trie; +template +static Trie mmap(const char* address) { + Trie idx; + mmap_visitor visitor(address); + visitor.visit(idx); + return idx; +} + +template +static Trie load(std::string_view filepath) { + Trie idx; + load_visitor visitor(filepath); + visitor.visit(idx); + return idx; +} + +template +static std::uint64_t save(const Trie& idx, std::string_view filepath) { + save_visitor visitor(filepath); + visitor.visit(const_cast(idx)); + return visitor.bytes(); +} + +template +static std::uint64_t memory_in_bytes(const Trie& idx) { + size_visitor visitor; + visitor.visit(const_cast(idx)); + return visitor.bytes(); +} + } // namespace xcdat diff --git a/include/xcdat/bc_vector_7.hpp b/include/xcdat/bc_vector_7.hpp index 8ec1b3c..219a155 100644 --- a/include/xcdat/bc_vector_7.hpp +++ b/include/xcdat/bc_vector_7.hpp @@ -11,6 +11,7 @@ class bc_vector_7 { public: static constexpr std::uint32_t l1_bits = 7; static constexpr std::uint32_t max_levels = 4; + static constexpr std::uint64_t block_size_l1 = 1ULL << 7; static constexpr std::uint64_t block_size_l2 = 1ULL << 15; static constexpr std::uint64_t block_size_l3 = 1ULL << 31; @@ -37,11 +38,6 @@ class bc_vector_7 { template explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) { - build(bc_units, std::move(leaves)); - } - - template - void build(const BcUnits& bc_units, bit_vector::builder&& leaves) { std::vector ints_l1; std::vector ints_l2; std::vector ints_l3; @@ -110,15 +106,15 @@ class bc_vector_7 { } // release - m_ints_l1.steal(ints_l1); - m_ints_l2.steal(ints_l2); - m_ints_l3.steal(ints_l3); - m_ints_l4.steal(ints_l4); + m_ints_l1.build(ints_l1); + m_ints_l2.build(ints_l2); + m_ints_l3.build(ints_l3); + m_ints_l4.build(ints_l4); for (std::uint32_t j = 0; j < m_ranks.size(); ++j) { - m_ranks[j].steal(ranks[j]); + m_ranks[j].build(ranks[j]); } - m_links.build(links); - m_leaves.build(leaves, true, false); + m_links = compact_vector(links); + m_leaves = bit_vector(leaves, true, false); } inline std::uint64_t base(std::uint64_t i) const { diff --git a/include/xcdat/bc_vector_8.hpp b/include/xcdat/bc_vector_8.hpp index a4ed365..d69819d 100644 --- a/include/xcdat/bc_vector_8.hpp +++ b/include/xcdat/bc_vector_8.hpp @@ -32,11 +32,6 @@ class bc_vector_8 { template explicit bc_vector_8(const BcUnits& bc_units, bit_vector::builder&& leaves) { - build(bc_units, std::move(leaves)); - } - - template - void build(const BcUnits& bc_units, bit_vector::builder&& leaves) { std::array, max_levels> bytes; std::array next_flags; std::vector links; @@ -81,13 +76,13 @@ class bc_vector_8 { } // release - for (uint8_t i = 0; i < m_num_levels; ++i) { - m_bytes[i].steal(bytes[i]); - m_nexts[i].build(next_flags[i], true, false); + for (std::uint32_t i = 0; i < m_num_levels; ++i) { + m_bytes[i].build(bytes[i]); + m_nexts[i] = bit_vector(next_flags[i], true, false); } - m_bytes[m_num_levels].steal(bytes[m_num_levels]); - m_links.build(links); - m_leaves.build(leaves, true, false); + m_bytes[m_num_levels].build(bytes[m_num_levels]); + m_links = compact_vector(links); + m_leaves = bit_vector(leaves, true, false); } inline std::uint64_t base(std::uint64_t i) const { diff --git a/include/xcdat/bit_vector.hpp b/include/xcdat/bit_vector.hpp index f70d94f..143819d 100644 --- a/include/xcdat/bit_vector.hpp +++ b/include/xcdat/bit_vector.hpp @@ -1,17 +1,13 @@ #pragma once -#include -#include #include - -#include "essentials/essentials.hpp" +#include #include "bit_tools.hpp" #include "immutable_vector.hpp" namespace xcdat { -//! Rank9 implementatoin class bit_vector { public: class builder { @@ -56,12 +52,12 @@ class bit_vector { } inline void resize(std::uint64_t size) { - m_bits.resize(essentials::words_for(size), 0ULL); + m_bits.resize(words_for(size), 0ULL); m_size = size; } inline void reserve(std::uint64_t capacity) { - m_bits.reserve(essentials::words_for(capacity)); + m_bits.reserve(words_for(capacity)); } inline std::uint64_t size() const { @@ -92,17 +88,10 @@ class bit_vector { bit_vector& operator=(bit_vector&&) noexcept = default; explicit bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) { - build(b, enable_rank, enable_select); - } - - void build(builder& b, bool enable_rank = false, bool enable_select = false) { - m_bits.steal(b.m_bits); + m_bits.build(b.m_bits); m_size = b.m_size; m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL, [](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); }); - m_rank_hints.clear(); - m_select_hints.clear(); - if (enable_rank) { build_rank_hints(); } @@ -172,6 +161,10 @@ class bit_vector { return {x / N, x % N}; } + static std::uint64_t words_for(std::uint64_t nbits) { + return (nbits + 63) / 64; + } + inline std::uint64_t num_blocks() const { return m_rank_hints.size() / 2 - 1; } @@ -258,7 +251,7 @@ class bit_vector { } // Release - m_rank_hints.steal(rank_hints); + m_rank_hints.build(rank_hints); } void build_select_hints() { @@ -271,7 +264,7 @@ class bit_vector { } } select_hints.push_back(num_blocks()); - m_select_hints.steal(select_hints); + m_select_hints.build(select_hints); } }; diff --git a/include/xcdat/code_table.hpp b/include/xcdat/code_table.hpp index c38a14f..7e31dca 100644 --- a/include/xcdat/code_table.hpp +++ b/include/xcdat/code_table.hpp @@ -29,12 +29,7 @@ class code_table { code_table& operator=(code_table&&) noexcept = default; template - explicit code_table(const Strings& keys) { - build(keys); - } - - template - void build(const Strings& keys) { + code_table(const Strings& keys) { std::array counter; for (std::uint32_t ch = 0; ch < 256; ++ch) { counter[ch] = {static_cast(ch), 0}; @@ -55,7 +50,7 @@ class code_table { alphabet.push_back(cf.ch); } } - m_alphabet.steal(alphabet); + m_alphabet.build(alphabet); } std::sort(counter.begin(), counter.end(), diff --git a/include/xcdat/compact_vector.hpp b/include/xcdat/compact_vector.hpp index de8f0a5..10e7087 100644 --- a/include/xcdat/compact_vector.hpp +++ b/include/xcdat/compact_vector.hpp @@ -1,14 +1,11 @@ #pragma once -#include "essentials/essentials.hpp" - #include "bit_tools.hpp" #include "exception.hpp" #include "immutable_vector.hpp" namespace xcdat { -//! A compressed integer vector. class compact_vector { private: std::uint64_t m_size = 0; @@ -27,21 +24,14 @@ class compact_vector { compact_vector& operator=(compact_vector&&) noexcept = default; template - explicit compact_vector(const Vec& vec) { - build(vec); - } - - template - void build(const Vec& vec) { + compact_vector(const Vec& vec) { XCDAT_THROW_IF(vec.size() == 0, "The input vector is empty."); - const std::uint64_t maxv = *std::max_element(vec.begin(), vec.end()); - m_size = vec.size(); - m_bits = needed_bits(maxv); + m_bits = needed_bits(*std::max_element(vec.begin(), vec.end())); m_mask = (1ULL << m_bits) - 1; - std::vector chunks(essentials::words_for(m_size * m_bits)); + std::vector chunks(words_for(m_size * m_bits)); for (std::uint64_t i = 0; i < m_size; i++) { const auto [quo, mod] = decompose(i * m_bits); @@ -53,7 +43,7 @@ class compact_vector { chunks[quo + 1] |= (vec[i] & m_mask) >> diff; } } - m_chunks.steal(chunks); + m_chunks.build(chunks); } inline std::uint64_t operator[](std::uint64_t i) const { @@ -90,6 +80,10 @@ class compact_vector { static std::tuple decompose(std::uint64_t x) { return {x / 64, x % 64}; } + + static std::uint64_t words_for(std::uint64_t nbits) { + return (nbits + 63) / 64; + } }; } // namespace xcdat \ No newline at end of file diff --git a/include/xcdat/essentials/essentials.hpp b/include/xcdat/essentials/essentials.hpp deleted file mode 100644 index a54eeef..0000000 --- a/include/xcdat/essentials/essentials.hpp +++ /dev/null @@ -1,679 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __GNUG__ -#include // for name demangling -#endif - -namespace essentials { - -void logger(std::string const& msg) { - time_t t = std::time(nullptr); - std::locale loc; - const std::time_put& tp = std::use_facet>(loc); - const char* fmt = "%F %T"; - tp.put(std::cout, std::cout, ' ', std::localtime(&t), fmt, fmt + strlen(fmt)); - std::cout << ": " << msg << std::endl; -} - -static const uint64_t GB = 1000 * 1000 * 1000; -static const uint64_t GiB = uint64_t(1) << 30; -static const uint64_t MB = 1000 * 1000; -static const uint64_t MiB = uint64_t(1) << 20; -static const uint64_t KB = 1000; -static const uint64_t KiB = uint64_t(1) << 10; - -double convert(size_t bytes, uint64_t unit) { - return static_cast(bytes) / unit; -} - -template -size_t vec_bytes(T const& vec) { - return vec.size() * sizeof(vec.front()) + sizeof(typename T::size_type); -} - -template -size_t pod_bytes(T const& pod) { - static_assert(std::is_pod::value); - return sizeof(pod); -} - -size_t file_size(char const* filename) { - std::ifstream is(filename, std::ios::binary | std::ios::ate); - if (!is.good()) { - throw std::runtime_error("Error in opening binary " - "file."); - } - size_t bytes = (size_t)is.tellg(); - is.close(); - return bytes; -} - -template -uint64_t words_for(uint64_t bits) { - uint64_t word_bits = sizeof(WordType) * 8; - return (bits + word_bits - 1) / word_bits; -} - -template -inline void do_not_optimize_away(T&& value) { - asm volatile("" : "+r"(value)); -} - -uint64_t maxrss_in_bytes() { - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru) == 0) { - // NOTE: ru_maxrss is in kilobytes on Linux, but not on Apple... -#ifdef __APPLE__ - return ru.ru_maxrss; -#endif - return ru.ru_maxrss * 1000; - } - return 0; -} - -template -void load_pod(std::istream& is, T& val) { - static_assert(std::is_pod::value); - is.read(reinterpret_cast(&val), sizeof(T)); -} - -template -void load_vec(std::istream& is, std::vector& vec) { - size_t n; - load_pod(is, n); - vec.resize(n); - is.read(reinterpret_cast(vec.data()), static_cast(sizeof(T) * n)); -} - -template -void save_pod(std::ostream& os, T const& val) { - static_assert(std::is_pod::value); - os.write(reinterpret_cast(&val), sizeof(T)); -} - -template -void save_vec(std::ostream& os, std::vector const& vec) { - static_assert(std::is_pod::value); - size_t n = vec.size(); - save_pod(os, n); - os.write(reinterpret_cast(vec.data()), static_cast(sizeof(T) * n)); -} - -struct json_lines { - struct property { - property(std::string n, std::string v) : name(n), value(v) {} - - std::string name; - std::string value; - }; - - void new_line() { - m_properties.push_back(std::vector()); - } - - template - void add(std::string name, T value) { - if (!m_properties.size()) { - new_line(); - } - if constexpr (std::is_same::value) { - m_properties.back().emplace_back(name, value); - } else { - m_properties.back().emplace_back(name, std::to_string(value)); - } - } - - void save_to_file(char const* filename) const { - std::ofstream out(filename); - print_to(out); - out.close(); - } - - void print_line() const { - print_line_to(m_properties.back(), std::cerr); - } - - void print() const { - print_to(std::cerr); - } - - private: - std::vector> m_properties; - - template - void print_line_to(std::vector const& properties, T& device) const { - device << "{"; - for (uint64_t i = 0; i != properties.size(); ++i) { - auto const& p = properties[i]; - device << "\"" << p.name << "\": \"" << p.value << "\""; - if (i != properties.size() - 1) { - device << ", "; - } - } - device << "}\n"; - } - - template - void print_to(T& device) const { - for (auto const& properties : m_properties) { - print_line_to(properties, device); - } - } -}; - -template -struct timer { - void start() { - m_start = ClockType::now(); - } - - void stop() { - m_stop = ClockType::now(); - auto elapsed = std::chrono::duration_cast(m_stop - m_start); - m_timings.push_back(elapsed.count()); - } - - size_t runs() const { - return m_timings.size(); - } - - void reset() { - m_timings.clear(); - } - - double min() const { - return *std::min_element(m_timings.begin(), m_timings.end()); - } - - double max() const { - return *std::max_element(m_timings.begin(), m_timings.end()); - } - - void discard_first() { - if (runs()) { - m_timings.erase(m_timings.begin()); - } - } - - void discard_min() { - if (runs() > 1) { - m_timings.erase(std::min_element(m_timings.begin(), m_timings.end())); - } - } - - void discard_max() { - if (runs() > 1) { - m_timings.erase(std::max_element(m_timings.begin(), m_timings.end())); - } - } - - double elapsed() { - return std::accumulate(m_timings.begin(), m_timings.end(), 0.0); - } - - double average() { - return elapsed() / runs(); - } - - private: - typename ClockType::time_point m_start; - typename ClockType::time_point m_stop; - std::vector m_timings; -}; - -typedef std::chrono::high_resolution_clock clock_type; -typedef std::chrono::microseconds duration_type; -typedef timer timer_type; - -unsigned get_random_seed() { - return std::chrono::system_clock::now().time_since_epoch().count(); -} - -template -struct uniform_int_rng { - uniform_int_rng(IntType from, IntType to, unsigned seed = 13) : m_rng(seed), m_distr(from, to) {} - - IntType gen() { - return m_distr(m_rng); - } - - private: - std::mt19937_64 m_rng; - std::uniform_int_distribution m_distr; -}; - -struct loader { - loader(char const* filename) : m_num_bytes_pods(0), m_num_bytes_vecs_of_pods(0), m_is(filename, std::ios::binary) { - if (!m_is.good()) { - throw std::runtime_error("Error in opening binary " - "file."); - } - } - - ~loader() { - m_is.close(); - } - - template - void visit(T& val) { - if constexpr (std::is_pod::value) { - load_pod(m_is, val); - m_num_bytes_pods += pod_bytes(val); - } else { - val.visit(*this); - } - } - - template - void visit(std::vector& vec) { - size_t n; - visit(n); - vec.resize(n); - if constexpr (std::is_pod::value) { - m_is.read(reinterpret_cast(vec.data()), static_cast(sizeof(T) * n)); - m_num_bytes_vecs_of_pods += n * sizeof(T); - } else { - for (auto& v : vec) visit(v); - } - } - - size_t bytes() { - return m_is.tellg(); - } - - size_t bytes_pods() { - return m_num_bytes_pods; - } - - size_t bytes_vecs_of_pods() { - return m_num_bytes_vecs_of_pods; - } - - private: - size_t m_num_bytes_pods; - size_t m_num_bytes_vecs_of_pods; - std::ifstream m_is; -}; - -struct saver { - saver(char const* filename) : m_os(filename, std::ios::binary) { - if (!m_os.good()) { - throw std::runtime_error("Error in opening binary " - "file."); - } - } - - ~saver() { - m_os.close(); - } - - template - void visit(T& val) { - if constexpr (std::is_pod::value) { - save_pod(m_os, val); - } else { - val.visit(*this); - } - } - - template - void visit(std::vector& vec) { - if constexpr (std::is_pod::value) { - save_vec(m_os, vec); - } else { - size_t n = vec.size(); - visit(n); - for (auto& v : vec) visit(v); - } - } - - size_t bytes() { - return m_os.tellp(); - } - - private: - std::ofstream m_os; -}; - -std::string demangle(char const* mangled_name) { - size_t len = 0; - int status = 0; - std::unique_ptr ptr(__cxxabiv1::__cxa_demangle(mangled_name, nullptr, &len, &status), - &std::free); - return ptr.get(); -} - -struct sizer { - sizer(std::string const& root_name = "") : m_root(0, 0, root_name), m_current(&m_root) {} - - struct node { - node(size_t b, size_t d, std::string const& n = "") : bytes(b), depth(d), name(n) {} - - size_t bytes; - size_t depth; - std::string name; - std::vector children; - }; - - template - void visit(T& val) { - if constexpr (std::is_pod::value) { - node n(pod_bytes(val), m_current->depth + 1, demangle(typeid(T).name())); - m_current->children.push_back(n); - m_current->bytes += n.bytes; - } else { - val.visit(*this); - } - } - - template - void visit(std::vector& vec) { - if constexpr (std::is_pod::value) { - node n(vec_bytes(vec), m_current->depth + 1, demangle(typeid(std::vector).name())); - m_current->children.push_back(n); - m_current->bytes += n.bytes; - } else { - size_t n = vec.size(); - m_current->bytes += pod_bytes(n); - node* parent = m_current; - for (auto& v : vec) { - node n(0, parent->depth + 1, demangle(typeid(T).name())); - parent->children.push_back(n); - m_current = &parent->children.back(); - visit(v); - parent->bytes += m_current->bytes; - } - m_current = parent; - } - } - - template - void print(node const& n, size_t total_bytes, Device& device) const { - auto indent = std::string(n.depth * 4, ' '); - device << indent << "'" << n.name << "' - bytes = " << n.bytes << " (" << n.bytes * 100.0 / total_bytes << "%)" - << std::endl; - for (auto const& child : n.children) { - device << indent; - print(child, total_bytes, device); - } - } - - template - void print(Device& device) const { - print(m_root, bytes(), device); - } - - size_t bytes() const { - return m_root.bytes; - } - - private: - node m_root; - node* m_current; -}; - -template -struct allocator : std::allocator { - typedef T value_type; - - allocator() : m_addr(nullptr) {} - - allocator(T* addr) : m_addr(addr) {} - - T* allocate(size_t n) { - if (m_addr == nullptr) return std::allocator::allocate(n); - return m_addr; - } - - void deallocate(T* p, size_t n) { - if (m_addr == nullptr) return std::allocator::deallocate(p, n); - } - - private: - T* m_addr; -}; - -struct contiguous_memory_allocator { - contiguous_memory_allocator() : m_begin(nullptr), m_end(nullptr), m_size(0) {} - - struct visitor { - visitor(uint8_t* begin, size_t size, char const* filename) - : m_begin(begin), m_end(begin), m_size(size), m_is(filename, std::ios::binary) { - if (!m_is.good()) { - throw std::runtime_error("Error in opening binary " - "file."); - } - } - - ~visitor() { - m_is.close(); - } - - template - void visit(T& val) { - if constexpr (std::is_pod::value) { - load_pod(m_is, val); - } else { - val.visit(*this); - } - } - - template - void visit(std::vector& vec) { - if constexpr (std::is_pod::value) { - vec = std::vector(make_allocator()); - load_vec(m_is, vec); - consume(vec.size() * sizeof(T)); - } else { - size_t n; - visit(n); - vec.resize(n); - for (auto& v : vec) visit(v); - } - } - - uint8_t* end() { - return m_end; - } - - size_t size() const { - return m_size; - } - - size_t allocated() const { - assert(m_end >= m_begin); - return m_end - m_begin; - } - - template - allocator make_allocator() { - return allocator(reinterpret_cast(m_end)); - } - - void consume(size_t num_bytes) { - if (m_end == nullptr) return; - if (allocated() + num_bytes > size()) { - throw std::runtime_error("allocation failed"); - } - m_end += num_bytes; - } - - private: - uint8_t* m_begin; - uint8_t* m_end; - size_t m_size; - std::ifstream m_is; - }; - - template - size_t allocate(T& data_structure, char const* filename) { - loader l(filename); - l.visit(data_structure); - m_size = l.bytes_vecs_of_pods(); - m_begin = reinterpret_cast(malloc(m_size)); - if (m_begin == nullptr) throw std::runtime_error("malloc failed"); - visitor v(m_begin, m_size, filename); - v.visit(data_structure); - m_end = v.end(); - return l.bytes(); - } - - ~contiguous_memory_allocator() { - free(m_begin); - } - - uint8_t* begin() { - return m_begin; - } - - uint8_t* end() { - return m_end; - } - - size_t size() const { - return m_size; - } - - private: - uint8_t* m_begin; - uint8_t* m_end; - size_t m_size; -}; - -template -size_t visit(T& data_structure, char const* filename) { - Visitor visitor(filename); - visitor.visit(data_structure); - return visitor.bytes(); -} - -template -size_t load(T& data_structure, char const* filename) { - return visit(data_structure, filename); -} - -template -size_t load_with_custom_memory_allocation(T& data_structure, char const* filename) { - return data_structure.get_allocator().allocate(data_structure, filename); -} - -template -size_t save(T& data_structure, char const* filename) { - return visit(data_structure, filename); -} - -template -size_t print_size(T& data_structure, Device& device) { - sizer visitor(demangle(typeid(T).name())); - visitor.visit(data_structure); - visitor.print(device); - return visitor.bytes(); -} - -#if defined(__CYGWIN__) || defined(_WIN32) || defined(_WIN64) -#else -struct directory { - struct file_name { - std::string name; - std::string fullpath; - std::string extension; - }; - - ~directory() { - for (int i = 0; i != items(); ++i) { - free(m_items_names[i]); - } - free(m_items_names); - } - - directory(std::string const& name) : m_name(name) { - m_n = scandir(m_name.c_str(), &m_items_names, NULL, alphasort); - if (m_n < 0) { - throw std::runtime_error("error during scandir"); - } - } - - std::string const& name() const { - return m_name; - } - - int items() const { - return m_n; - } - - struct iterator { - iterator(directory const* d, int i) : m_d(d), m_i(i) {} - - file_name operator*() { - file_name fn; - fn.name = m_d->m_items_names[m_i]->d_name; - fn.fullpath = m_d->name() + "/" + fn.name; - size_t p = fn.name.find_last_of("."); - fn.extension = fn.name.substr(p + 1); - return fn; - } - - void operator++() { - ++m_i; - } - - bool operator==(iterator const& rhs) const { - return m_i == rhs.m_i; - } - - bool operator!=(iterator const& rhs) const { - return !(*this == rhs); - } - - private: - directory const* m_d; - int m_i; - }; - - iterator begin() { - return iterator(this, 0); - } - - iterator end() { - return iterator(this, items()); - } - - private: - std::string m_name; - struct dirent** m_items_names; - int m_n; -}; -#endif - -bool create_directory(std::string const& name) { - if (mkdir(name.c_str(), 0777) != 0) { - if (errno == EEXIST) { - std::cerr << "directory already exists" << std::endl; - } - return false; - } - return true; -} - -bool remove_directory(std::string const& name) { - return rmdir(name.c_str()) == 0; -} - -} // namespace essentials \ No newline at end of file diff --git a/include/xcdat/immutable_vector.hpp b/include/xcdat/immutable_vector.hpp index 3441a4f..00ff085 100644 --- a/include/xcdat/immutable_vector.hpp +++ b/include/xcdat/immutable_vector.hpp @@ -1,14 +1,20 @@ #pragma once -#include -#include +#include +#include +#include +#include +#include +#include namespace xcdat { template class immutable_vector { private: - std::vector m_vec; + std::unique_ptr m_allocator; + std::uint64_t m_size = 0; + const T* m_data = nullptr; public: immutable_vector() = default; @@ -20,54 +26,81 @@ class immutable_vector { immutable_vector(immutable_vector&&) noexcept = default; immutable_vector& operator=(immutable_vector&&) noexcept = default; - explicit immutable_vector(std::vector&& vec) { - steal(vec); + void clear() { + m_allocator.reset(); + m_size = 0; + m_data = nullptr; } - void steal(std::vector& vec) { + template + immutable_vector(const Vector& vec) { + build(vec); + } + + template + void build(const Vector& vec) { + clear(); if (vec.size() != 0) { - m_vec = std::move(vec); - m_vec.shrink_to_fit(); - } else { - clear(); + m_allocator = std::make_unique(vec.size()); + std::copy_n(vec.data(), vec.size(), m_allocator.get()); + m_size = vec.size(); + m_data = m_allocator.get(); } } - void clear() { - *this = immutable_vector(); + std::uint64_t mmap(const char* address) { + clear(); + m_size = *reinterpret_cast(address); + m_data = reinterpret_cast(address + sizeof(std::uint64_t)); + return sizeof(std::uint64_t) + m_size * sizeof(T); + } + + void load(std::ifstream& ifs) { + clear(); + ifs.read(reinterpret_cast(&m_size), sizeof(m_size)); + if (m_size != 0) { + m_allocator = std::make_unique(m_size); + ifs.read(reinterpret_cast(m_allocator.get()), sizeof(T) * m_size); + m_data = m_allocator.get(); + } + } + + void save(std::ofstream& ofs) const { + ofs.write(reinterpret_cast(&m_size), sizeof(m_size)); + ofs.write(reinterpret_cast(m_data), sizeof(T) * m_size); + } + + inline std::uint64_t memory_in_bytes() const { + return sizeof(m_size) + sizeof(T) * m_size; } inline std::uint64_t size() const { - return m_vec.size(); + return m_size; } - inline auto begin() const { - return m_vec.begin(); + inline const T* begin() const { + return m_data; } - inline auto end() const { - return m_vec.end(); + inline const T* end() const { + return m_data + m_size; } inline auto rbegin() const { - return m_vec.rbegin(); + return std::make_reverse_iterator(end()); } inline auto rend() const { - return m_vec.rend(); + return std::make_reverse_iterator(begin()); } inline const T& operator[](std::uint64_t i) const { - return m_vec[i]; + assert(i < m_size); + return m_data[i]; } inline const T* data() const { - return m_vec.data(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_vec); + return m_data; } }; diff --git a/include/xcdat/io.hpp b/include/xcdat/io.hpp index 28d4f82..2da7aa6 100644 --- a/include/xcdat/io.hpp +++ b/include/xcdat/io.hpp @@ -19,4 +19,32 @@ namespace xcdat::io { return strs; } +template +void load_pod(std::istream& is, T& val) { + static_assert(std::is_pod::value); + is.read(reinterpret_cast(&val), sizeof(T)); +} + +template +void load_vec(std::istream& is, std::vector& vec) { + size_t n; + load_pod(is, n); + vec.resize(n); + is.read(reinterpret_cast(vec.data()), static_cast(sizeof(T) * n)); +} + +template +void save_pod(std::ostream& os, T const& val) { + static_assert(std::is_pod::value); + os.write(reinterpret_cast(&val), sizeof(T)); +} + +template +void save_vec(std::ostream& os, std::vector const& vec) { + static_assert(std::is_pod::value); + size_t n = vec.size(); + save_pod(os, n); + os.write(reinterpret_cast(vec.data()), static_cast(sizeof(T) * n)); +} + } // namespace xcdat::io diff --git a/include/xcdat/load_visitor.hpp b/include/xcdat/load_visitor.hpp new file mode 100644 index 0000000..bf6cf66 --- /dev/null +++ b/include/xcdat/load_visitor.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include +#include + +#include "exception.hpp" +#include "immutable_vector.hpp" + +namespace xcdat { + +class load_visitor { + private: + std::ifstream m_ifs; + + public: + load_visitor(std::string_view filepath) : m_ifs(filepath, std::ios::binary) { + XCDAT_THROW_IF(!m_ifs.good(), "Cannot open the input file"); + } + + virtual ~load_visitor() { + m_ifs.close(); + } + + template + void visit(immutable_vector& vec) { + vec.load(m_ifs); + } + + template + void visit(T& obj) { + if constexpr (std::is_pod_v) { + m_ifs.read(reinterpret_cast(&obj), sizeof(T)); + } else { + obj.visit(*this); + } + } + + std::uint64_t bytes() { + return m_ifs.tellg(); + } +}; + +} // namespace xcdat diff --git a/include/xcdat/mmap_visitor.hpp b/include/xcdat/mmap_visitor.hpp new file mode 100644 index 0000000..83f6262 --- /dev/null +++ b/include/xcdat/mmap_visitor.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include + +#include "immutable_vector.hpp" + +namespace xcdat { + +class mmap_visitor { + private: + const char* m_base = nullptr; + const char* m_cur = nullptr; + + public: + mmap_visitor(const char* base) : m_base(base), m_cur(base) {} + + virtual ~mmap_visitor() = default; + + template + void visit(immutable_vector& vec) { + m_cur += vec.mmap(m_cur); + } + + template + void visit(T& obj) { + if constexpr (std::is_pod_v) { + obj = *reinterpret_cast(m_cur); + m_cur += sizeof(T); + } else { + obj.visit(*this); + } + } + + std::uint64_t bytes() { + return std::distance(m_base, m_cur); + } +}; + +} // namespace xcdat diff --git a/include/xcdat/save_visitor.hpp b/include/xcdat/save_visitor.hpp new file mode 100644 index 0000000..f81270b --- /dev/null +++ b/include/xcdat/save_visitor.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include +#include + +#include "exception.hpp" +#include "immutable_vector.hpp" + +namespace xcdat { + +class save_visitor { + private: + std::ofstream m_ofs; + + public: + save_visitor(std::string_view filepath) : m_ofs(filepath, std::ios::binary) { + XCDAT_THROW_IF(!m_ofs.good(), "Cannot open the input file"); + } + + virtual ~save_visitor() { + m_ofs.close(); + } + + template + void visit(const immutable_vector& vec) { + vec.save(m_ofs); + } + + template + void visit(const T& obj) { + if constexpr (std::is_pod_v) { + m_ofs.write(reinterpret_cast(&obj), sizeof(T)); + } else { + const_cast(obj).visit(*this); + } + } + + std::uint64_t bytes() { + return m_ofs.tellp(); + } +}; + +} // namespace xcdat \ No newline at end of file diff --git a/include/xcdat/size_visitor.hpp b/include/xcdat/size_visitor.hpp new file mode 100644 index 0000000..5ddcdb8 --- /dev/null +++ b/include/xcdat/size_visitor.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + +#include "exception.hpp" +#include "immutable_vector.hpp" + +namespace xcdat { + +class size_visitor { + private: + std::uint64_t m_bytes = 0; + + public: + size_visitor() = default; + + virtual ~size_visitor() = default; + + template + void visit(const immutable_vector& vec) { + m_bytes += vec.memory_in_bytes(); + } + + template + void visit(const T& obj) { + if constexpr (std::is_pod_v) { + m_bytes += sizeof(T); + } else { + const_cast(obj).visit(*this); + } + } + + std::uint64_t bytes() { + return m_bytes; + } +}; + +} // namespace xcdat diff --git a/include/xcdat/tail_vector.hpp b/include/xcdat/tail_vector.hpp index 639b49d..fea2033 100644 --- a/include/xcdat/tail_vector.hpp +++ b/include/xcdat/tail_vector.hpp @@ -129,15 +129,7 @@ class tail_vector { tail_vector(tail_vector&&) noexcept = default; tail_vector& operator=(tail_vector&&) noexcept = default; - explicit tail_vector(builder&& b) { - m_chars.steal(b.m_chars); - m_terms.build(b.m_terms); - } - - void build(builder&& b) { - m_chars.steal(b.m_chars); - m_terms.build(b.m_terms); - } + explicit tail_vector(builder&& b) : m_chars(b.m_chars), m_terms(b.m_terms) {} inline bool bin_mode() const { return m_terms.size() != 0; diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index 9b95193..1b4e6fe 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -4,7 +4,6 @@ #include #include -#include "essentials/essentials.hpp" #include "trie_builder.hpp" namespace xcdat { @@ -23,7 +22,7 @@ namespace xcdat { template class trie { public: - using this_type = trie; + using trie_type = trie; using bc_vector_type = BcVector; static constexpr auto l1_bits = bc_vector_type::l1_bits; @@ -55,23 +54,7 @@ class trie { trie& operator=(trie&&) noexcept = default; template - static this_type build(const Strings& keys, bool bin_mode = false) { - return this_type(trie_builder(keys, l1_bits, bin_mode)); - } - - static this_type load(std::string_view filepath) { - this_type obj; - essentials::load(obj, filepath.data()); - return obj; - } - - std::uint64_t save(std::string_view filepath) const { - return essentials::save(const_cast(*this), filepath.data()); - } - - std::uint64_t memory_in_bytes() const { - return essentials::visit(const_cast(*this), ""); - } + explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {} //! Check the binary mode. inline bool bin_mode() const { @@ -155,7 +138,7 @@ class trie { */ class prefix_iterator { private: - const this_type* m_obj = nullptr; + const trie_type* m_obj = nullptr; std::string_view m_key; std::uint64_t m_id = 0; std::uint64_t m_kpos = 0; @@ -181,7 +164,7 @@ class trie { } private: - prefix_iterator(const this_type* obj, std::string_view key) : m_obj(obj), m_key(key) {} + prefix_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {} friend class trie; }; @@ -211,7 +194,7 @@ class trie { }; private: - const this_type* m_obj = nullptr; + const trie_type* m_obj = nullptr; std::string_view m_key; std::uint64_t m_id = 0; std::string m_decoded; @@ -237,7 +220,7 @@ class trie { } private: - predictive_iterator(const this_type* obj, std::string_view key) : m_obj(obj), m_key(key) {} + predictive_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {} friend class trie; }; diff --git a/include/xcdat/trie_builder.hpp b/include/xcdat/trie_builder.hpp index 04cac2a..1dfe724 100644 --- a/include/xcdat/trie_builder.hpp +++ b/include/xcdat/trie_builder.hpp @@ -81,7 +81,7 @@ class trie_builder { m_heads[taboo_npos >> m_l1_bits] = m_units[taboo_npos].base; // Build the code table - m_table.build(keys); + m_table = code_table(keys); m_bin_mode |= m_table.has_null(); // Build the BC units diff --git a/sample/sample.cpp b/sample/sample.cpp index 84d194c..34ba332 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -3,9 +3,10 @@ #include -using xcdat_trie = xcdat::trie_8_type; +using trie_type = xcdat::trie_8_type; int main() { + // Input keys std::vector keys = { "AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", @@ -19,12 +20,12 @@ int main() { // Build and save the trie index { - const auto trie = xcdat_trie::build(keys); - trie.save(index_filename); + const trie_type trie(keys); + xcdat::save(trie, index_filename); } // Load the trie index - const auto trie = xcdat_trie::load(index_filename); + const auto trie = xcdat::load(index_filename); std::cout << "Basic operations" << std::endl; { diff --git a/tests/test_bit_vector.cpp b/tests/test_bit_vector.cpp index 177ae9a..06c2967 100644 --- a/tests/test_bit_vector.cpp +++ b/tests/test_bit_vector.cpp @@ -35,7 +35,7 @@ void test_rank_select(const std::vector& bits) { for (std::uint64_t i = 0; i < bits.size(); i++) { bvb.set_bit(i, bits[i]); } - bv.build(bvb, true, true); + bv = xcdat::bit_vector(bvb, true, true); } REQUIRE_EQ(bv.size(), bits.size()); diff --git a/tests/test_trie.cpp b/tests/test_trie.cpp index 79cdc1e..50900df 100644 --- a/tests/test_trie.cpp +++ b/tests/test_trie.cpp @@ -124,7 +124,7 @@ TEST_CASE("Test trie_type (tiny)") { "Google_Pixel", "iPad_mini", "iPadOS", "iPod", "ThinkPad", }; - auto trie = trie_type::build(keys); + trie_type trie(keys); REQUIRE_FALSE(trie.bin_mode()); test_basic_operations(trie, keys, others); @@ -163,7 +163,7 @@ TEST_CASE("Test trie_type (real)") { auto keys = xcdat::test::to_unique_vec(xcdat::io::load_strings("keys.txt")); auto others = xcdat::test::extract_keys(keys); - auto trie = trie_type::build(keys); + trie_type trie(keys); REQUIRE_FALSE(trie.bin_mode()); test_basic_operations(trie, keys, others); @@ -176,7 +176,7 @@ TEST_CASE("Test trie_type (random 10K, A--B)") { auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B')); auto others = xcdat::test::extract_keys(keys); - auto trie = trie_type::build(keys); + trie_type trie(keys); REQUIRE_FALSE(trie.bin_mode()); test_basic_operations(trie, keys, others); @@ -189,7 +189,7 @@ TEST_CASE("Test trie_type (random 10K, A--Z)") { auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z')); auto others = xcdat::test::extract_keys(keys); - auto trie = trie_type::build(keys); + trie_type trie(keys); REQUIRE_FALSE(trie.bin_mode()); test_basic_operations(trie, keys, others); @@ -202,7 +202,7 @@ TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") { auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX)); auto others = xcdat::test::extract_keys(keys); - auto trie = trie_type::build(keys); + trie_type trie(keys); REQUIRE(trie.bin_mode()); test_basic_operations(trie, keys, others); @@ -210,3 +210,44 @@ TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") { test_predictive_search(trie, keys, others); test_enumerate(trie, keys); } + +#ifdef NDEBUG +TEST_CASE("Test trie_type (random 100K, A--B)") { + auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'B')); + auto others = xcdat::test::extract_keys(keys); + + trie_type trie(keys); + REQUIRE_FALSE(trie.bin_mode()); + + test_basic_operations(trie, keys, others); + test_prefix_search(trie, keys, others); + test_predictive_search(trie, keys, others); + test_enumerate(trie, keys); +} + +TEST_CASE("Test trie_type (random 100K, A--Z)") { + auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'Z')); + auto others = xcdat::test::extract_keys(keys); + + trie_type trie(keys); + REQUIRE_FALSE(trie.bin_mode()); + + test_basic_operations(trie, keys, others); + test_prefix_search(trie, keys, others); + test_predictive_search(trie, keys, others); + test_enumerate(trie, keys); +} + +TEST_CASE("Test trie_type (random 100K, 0x00--0xFF)") { + auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, INT8_MIN, INT8_MAX)); + auto others = xcdat::test::extract_keys(keys); + + trie_type trie(keys); + REQUIRE(trie.bin_mode()); + + test_basic_operations(trie, keys, others); + test_prefix_search(trie, keys, others); + test_predictive_search(trie, keys, others); + test_enumerate(trie, keys); +} +#endif \ No newline at end of file diff --git a/tools/xcdat_build.cpp b/tools/xcdat_build.cpp index a03d5d3..630d275 100644 --- a/tools/xcdat_build.cpp +++ b/tools/xcdat_build.cpp @@ -1,3 +1,5 @@ +#include + #include #include "cmd_line_parser/parser.hpp" @@ -28,22 +30,21 @@ int build(const cmd_line_parser::parser& p) { keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); } - essentials::timer timer; - timer.start(); - const auto trie = Trie::build(keys); - timer.stop(); + const auto start_tp = std::chrono::high_resolution_clock::now(); + const Trie trie(keys); + const auto stop_tp = std::chrono::high_resolution_clock::now(); - const double construction_time_in_sec = timer.average(); - const double memory_in_bytes = trie.memory_in_bytes(); + const double time_in_sec = std::chrono::duration_cast(stop_tp - start_tp).count(); + const double memory_in_bytes = xcdat::memory_in_bytes(trie); - tfm::printfln("construction_time_in_sec: %g", construction_time_in_sec); + tfm::printfln("time_in_sec: %g", time_in_sec); tfm::printfln("memory_in_bytes: %d", memory_in_bytes); - tfm::printfln("memory_in_MiB: %g", memory_in_bytes / essentials::MiB); + tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); tfm::printfln("number_of_keys: %d", trie.num_keys()); tfm::printfln("alphabet_size: %d", trie.alphabet_size()); tfm::printfln("max_length: %d", trie.max_length()); - trie.save(output_idx); + xcdat::save(trie, output_idx); return 0; } diff --git a/tools/xcdat_decode.cpp b/tools/xcdat_decode.cpp index aa9cd47..7536394 100644 --- a/tools/xcdat_decode.cpp +++ b/tools/xcdat_decode.cpp @@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { template int decode(const cmd_line_parser::parser& p) { const auto input_idx = p.get("input_idx"); - const auto trie = Trie::load(input_idx); + const auto trie = xcdat::load(input_idx); for (std::uint64_t id; std::cin >> id;) { const auto dec = trie.decode(id); diff --git a/tools/xcdat_enumerate.cpp b/tools/xcdat_enumerate.cpp index 3412bbe..cba27ed 100644 --- a/tools/xcdat_enumerate.cpp +++ b/tools/xcdat_enumerate.cpp @@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { template int enumerate(const cmd_line_parser::parser& p) { const auto input_idx = p.get("input_idx"); - const auto trie = Trie::load(input_idx); + const auto trie = xcdat::load(input_idx); trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); }); diff --git a/tools/xcdat_lookup.cpp b/tools/xcdat_lookup.cpp index 064db99..58e23ba 100644 --- a/tools/xcdat_lookup.cpp +++ b/tools/xcdat_lookup.cpp @@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { template int lookup(const cmd_line_parser::parser& p) { const auto input_idx = p.get("input_idx"); - const auto trie = Trie::load(input_idx); + const auto trie = xcdat::load(input_idx); for (std::string str; std::getline(std::cin, str);) { const auto id = trie.lookup(str); diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp index 9f10249..42eb40a 100644 --- a/tools/xcdat_predictive_search.cpp +++ b/tools/xcdat_predictive_search.cpp @@ -16,13 +16,12 @@ int predictive_search(const cmd_line_parser::parser& p) { const auto input_idx = p.get("input_idx"); const auto max_num_results = p.get("max_num_results", 10); - const auto trie = Trie::load(input_idx); + const auto trie = xcdat::load(input_idx); struct result_type { std::uint64_t id; std::string str; }; - std::vector results; results.reserve(1ULL << 10); diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp index f8d07bf..e1ff481 100644 --- a/tools/xcdat_prefix_search.cpp +++ b/tools/xcdat_prefix_search.cpp @@ -13,7 +13,8 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { template int prefix_search(const cmd_line_parser::parser& p) { const auto input_idx = p.get("input_idx"); - const auto trie = Trie::load(input_idx); + + const auto trie = xcdat::load(input_idx); struct result_type { std::uint64_t id;