fix the interface

This commit is contained in:
Shunsuke Kanda 2021-06-29 09:06:40 +09:00
parent 4ea523bc4f
commit e216688f2f
26 changed files with 397 additions and 824 deletions

View file

@ -34,6 +34,7 @@ message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")
include_directories(include)
add_subdirectory(sample)
add_subdirectory(tools)
enable_testing()

View file

@ -6,9 +6,44 @@
#include "xcdat/io.hpp"
#include "xcdat/trie.hpp"
#include "xcdat/load_visitor.hpp"
#include "xcdat/mmap_visitor.hpp"
#include "xcdat/save_visitor.hpp"
#include "xcdat/size_visitor.hpp"
namespace xcdat {
using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>;
template <class Trie>
static Trie mmap(const char* address) {
Trie idx;
mmap_visitor visitor(address);
visitor.visit(idx);
return idx;
}
template <class Trie>
static Trie load(std::string_view filepath) {
Trie idx;
load_visitor visitor(filepath);
visitor.visit(idx);
return idx;
}
template <class Trie>
static std::uint64_t save(const Trie& idx, std::string_view filepath) {
save_visitor visitor(filepath);
visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes();
}
template <class Trie>
static std::uint64_t memory_in_bytes(const Trie& idx) {
size_visitor visitor;
visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes();
}
} // namespace xcdat

View file

@ -11,6 +11,7 @@ class bc_vector_7 {
public:
static constexpr std::uint32_t l1_bits = 7;
static constexpr std::uint32_t max_levels = 4;
static constexpr std::uint64_t block_size_l1 = 1ULL << 7;
static constexpr std::uint64_t block_size_l2 = 1ULL << 15;
static constexpr std::uint64_t block_size_l3 = 1ULL << 31;
@ -37,11 +38,6 @@ class bc_vector_7 {
template <class BcUnits>
explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) {
build(bc_units, std::move(leaves));
}
template <class BcUnits>
void build(const BcUnits& bc_units, bit_vector::builder&& leaves) {
std::vector<std::uint8_t> ints_l1;
std::vector<std::uint16_t> ints_l2;
std::vector<std::uint32_t> ints_l3;
@ -110,15 +106,15 @@ class bc_vector_7 {
}
// release
m_ints_l1.steal(ints_l1);
m_ints_l2.steal(ints_l2);
m_ints_l3.steal(ints_l3);
m_ints_l4.steal(ints_l4);
m_ints_l1.build(ints_l1);
m_ints_l2.build(ints_l2);
m_ints_l3.build(ints_l3);
m_ints_l4.build(ints_l4);
for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
m_ranks[j].steal(ranks[j]);
m_ranks[j].build(ranks[j]);
}
m_links.build(links);
m_leaves.build(leaves, true, false);
m_links = compact_vector(links);
m_leaves = bit_vector(leaves, true, false);
}
inline std::uint64_t base(std::uint64_t i) const {

View file

@ -32,11 +32,6 @@ class bc_vector_8 {
template <class BcUnits>
explicit bc_vector_8(const BcUnits& bc_units, bit_vector::builder&& leaves) {
build(bc_units, std::move(leaves));
}
template <class BcUnits>
void build(const BcUnits& bc_units, bit_vector::builder&& leaves) {
std::array<std::vector<std::uint8_t>, max_levels> bytes;
std::array<bit_vector::builder, max_levels - 1> next_flags;
std::vector<std::uint64_t> links;
@ -81,13 +76,13 @@ class bc_vector_8 {
}
// release
for (uint8_t i = 0; i < m_num_levels; ++i) {
m_bytes[i].steal(bytes[i]);
m_nexts[i].build(next_flags[i], true, false);
for (std::uint32_t i = 0; i < m_num_levels; ++i) {
m_bytes[i].build(bytes[i]);
m_nexts[i] = bit_vector(next_flags[i], true, false);
}
m_bytes[m_num_levels].steal(bytes[m_num_levels]);
m_links.build(links);
m_leaves.build(leaves, true, false);
m_bytes[m_num_levels].build(bytes[m_num_levels]);
m_links = compact_vector(links);
m_leaves = bit_vector(leaves, true, false);
}
inline std::uint64_t base(std::uint64_t i) const {

View file

@ -1,17 +1,13 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <numeric>
#include "essentials/essentials.hpp"
#include <vector>
#include "bit_tools.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! Rank9 implementation
class bit_vector {
public:
class builder {
@ -56,12 +52,12 @@ class bit_vector {
}
inline void resize(std::uint64_t size) {
m_bits.resize(essentials::words_for(size), 0ULL);
m_bits.resize(words_for(size), 0ULL);
m_size = size;
}
inline void reserve(std::uint64_t capacity) {
m_bits.reserve(essentials::words_for(capacity));
m_bits.reserve(words_for(capacity));
}
inline std::uint64_t size() const {
@ -92,17 +88,10 @@ class bit_vector {
bit_vector& operator=(bit_vector&&) noexcept = default;
explicit bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) {
build(b, enable_rank, enable_select);
}
void build(builder& b, bool enable_rank = false, bool enable_select = false) {
m_bits.steal(b.m_bits);
m_bits.build(b.m_bits);
m_size = b.m_size;
m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL,
[](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); });
m_rank_hints.clear();
m_select_hints.clear();
if (enable_rank) {
build_rank_hints();
}
@ -172,6 +161,10 @@ class bit_vector {
return {x / N, x % N};
}
//! Number of 64-bit words needed to hold `nbits` bits (rounded up).
static std::uint64_t words_for(std::uint64_t nbits) {
    constexpr std::uint64_t bits_per_word = 64;
    return (nbits + (bits_per_word - 1)) / bits_per_word;
}
inline std::uint64_t num_blocks() const {
return m_rank_hints.size() / 2 - 1;
}
@ -258,7 +251,7 @@ class bit_vector {
}
// Release
m_rank_hints.steal(rank_hints);
m_rank_hints.build(rank_hints);
}
void build_select_hints() {
@ -271,7 +264,7 @@ class bit_vector {
}
}
select_hints.push_back(num_blocks());
m_select_hints.steal(select_hints);
m_select_hints.build(select_hints);
}
};

View file

@ -29,12 +29,7 @@ class code_table {
code_table& operator=(code_table&&) noexcept = default;
template <class Strings>
explicit code_table(const Strings& keys) {
build(keys);
}
template <class Strings>
void build(const Strings& keys) {
code_table(const Strings& keys) {
std::array<counter_type, 256> counter;
for (std::uint32_t ch = 0; ch < 256; ++ch) {
counter[ch] = {static_cast<std::uint8_t>(ch), 0};
@ -55,7 +50,7 @@ class code_table {
alphabet.push_back(cf.ch);
}
}
m_alphabet.steal(alphabet);
m_alphabet.build(alphabet);
}
std::sort(counter.begin(), counter.end(),

View file

@ -1,14 +1,11 @@
#pragma once
#include "essentials/essentials.hpp"
#include "bit_tools.hpp"
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! A compressed integer vector.
class compact_vector {
private:
std::uint64_t m_size = 0;
@ -27,21 +24,14 @@ class compact_vector {
compact_vector& operator=(compact_vector&&) noexcept = default;
template <class Vec>
explicit compact_vector(const Vec& vec) {
build(vec);
}
template <class Vec>
void build(const Vec& vec) {
compact_vector(const Vec& vec) {
XCDAT_THROW_IF(vec.size() == 0, "The input vector is empty.");
const std::uint64_t maxv = *std::max_element(vec.begin(), vec.end());
m_size = vec.size();
m_bits = needed_bits(maxv);
m_bits = needed_bits(*std::max_element(vec.begin(), vec.end()));
m_mask = (1ULL << m_bits) - 1;
std::vector<std::uint64_t> chunks(essentials::words_for(m_size * m_bits));
std::vector<std::uint64_t> chunks(words_for(m_size * m_bits));
for (std::uint64_t i = 0; i < m_size; i++) {
const auto [quo, mod] = decompose(i * m_bits);
@ -53,7 +43,7 @@ class compact_vector {
chunks[quo + 1] |= (vec[i] & m_mask) >> diff;
}
}
m_chunks.steal(chunks);
m_chunks.build(chunks);
}
inline std::uint64_t operator[](std::uint64_t i) const {
@ -90,6 +80,10 @@ class compact_vector {
//! Split a bit position `x` into (index of the 64-bit word, offset inside that word).
static std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
    const std::uint64_t word_index = x / 64;
    const std::uint64_t bit_offset = x % 64;
    return {word_index, bit_offset};
}
//! Number of 64-bit words needed to hold `nbits` bits (rounded up).
static std::uint64_t words_for(std::uint64_t nbits) {
    constexpr std::uint64_t word_width = 64;
    const std::uint64_t padded = nbits + (word_width - 1);
    return padded / word_width;
}
};
} // namespace xcdat

View file

@ -1,679 +0,0 @@
#pragma once
#include <dirent.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <locale>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>
#ifdef __GNUG__
#include <cxxabi.h> // for name demangling
#endif
namespace essentials {
// Prints `msg` to std::cout, prefixed by the current local time formatted
// with "%F %T" and followed by a newline (flushed).
// Declared inline because this header defines it: without inline, including
// the header from two translation units produces duplicate definitions.
inline void logger(std::string const& msg) {
    time_t now = std::time(nullptr);
    std::locale loc;
    const std::time_put<char>& facet = std::use_facet<std::time_put<char>>(loc);
    const char* fmt = "%F %T";
    facet.put(std::cout, std::cout, ' ', std::localtime(&now), fmt, fmt + std::strlen(fmt));
    std::cout << ": " << msg << std::endl;
}
// Byte-unit divisors. GB/MB/KB are powers of 1000; GiB/MiB/KiB are powers of 1024.
static const uint64_t GB = 1000 * 1000 * 1000;
static const uint64_t GiB = uint64_t(1) << 30;
static const uint64_t MB = 1000 * 1000;
static const uint64_t MiB = uint64_t(1) << 20;
static const uint64_t KB = 1000;
static const uint64_t KiB = uint64_t(1) << 10;
// Expresses `bytes` in multiples of `unit` (e.g. one of the unit constants of
// this header) as a floating-point value.
// Declared inline so the header can be included from several translation units.
inline double convert(size_t bytes, uint64_t unit) {
    return static_cast<double>(bytes) / static_cast<double>(unit);
}
// Size of a vector-like container as serialized by this header: the element
// payload plus one length field of the container's size_type.
template <typename T>
size_t vec_bytes(T const& vec) {
    size_t const payload = vec.size() * sizeof(vec.front());  // sizeof does not evaluate front()
    size_t const length_field = sizeof(typename T::size_type);
    return payload + length_field;
}
// Size in bytes of a POD value; rejects non-POD types at compile time.
template <typename T>
size_t pod_bytes(T const& pod) {
    static_assert(std::is_pod<T>::value);
    (void)pod;  // only the type matters
    return sizeof(T);
}
// Returns the size in bytes of the file `filename`.
// Throws std::runtime_error when the file cannot be opened or its end
// position cannot be determined.
// Declared inline so the header can be included from several translation units.
inline size_t file_size(char const* filename) {
    // std::ios::ate positions the stream at the end, so tellg() is the size.
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    if (!is.good()) {
        throw std::runtime_error("Error in opening binary "
                                 "file.");
    }
    std::streamoff const end_pos = is.tellg();
    if (end_pos < 0) {
        // tellg() reports failure as -1; do not let it wrap to a huge size.
        throw std::runtime_error("Error in reading the size of binary file.");
    }
    return static_cast<size_t>(end_pos);
}
// Number of WordType-sized words needed to store `bits` bits (rounded up).
template <typename WordType = uint64_t>
uint64_t words_for(uint64_t bits) {
    uint64_t const bits_per_word = sizeof(WordType) * 8;
    uint64_t const padded = bits + (bits_per_word - 1);
    return padded / bits_per_word;
}
// Passes `value` through an empty inline-asm statement as a read-write
// register operand ("+r"), so the compiler has to treat it as used and
// possibly modified at this point.
// NOTE(review): the "+r" constraint presumably restricts this to values that
// fit in a register -- confirm before using it with larger types.
template <typename T>
inline void do_not_optimize_away(T&& value) {
asm volatile("" : "+r"(value));
}
// Returns the peak resident set size of this process in bytes, or 0 when
// getrusage() fails.
// Declared inline so the header can be included from several translation units.
inline uint64_t maxrss_in_bytes() {
    struct rusage ru;
    if (getrusage(RUSAGE_SELF, &ru) != 0) {
        return 0;
    }
#ifdef __APPLE__
    // macOS already reports ru_maxrss in bytes.
    return static_cast<uint64_t>(ru.ru_maxrss);
#else
    // Linux reports ru_maxrss in kilobytes of 1024 bytes each.
    return static_cast<uint64_t>(ru.ru_maxrss) * 1024;
#endif
}
// Reads sizeof(T) raw bytes from `is` straight into `val`; T must be POD.
template <typename T>
void load_pod(std::istream& is, T& val) {
    static_assert(std::is_pod<T>::value);
    char* const raw = reinterpret_cast<char*>(&val);
    is.read(raw, sizeof(T));
}
// Reads a vector written as a size_t element count followed by the raw
// element bytes. T must be POD, matching the check in save_vec.
// If the count cannot be read, `vec` is left empty instead of being resized
// to an indeterminate length.
template <typename T, typename Allocator>
void load_vec(std::istream& is, std::vector<T, Allocator>& vec) {
    static_assert(std::is_pod<T>::value);
    size_t n = 0;
    is.read(reinterpret_cast<char*>(&n), sizeof(n));
    if (!is) {
        vec.clear();
        return;
    }
    vec.resize(n);
    is.read(reinterpret_cast<char*>(vec.data()), static_cast<std::streamsize>(sizeof(T) * n));
}
// Writes the sizeof(T) raw bytes of `val` to `os`; T must be POD.
template <typename T>
void save_pod(std::ostream& os, T const& val) {
    static_assert(std::is_pod<T>::value);
    char const* const raw = reinterpret_cast<char const*>(&val);
    os.write(raw, sizeof(T));
}
// Writes `vec` as a size_t element count (via save_pod) followed by the raw
// element bytes; T must be POD.
template <typename T, typename Allocator>
void save_vec(std::ostream& os, std::vector<T, Allocator> const& vec) {
    static_assert(std::is_pod<T>::value);
    size_t const count = vec.size();
    save_pod(os, count);
    std::streamsize const payload = static_cast<std::streamsize>(sizeof(T) * count);
    os.write(reinterpret_cast<char const*>(vec.data()), payload);
}
// Collects rows of name/value properties and prints every row as one
// line of the form {"name": "value", ...}. Values are always emitted as
// quoted strings; no escaping is applied to names or values.
struct json_lines {
    struct property {
        property(std::string n, std::string v) : name(n), value(v) {}
        std::string name;
        std::string value;
    };

    // Starts a new, empty row; later add() calls append to it.
    void new_line() {
        m_properties.push_back(std::vector<property>());
    }

    // Appends a property to the current row, creating the first row on demand.
    // Values convertible to std::string (string literals, std::string) are
    // stored as they are; everything else goes through std::to_string.
    template <typename T>
    void add(std::string name, T value) {
        if (m_properties.empty()) {
            new_line();
        }
        if constexpr (std::is_convertible<T, std::string>::value) {
            m_properties.back().emplace_back(name, value);
        } else {
            m_properties.back().emplace_back(name, std::to_string(value));
        }
    }

    // Writes all rows to the file `filename`.
    void save_to_file(char const* filename) const {
        std::ofstream out(filename);
        print_to(out);
        out.close();
    }

    // Prints the most recent row to std::cerr; does nothing when there is no row yet.
    void print_line() const {
        if (m_properties.empty()) {
            return;  // back() on an empty vector would be undefined behavior
        }
        print_line_to(m_properties.back(), std::cerr);
    }

    // Prints all rows to std::cerr.
    void print() const {
        print_to(std::cerr);
    }

private:
    std::vector<std::vector<property>> m_properties;

    // Emits one row as {"n1": "v1", "n2": "v2"} followed by a newline.
    template <typename T>
    void print_line_to(std::vector<property> const& properties, T& device) const {
        device << "{";
        for (uint64_t i = 0; i != properties.size(); ++i) {
            auto const& p = properties[i];
            device << "\"" << p.name << "\": \"" << p.value << "\"";
            if (i != properties.size() - 1) {
                device << ", ";
            }
        }
        device << "}\n";
    }

    // Emits every row in insertion order.
    template <typename T>
    void print_to(T& device) const {
        for (auto const& properties : m_properties) {
            print_line_to(properties, device);
        }
    }
};
// Stopwatch that records one timing per start()/stop() pair.
// Each timing is the tick count of the elapsed time converted to DurationType.
// The statistics functions return 0.0 when no timing has been recorded.
template <typename ClockType, typename DurationType>
struct timer {
    // Marks the beginning of a run.
    void start() {
        m_start = ClockType::now();
    }

    // Marks the end of a run and stores its duration.
    void stop() {
        m_stop = ClockType::now();
        auto const delta = std::chrono::duration_cast<DurationType>(m_stop - m_start);
        m_timings.push_back(delta.count());
    }

    // Number of recorded runs.
    size_t runs() const {
        return m_timings.size();
    }

    // Forgets all recorded runs.
    void reset() {
        m_timings.clear();
    }

    // Shortest recorded run.
    double min() const {
        if (m_timings.empty()) return 0.0;  // avoid dereferencing end()
        return *std::min_element(m_timings.begin(), m_timings.end());
    }

    // Longest recorded run.
    double max() const {
        if (m_timings.empty()) return 0.0;  // avoid dereferencing end()
        return *std::max_element(m_timings.begin(), m_timings.end());
    }

    // Drops the first recorded run, if any.
    void discard_first() {
        if (runs()) {
            m_timings.erase(m_timings.begin());
        }
    }

    // Drops the shortest run, but only while more than one run is recorded.
    void discard_min() {
        if (runs() > 1) {
            m_timings.erase(std::min_element(m_timings.begin(), m_timings.end()));
        }
    }

    // Drops the longest run, but only while more than one run is recorded.
    void discard_max() {
        if (runs() > 1) {
            m_timings.erase(std::max_element(m_timings.begin(), m_timings.end()));
        }
    }

    // Sum of all recorded runs.
    double elapsed() const {
        return std::accumulate(m_timings.begin(), m_timings.end(), 0.0);
    }

    // Mean of all recorded runs.
    double average() const {
        if (m_timings.empty()) return 0.0;  // avoid 0/0
        return elapsed() / runs();
    }

private:
    typename ClockType::time_point m_start;
    typename ClockType::time_point m_stop;
    std::vector<double> m_timings;
};
// Default timer configuration: high-resolution clock, timings in microseconds.
using clock_type = std::chrono::high_resolution_clock;
using duration_type = std::chrono::microseconds;
using timer_type = timer<clock_type, duration_type>;
// Derives a seed from the current system-clock time: the tick count since the
// clock's epoch, truncated to unsigned.
// Declared inline so the header can be included from several translation units.
inline unsigned get_random_seed() {
    auto const ticks = std::chrono::system_clock::now().time_since_epoch().count();
    return static_cast<unsigned>(ticks);  // truncation is intended
}
// Draws integers uniformly from the closed range [from, to] using a
// std::mt19937_64 engine seeded with `seed` (13 unless specified).
template <typename IntType>
struct uniform_int_rng {
    uniform_int_rng(IntType from, IntType to, unsigned seed = 13) : m_rng(seed), m_distr(from, to) {}

    // Returns the next value of the sequence.
    IntType gen() {
        IntType const drawn = m_distr(m_rng);
        return drawn;
    }

private:
    std::mt19937_64 m_rng;
    std::uniform_int_distribution<IntType> m_distr;
};
// Visitor that fills a data structure from a binary file and keeps track of
// how many bytes were read as PODs and as vectors of PODs.
struct loader {
    // Opens `filename` for binary reading; throws std::runtime_error on failure.
    loader(char const* filename) : m_num_bytes_pods(0), m_num_bytes_vecs_of_pods(0), m_is(filename, std::ios::binary) {
        if (!m_is.good()) {
            throw std::runtime_error("Error in opening binary "
                                     "file.");
        }
    }

    ~loader() {
        m_is.close();
    }

    // PODs are read in place; any other type is asked to visit its own members.
    template <typename T>
    void visit(T& val) {
        if constexpr (std::is_pod<T>::value) {
            load_pod(m_is, val);
            m_num_bytes_pods += pod_bytes(val);
        } else {
            val.visit(*this);
        }
    }

    // Vectors are stored as an element count followed by the elements.
    // Throws std::runtime_error when the count cannot be read, so the vector
    // is never resized to an indeterminate length.
    template <typename T, typename Allocator>
    void visit(std::vector<T, Allocator>& vec) {
        size_t n = 0;
        visit(n);
        if (!m_is) {
            throw std::runtime_error("Error in reading binary file.");
        }
        vec.resize(n);
        if constexpr (std::is_pod<T>::value) {
            m_is.read(reinterpret_cast<char*>(vec.data()), static_cast<std::streamsize>(sizeof(T) * n));
            m_num_bytes_vecs_of_pods += n * sizeof(T);
        } else {
            for (auto& v : vec) visit(v);
        }
    }

    // Current read position of the underlying stream.
    size_t bytes() {
        return m_is.tellg();
    }

    // Bytes read so far through the POD path (includes vector length fields).
    size_t bytes_pods() {
        return m_num_bytes_pods;
    }

    // Bytes read so far as payload of vectors of PODs.
    size_t bytes_vecs_of_pods() {
        return m_num_bytes_vecs_of_pods;
    }

private:
    size_t m_num_bytes_pods;
    size_t m_num_bytes_vecs_of_pods;
    std::ifstream m_is;
};
// Visitor that writes a data structure to a binary file.
struct saver {
    // Opens `filename` for binary writing; throws std::runtime_error on failure.
    saver(char const* filename) : m_os(filename, std::ios::binary) {
        if (m_os.good()) {
            return;
        }
        throw std::runtime_error("Error in opening binary "
                                 "file.");
    }

    ~saver() {
        m_os.close();
    }

    // Non-POD types are asked to visit their own members; PODs are written raw.
    template <typename T>
    void visit(T& val) {
        if constexpr (!std::is_pod<T>::value) {
            val.visit(*this);
        } else {
            save_pod(m_os, val);
        }
    }

    // Vectors of PODs go through save_vec; other vectors are written as an
    // element count followed by each visited element.
    template <typename T, typename Allocator>
    void visit(std::vector<T, Allocator>& vec) {
        if constexpr (!std::is_pod<T>::value) {
            size_t count = vec.size();
            visit(count);
            for (auto& element : vec) {
                visit(element);
            }
        } else {
            save_vec(m_os, vec);
        }
    }

    // Current write position of the underlying stream.
    size_t bytes() {
        return m_os.tellp();
    }

private:
    std::ofstream m_os;
};
// Returns the human-readable form of a mangled C++ symbol or type name.
// When demangling is unavailable (non-GNU toolchains) or fails, the input is
// returned unchanged; a null input yields an empty string.
// Declared inline so the header can be included from several translation units.
inline std::string demangle(char const* mangled_name) {
    if (mangled_name == nullptr) {
        return std::string();
    }
#ifdef __GNUG__
    size_t len = 0;
    int status = 0;
    // __cxa_demangle returns a malloc'ed buffer, or nullptr on failure.
    std::unique_ptr<char, decltype(&std::free)> ptr(__cxxabiv1::__cxa_demangle(mangled_name, nullptr, &len, &status),
                                                    &std::free);
    if (status == 0 && ptr != nullptr) {
        return ptr.get();
    }
#endif
    // Building a std::string from a null result would be undefined behavior.
    return mangled_name;
}
// Visitor that builds a tree of byte counts for a data structure and can
// print it. Every visited POD or vector of PODs becomes a child node of the
// node currently being filled; its bytes are added to that node.
struct sizer {
// The root node starts with 0 bytes at depth 0 and carries `root_name`.
sizer(std::string const& root_name = "") : m_root(0, 0, root_name), m_current(&m_root) {}
// One entry of the size tree.
struct node {
node(size_t b, size_t d, std::string const& n = "") : bytes(b), depth(d), name(n) {}
size_t bytes;
size_t depth;
std::string name;
std::vector<node> children;
};
// PODs add a leaf named after the demangled type; other types are asked to
// visit their own members, which accumulate into the current node.
template <typename T>
void visit(T& val) {
if constexpr (std::is_pod<T>::value) {
node n(pod_bytes(val), m_current->depth + 1, demangle(typeid(T).name()));
m_current->children.push_back(n);
m_current->bytes += n.bytes;
} else {
val.visit(*this);
}
}
// Vectors of PODs add one leaf sized by vec_bytes. For other vectors the
// length field is charged to the current node and each element gets its own
// child node whose total is then added to the parent.
template <typename T, typename Allocator>
void visit(std::vector<T, Allocator>& vec) {
if constexpr (std::is_pod<T>::value) {
node n(vec_bytes(vec), m_current->depth + 1, demangle(typeid(std::vector<T>).name()));
m_current->children.push_back(n);
m_current->bytes += n.bytes;
} else {
size_t n = vec.size();
m_current->bytes += pod_bytes(n);
node* parent = m_current;
for (auto& v : vec) {
// This `n` shadows the element count declared above.
node n(0, parent->depth + 1, demangle(typeid(T).name()));
parent->children.push_back(n);
// Point at the freshly appended child; it is re-taken on every iteration,
// after the push_back that may reallocate parent->children.
m_current = &parent->children.back();
visit(v);
parent->bytes += m_current->bytes;
}
m_current = parent;
}
}
// Prints `n` and, recursively, its children; percentages are relative to
// `total_bytes`. Children are prefixed with the parent's indent in addition
// to their own.
template <typename Device>
void print(node const& n, size_t total_bytes, Device& device) const {
auto indent = std::string(n.depth * 4, ' ');
device << indent << "'" << n.name << "' - bytes = " << n.bytes << " (" << n.bytes * 100.0 / total_bytes << "%)"
<< std::endl;
for (auto const& child : n.children) {
device << indent;
print(child, total_bytes, device);
}
}
// Prints the whole tree starting at the root.
template <typename Device>
void print(Device& device) const {
print(m_root, bytes(), device);
}
// Total bytes accumulated in the root node.
size_t bytes() const {
return m_root.bytes;
}
private:
node m_root;
// Node currently receiving bytes.
// NOTE(review): a copied sizer would keep pointing into the source object's
// tree through this pointer -- confirm that sizers are never copied.
node* m_current;
};
// Allocator that either behaves like std::allocator<T> (no address given) or
// always hands out one fixed, externally owned address.
template <typename T>
struct allocator : std::allocator<T> {
    using value_type = T;

    allocator() : m_addr(nullptr) {}
    allocator(T* addr) : m_addr(addr) {}

    // Returns the fixed address when one was supplied, regardless of `n`.
    T* allocate(size_t n) {
        return m_addr != nullptr ? m_addr : std::allocator<T>::allocate(n);
    }

    // Releases only memory obtained from std::allocator; a fixed address is
    // left alone.
    void deallocate(T* p, size_t n) {
        if (m_addr != nullptr) {
            return;
        }
        std::allocator<T>::deallocate(p, n);
    }

private:
    T* m_addr;
};
// Loads a data structure so that the payload of all its vectors of PODs lives
// in one malloc'ed buffer owned by this object. See allocate() for the
// two-pass procedure.
// NOTE(review): the destructor frees m_begin and no copy operations are
// declared, so copying this object looks like a double free -- confirm.
struct contiguous_memory_allocator {
contiguous_memory_allocator() : m_begin(nullptr), m_end(nullptr), m_size(0) {}
// Second-pass visitor: re-reads the file and places every vector of PODs at
// the current end of the buffer [begin, begin + size).
struct visitor {
// Throws std::runtime_error when `filename` cannot be opened.
visitor(uint8_t* begin, size_t size, char const* filename)
: m_begin(begin), m_end(begin), m_size(size), m_is(filename, std::ios::binary) {
if (!m_is.good()) {
throw std::runtime_error("Error in opening binary "
"file.");
}
}
~visitor() {
m_is.close();
}
// PODs are read in place; other types are asked to visit their members.
template <typename T>
void visit(T& val) {
if constexpr (std::is_pod<T>::value) {
load_pod(m_is, val);
} else {
val.visit(*this);
}
}
// Vectors of PODs are rebuilt with an allocator that returns the current
// buffer end, loaded, and then the buffer end is advanced past them.
// Other vectors are resized normally and their elements visited.
template <typename T, typename Allocator>
void visit(std::vector<T, Allocator>& vec) {
if constexpr (std::is_pod<T>::value) {
vec = std::vector<T, Allocator>(make_allocator<T>());
load_vec(m_is, vec);
consume(vec.size() * sizeof(T));
} else {
// NOTE(review): n is not initialized before visit(n); if that read fails,
// resize() presumably receives an indeterminate value -- confirm.
size_t n;
visit(n);
vec.resize(n);
for (auto& v : vec) visit(v);
}
}
// First unused byte of the buffer.
uint8_t* end() {
return m_end;
}
// Capacity of the buffer in bytes.
size_t size() const {
return m_size;
}
// Bytes handed out so far.
size_t allocated() const {
assert(m_end >= m_begin);
return m_end - m_begin;
}
// Allocator whose fixed address is the current buffer end.
template <typename T>
allocator<T> make_allocator() {
return allocator<T>(reinterpret_cast<T*>(m_end));
}
// Advances the buffer end by `num_bytes`; throws std::runtime_error when
// that would exceed the capacity. Does nothing for a null buffer.
void consume(size_t num_bytes) {
if (m_end == nullptr) return;
if (allocated() + num_bytes > size()) {
throw std::runtime_error("allocation failed");
}
m_end += num_bytes;
}
private:
uint8_t* m_begin;
uint8_t* m_end;
size_t m_size;
std::ifstream m_is;
};
// Pass 1: load `data_structure` with a plain loader to learn how many bytes
// its vectors of PODs need. Pass 2: malloc that many bytes and load again
// through `visitor`, which places those vectors in the buffer.
// Returns the number of bytes the loader read from `filename`.
// NOTE(review): a previously held m_begin is overwritten without being
// freed -- confirm allocate() is called at most once per object.
template <typename T>
size_t allocate(T& data_structure, char const* filename) {
loader l(filename);
l.visit(data_structure);
m_size = l.bytes_vecs_of_pods();
m_begin = reinterpret_cast<uint8_t*>(malloc(m_size));
if (m_begin == nullptr) throw std::runtime_error("malloc failed");
visitor v(m_begin, m_size, filename);
v.visit(data_structure);
m_end = v.end();
return l.bytes();
}
// Releases the buffer (free(nullptr) is a no-op when nothing was allocated).
~contiguous_memory_allocator() {
free(m_begin);
}
// Start of the buffer.
uint8_t* begin() {
return m_begin;
}
// First unused byte of the buffer after the last allocate().
uint8_t* end() {
return m_end;
}
// Capacity of the buffer in bytes.
size_t size() const {
return m_size;
}
private:
uint8_t* m_begin;
uint8_t* m_end;
size_t m_size;
};
// Constructs a Visitor on `filename`, lets it visit `data_structure`, and
// returns the byte count the visitor reports afterwards.
template <typename T, typename Visitor>
size_t visit(T& data_structure, char const* filename) {
    Visitor v(filename);
    v.visit(data_structure);
    size_t const processed = v.bytes();
    return processed;
}
// Fills `data_structure` from the binary file `filename` with a loader and
// returns the byte count reported by it.
template <typename T>
size_t load(T& data_structure, char const* filename) {
    size_t const bytes_read = visit<T, loader>(data_structure, filename);
    return bytes_read;
}
// Loads `data_structure` from `filename` through the allocator object that
// the data structure itself exposes via get_allocator(), and returns what
// that allocator's allocate() returns.
template <typename T>
size_t load_with_custom_memory_allocation(T& data_structure, char const* filename) {
    auto&& arena = data_structure.get_allocator();
    return arena.allocate(data_structure, filename);
}
// Writes `data_structure` to the binary file `filename` with a saver and
// returns the byte count reported by it.
template <typename T>
size_t save(T& data_structure, char const* filename) {
    size_t const bytes_written = visit<T, saver>(data_structure, filename);
    return bytes_written;
}
template <typename T, typename Device>
size_t print_size(T& data_structure, Device& device) {
sizer visitor(demangle(typeid(T).name()));
visitor.visit(data_structure);
visitor.print(device);
return visitor.bytes();
}
#if defined(__CYGWIN__) || defined(_WIN32) || defined(_WIN64)
#else
// Snapshot of the entries of one directory, taken with scandir() in
// alphasort order at construction time and iterable with begin()/end().
struct directory {
    // One directory entry as seen through the iterator.
    struct file_name {
        std::string name;       // entry name as returned by scandir
        std::string fullpath;   // "<directory name>/<entry name>"
        std::string extension;  // text after the last '.', empty when there is none
    };

    // Scans `name`; throws std::runtime_error when scandir() fails.
    directory(std::string const& name) : m_name(name), m_items_names(nullptr), m_n(0) {
        m_n = scandir(m_name.c_str(), &m_items_names, nullptr, alphasort);
        if (m_n < 0) {
            throw std::runtime_error("error during scandir");
        }
    }

    // The object owns the scandir() result; a copy would free it twice.
    directory(directory const&) = delete;
    directory& operator=(directory const&) = delete;

    ~directory() {
        for (int i = 0; i != items(); ++i) {
            free(m_items_names[i]);
        }
        free(m_items_names);
    }

    // Path the directory was opened with.
    std::string const& name() const {
        return m_name;
    }

    // Number of entries found by scandir().
    int items() const {
        return m_n;
    }

    // Forward iterator over the scanned entries.
    struct iterator {
        iterator(directory const* d, int i) : m_d(d), m_i(i) {}

        file_name operator*() {
            file_name fn;
            fn.name = m_d->m_items_names[m_i]->d_name;
            fn.fullpath = m_d->name() + "/" + fn.name;
            size_t const dot = fn.name.find_last_of('.');
            // Without this check npos + 1 wraps to 0 and the whole name
            // would be reported as the extension.
            if (dot != std::string::npos) {
                fn.extension = fn.name.substr(dot + 1);
            }
            return fn;
        }

        void operator++() {
            ++m_i;
        }

        bool operator==(iterator const& rhs) const {
            return m_i == rhs.m_i;
        }

        bool operator!=(iterator const& rhs) const {
            return !(*this == rhs);
        }

    private:
        directory const* m_d;
        int m_i;
    };

    iterator begin() {
        return iterator(this, 0);
    }

    iterator end() {
        return iterator(this, items());
    }

private:
    std::string m_name;
    struct dirent** m_items_names;
    int m_n;
};
#endif
// Creates the directory `name` with mode 0777 (subject to the umask).
// Returns true on success and false on any failure; when the directory
// already exists a message is also written to std::cerr.
// Declared inline so the header can be included from several translation units.
inline bool create_directory(std::string const& name) {
    if (mkdir(name.c_str(), 0777) == 0) {
        return true;
    }
    if (errno == EEXIST) {
        std::cerr << "directory already exists" << std::endl;
    }
    return false;
}
// Removes the directory `name` with rmdir(); returns true on success.
// Declared inline so the header can be included from several translation units.
inline bool remove_directory(std::string const& name) {
    return rmdir(name.c_str()) == 0;
}
} // namespace essentials

View file

@ -1,14 +1,20 @@
#pragma once
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iterator>
#include <memory>
namespace xcdat {
template <class T>
class immutable_vector {
private:
std::vector<T> m_vec;
std::unique_ptr<T[]> m_allocator;
std::uint64_t m_size = 0;
const T* m_data = nullptr;
public:
immutable_vector() = default;
@ -20,54 +26,81 @@ class immutable_vector {
immutable_vector(immutable_vector&&) noexcept = default;
immutable_vector& operator=(immutable_vector&&) noexcept = default;
explicit immutable_vector(std::vector<T>&& vec) {
steal(vec);
void clear() {
m_allocator.reset();
m_size = 0;
m_data = nullptr;
}
void steal(std::vector<T>& vec) {
template <class Vector>
immutable_vector(const Vector& vec) {
build(vec);
}
template <class Vector>
void build(const Vector& vec) {
clear();
if (vec.size() != 0) {
m_vec = std::move(vec);
m_vec.shrink_to_fit();
} else {
clear();
m_allocator = std::make_unique<T[]>(vec.size());
std::copy_n(vec.data(), vec.size(), m_allocator.get());
m_size = vec.size();
m_data = m_allocator.get();
}
}
void clear() {
*this = immutable_vector<T>();
std::uint64_t mmap(const char* address) {
clear();
m_size = *reinterpret_cast<const std::uint64_t*>(address);
m_data = reinterpret_cast<const T*>(address + sizeof(std::uint64_t));
return sizeof(std::uint64_t) + m_size * sizeof(T);
}
void load(std::ifstream& ifs) {
clear();
ifs.read(reinterpret_cast<char*>(&m_size), sizeof(m_size));
if (m_size != 0) {
m_allocator = std::make_unique<T[]>(m_size);
ifs.read(reinterpret_cast<char*>(m_allocator.get()), sizeof(T) * m_size);
m_data = m_allocator.get();
}
}
void save(std::ofstream& ofs) const {
ofs.write(reinterpret_cast<const char*>(&m_size), sizeof(m_size));
ofs.write(reinterpret_cast<const char*>(m_data), sizeof(T) * m_size);
}
inline std::uint64_t memory_in_bytes() const {
return sizeof(m_size) + sizeof(T) * m_size;
}
inline std::uint64_t size() const {
return m_vec.size();
return m_size;
}
inline auto begin() const {
return m_vec.begin();
inline const T* begin() const {
return m_data;
}
inline auto end() const {
return m_vec.end();
inline const T* end() const {
return m_data + m_size;
}
inline auto rbegin() const {
return m_vec.rbegin();
return std::make_reverse_iterator(end());
}
inline auto rend() const {
return m_vec.rend();
return std::make_reverse_iterator(begin());
}
inline const T& operator[](std::uint64_t i) const {
return m_vec[i];
assert(i < m_size);
return m_data[i];
}
inline const T* data() const {
return m_vec.data();
}
template <class Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_vec);
return m_data;
}
};

View file

@ -19,4 +19,32 @@ namespace xcdat::io {
return strs;
}
//! Read sizeof(T) raw bytes from `is` into `val`; T must be POD.
template <typename T>
void load_pod(std::istream& is, T& val) {
    static_assert(std::is_pod<T>::value);
    constexpr std::streamsize width = sizeof(T);
    is.read(reinterpret_cast<char*>(&val), width);
}
//! Read a vector stored as a size_t element count followed by the raw
//! element bytes. T must be POD, matching the check in save_vec.
//! If the count cannot be read, `vec` is left empty instead of being resized
//! to an indeterminate length.
template <typename T, typename Allocator>
void load_vec(std::istream& is, std::vector<T, Allocator>& vec) {
    static_assert(std::is_pod<T>::value);
    size_t n = 0;
    is.read(reinterpret_cast<char*>(&n), sizeof(n));
    if (!is) {
        vec.clear();
        return;
    }
    vec.resize(n);
    is.read(reinterpret_cast<char*>(vec.data()), static_cast<std::streamsize>(sizeof(T) * n));
}
//! Write the sizeof(T) raw bytes of `val` to `os`; T must be POD.
template <typename T>
void save_pod(std::ostream& os, T const& val) {
    static_assert(std::is_pod<T>::value);
    constexpr std::streamsize width = sizeof(T);
    os.write(reinterpret_cast<char const*>(&val), width);
}
//! Write `vec` as a size_t element count (via save_pod) followed by the raw
//! element bytes; T must be POD.
template <typename T, typename Allocator>
void save_vec(std::ostream& os, std::vector<T, Allocator> const& vec) {
    static_assert(std::is_pod<T>::value);
    size_t const length = vec.size();
    save_pod(os, length);
    char const* const raw = reinterpret_cast<char const*>(vec.data());
    os.write(raw, static_cast<std::streamsize>(sizeof(T) * length));
}
} // namespace xcdat::io

View file

@ -0,0 +1,43 @@
#pragma once
#include <cstdint>
#include <fstream>
#include <string>
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! Visitor that restores an object from a binary file: immutable_vector
//! members load themselves from the stream, POD members are read as raw
//! bytes, and any other member is asked to visit its own members.
class load_visitor {
  private:
    std::ifstream m_ifs;

  public:
    //! Open `filepath` for binary reading; XCDAT_THROW_IF reports a failure.
    //! The path is copied into a std::string because std::ifstream has no
    //! portable constructor taking std::string_view, and a string_view is
    //! not guaranteed to be null-terminated.
    load_visitor(std::string_view filepath) : m_ifs(std::string(filepath), std::ios::binary) {
        XCDAT_THROW_IF(!m_ifs.good(), "Cannot open the input file");
    }

    virtual ~load_visitor() {
        m_ifs.close();
    }

    //! Let the vector read itself from the stream.
    template <class T>
    void visit(immutable_vector<T>& vec) {
        vec.load(m_ifs);
    }

    //! Read a POD as raw bytes, or recurse into a composite object.
    template <class T>
    void visit(T& obj) {
        if constexpr (std::is_pod_v<T>) {
            m_ifs.read(reinterpret_cast<char*>(&obj), sizeof(T));
        } else {
            obj.visit(*this);
        }
    }

    //! Current read position of the stream.
    std::uint64_t bytes() {
        return m_ifs.tellg();
    }
};
} // namespace xcdat

View file

@ -0,0 +1,39 @@
#pragma once
#include <type_traits>
#include "immutable_vector.hpp"
namespace xcdat {
//! Visitor that restores an object from serialized bytes already in memory.
//! It walks a cursor from `base`: immutable_vector members map themselves at
//! the cursor, POD members are copied out of it, and any other member is
//! asked to visit its own members.
//! NOTE(review): PODs are read through a reinterpret_cast of the cursor,
//! which presumably requires the cursor to be suitably aligned for T -- confirm.
class mmap_visitor {
  private:
    const char* m_base = nullptr;
    const char* m_cur = nullptr;

  public:
    mmap_visitor(const char* base) : m_base(base), m_cur(base) {}

    virtual ~mmap_visitor() = default;

    //! Map the vector at the cursor and skip the bytes it reports.
    template <typename T>
    void visit(immutable_vector<T>& vec) {
        const std::uint64_t consumed = vec.mmap(m_cur);
        m_cur += consumed;
    }

    //! Copy a POD from the cursor, or recurse into a composite object.
    template <typename T>
    void visit(T& obj) {
        if constexpr (!std::is_pod_v<T>) {
            obj.visit(*this);
        } else {
            const T* const src = reinterpret_cast<const T*>(m_cur);
            obj = *src;
            m_cur += sizeof(T);
        }
    }

    //! Number of bytes between the base address and the cursor.
    std::uint64_t bytes() {
        return std::distance(m_base, m_cur);
    }
};
} // namespace xcdat

View file

@ -0,0 +1,43 @@
#pragma once
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! Visitor that serializes an object to a binary file: immutable_vector
//! members save themselves to the stream, POD members are written as raw
//! bytes, and any other member is asked to visit its own members.
class save_visitor {
  private:
    std::ofstream m_ofs;

  public:
    //! Open `filepath` for binary writing; XCDAT_THROW_IF reports a failure.
    save_visitor(std::string_view filepath) : m_ofs(filepath, std::ios::binary) {
        // This stream is written to, so the message names the output file.
        XCDAT_THROW_IF(!m_ofs.good(), "Cannot open the output file");
    }

    virtual ~save_visitor() {
        m_ofs.close();
    }

    //! Let the vector write itself to the stream.
    template <typename T>
    void visit(const immutable_vector<T>& vec) {
        vec.save(m_ofs);
    }

    //! Write a POD as raw bytes, or recurse into a composite object.
    template <typename T>
    void visit(const T& obj) {
        if constexpr (std::is_pod_v<T>) {
            m_ofs.write(reinterpret_cast<const char*>(&obj), sizeof(T));
        } else {
            // The member-side visit() is non-const, hence the cast.
            const_cast<T&>(obj).visit(*this);
        }
    }

    //! Current write position of the stream.
    std::uint64_t bytes() {
        return m_ofs.tellp();
    }
};
} // namespace xcdat

View file

@ -0,0 +1,39 @@
#pragma once
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! Visitor that adds up the size of an object: immutable_vector members
//! contribute their memory_in_bytes(), POD members contribute sizeof(T), and
//! any other member is asked to visit its own members.
class size_visitor {
  private:
    std::uint64_t m_bytes = 0;

  public:
    size_visitor() = default;

    virtual ~size_visitor() = default;

    //! Add the size reported by the vector itself.
    template <typename T>
    void visit(const immutable_vector<T>& vec) {
        const std::uint64_t vec_bytes = vec.memory_in_bytes();
        m_bytes += vec_bytes;
    }

    //! Add sizeof(T) for a POD, or recurse into a composite object.
    template <typename T>
    void visit(const T& obj) {
        if constexpr (!std::is_pod_v<T>) {
            // The member-side visit() is non-const, hence the cast.
            const_cast<T&>(obj).visit(*this);
        } else {
            m_bytes += sizeof(T);
        }
    }

    //! Total accumulated so far.
    std::uint64_t bytes() {
        return m_bytes;
    }
};
} // namespace xcdat

View file

@ -129,15 +129,7 @@ class tail_vector {
tail_vector(tail_vector&&) noexcept = default;
tail_vector& operator=(tail_vector&&) noexcept = default;
explicit tail_vector(builder&& b) {
m_chars.steal(b.m_chars);
m_terms.build(b.m_terms);
}
void build(builder&& b) {
m_chars.steal(b.m_chars);
m_terms.build(b.m_terms);
}
explicit tail_vector(builder&& b) : m_chars(b.m_chars), m_terms(b.m_terms) {}
inline bool bin_mode() const {
return m_terms.size() != 0;

View file

@ -4,7 +4,6 @@
#include <optional>
#include <string>
#include "essentials/essentials.hpp"
#include "trie_builder.hpp"
namespace xcdat {
@ -23,7 +22,7 @@ namespace xcdat {
template <class BcVector>
class trie {
public:
using this_type = trie<BcVector>;
using trie_type = trie<BcVector>;
using bc_vector_type = BcVector;
static constexpr auto l1_bits = bc_vector_type::l1_bits;
@ -55,23 +54,7 @@ class trie {
trie& operator=(trie&&) noexcept = default;
template <class Strings>
static this_type build(const Strings& keys, bool bin_mode = false) {
return this_type(trie_builder(keys, l1_bits, bin_mode));
}
static this_type load(std::string_view filepath) {
this_type obj;
essentials::load(obj, filepath.data());
return obj;
}
std::uint64_t save(std::string_view filepath) const {
return essentials::save(const_cast<this_type&>(*this), filepath.data());
}
std::uint64_t memory_in_bytes() const {
return essentials::visit<this_type, essentials::sizer>(const_cast<this_type&>(*this), "");
}
explicit trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {}
//! Check the binary mode.
inline bool bin_mode() const {
@ -155,7 +138,7 @@ class trie {
*/
class prefix_iterator {
private:
const this_type* m_obj = nullptr;
const trie_type* m_obj = nullptr;
std::string_view m_key;
std::uint64_t m_id = 0;
std::uint64_t m_kpos = 0;
@ -181,7 +164,7 @@ class trie {
}
private:
prefix_iterator(const this_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
prefix_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
friend class trie;
};
@ -211,7 +194,7 @@ class trie {
};
private:
const this_type* m_obj = nullptr;
const trie_type* m_obj = nullptr;
std::string_view m_key;
std::uint64_t m_id = 0;
std::string m_decoded;
@ -237,7 +220,7 @@ class trie {
}
private:
predictive_iterator(const this_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
predictive_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
friend class trie;
};

View file

@ -81,7 +81,7 @@ class trie_builder {
m_heads[taboo_npos >> m_l1_bits] = m_units[taboo_npos].base;
// Build the code table
m_table.build(keys);
m_table = code_table(keys);
m_bin_mode |= m_table.has_null();
// Build the BC units

View file

@ -3,9 +3,10 @@
#include <xcdat.hpp>
using xcdat_trie = xcdat::trie_8_type;
using trie_type = xcdat::trie_8_type;
int main() {
// Input keys
std::vector<std::string> keys = {
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
@ -19,12 +20,12 @@ int main() {
// Build and save the trie index
{
const auto trie = xcdat_trie::build(keys);
trie.save(index_filename);
const trie_type trie(keys);
xcdat::save(trie, index_filename);
}
// Load the trie index
const auto trie = xcdat_trie::load(index_filename);
const auto trie = xcdat::load<trie_type>(index_filename);
std::cout << "Basic operations" << std::endl;
{

View file

@ -35,7 +35,7 @@ void test_rank_select(const std::vector<bool>& bits) {
for (std::uint64_t i = 0; i < bits.size(); i++) {
bvb.set_bit(i, bits[i]);
}
bv.build(bvb, true, true);
bv = xcdat::bit_vector(bvb, true, true);
}
REQUIRE_EQ(bv.size(), bits.size());

View file

@ -124,7 +124,7 @@ TEST_CASE("Test trie_type (tiny)") {
"Google_Pixel", "iPad_mini", "iPadOS", "iPod", "ThinkPad",
};
auto trie = trie_type::build(keys);
trie_type trie(keys);
REQUIRE_FALSE(trie.bin_mode());
test_basic_operations(trie, keys, others);
@ -163,7 +163,7 @@ TEST_CASE("Test trie_type (real)") {
auto keys = xcdat::test::to_unique_vec(xcdat::io::load_strings("keys.txt"));
auto others = xcdat::test::extract_keys(keys);
auto trie = trie_type::build(keys);
trie_type trie(keys);
REQUIRE_FALSE(trie.bin_mode());
test_basic_operations(trie, keys, others);
@ -176,7 +176,7 @@ TEST_CASE("Test trie_type (random 10K, A--B)") {
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B'));
auto others = xcdat::test::extract_keys(keys);
auto trie = trie_type::build(keys);
trie_type trie(keys);
REQUIRE_FALSE(trie.bin_mode());
test_basic_operations(trie, keys, others);
@ -189,7 +189,7 @@ TEST_CASE("Test trie_type (random 10K, A--Z)") {
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z'));
auto others = xcdat::test::extract_keys(keys);
auto trie = trie_type::build(keys);
trie_type trie(keys);
REQUIRE_FALSE(trie.bin_mode());
test_basic_operations(trie, keys, others);
@ -202,7 +202,7 @@ TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") {
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX));
auto others = xcdat::test::extract_keys(keys);
auto trie = trie_type::build(keys);
trie_type trie(keys);
REQUIRE(trie.bin_mode());
test_basic_operations(trie, keys, others);
@ -210,3 +210,44 @@ TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") {
test_predictive_search(trie, keys, others);
test_enumerate(trie, keys);
}
#ifdef NDEBUG
// Larger-scale variant of the random-key test: requests 100000 random keys
// over the character range 'A'..'B' and runs the shared trie checks on them.
TEST_CASE("Test trie_type (random 100K, A--B)") {
// Generated keys are passed through to_unique_vec before building.
// NOTE(review): the arguments 1 and 30 are presumably the min/max key length -- confirm against make_random_keys.
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'B'));
// Second key set derived from `keys`; presumably used as queries that are not registered -- confirm against extract_keys.
auto others = xcdat::test::extract_keys(keys);
// Build the trie directly through the constructor.
trie_type trie(keys);
// bin_mode() must be false for this alphabet.
REQUIRE_FALSE(trie.bin_mode());
// Run the shared checks for each query type against the built trie.
test_basic_operations(trie, keys, others);
test_prefix_search(trie, keys, others);
test_predictive_search(trie, keys, others);
test_enumerate(trie, keys);
}
// Larger-scale variant of the random-key test: requests 100000 random keys
// over the character range 'A'..'Z' and runs the shared trie checks on them.
TEST_CASE("Test trie_type (random 100K, A--Z)") {
// Generated keys are passed through to_unique_vec before building.
// NOTE(review): the arguments 1 and 30 are presumably the min/max key length -- confirm against make_random_keys.
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'Z'));
// Second key set derived from `keys`; presumably used as queries that are not registered -- confirm against extract_keys.
auto others = xcdat::test::extract_keys(keys);
// Build the trie directly through the constructor.
trie_type trie(keys);
// bin_mode() must be false for this alphabet.
REQUIRE_FALSE(trie.bin_mode());
// Run the shared checks for each query type against the built trie.
test_basic_operations(trie, keys, others);
test_prefix_search(trie, keys, others);
test_predictive_search(trie, keys, others);
test_enumerate(trie, keys);
}
// Larger-scale variant of the random-key test: requests 100000 random keys
// over the range INT8_MIN..INT8_MAX and runs the shared trie checks on them.
TEST_CASE("Test trie_type (random 100K, 0x00--0xFF)") {
// Generated keys are passed through to_unique_vec before building.
// NOTE(review): the arguments 1 and 30 are presumably the min/max key length -- confirm against make_random_keys.
auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, INT8_MIN, INT8_MAX));
// Second key set derived from `keys`; presumably used as queries that are not registered -- confirm against extract_keys.
auto others = xcdat::test::extract_keys(keys);
// Build the trie directly through the constructor.
trie_type trie(keys);
// Unlike the letter-only cases, bin_mode() is required to be true here;
// presumably because keys over this range can contain the 0x00 byte -- confirm against trie_builder.
REQUIRE(trie.bin_mode());
// Run the shared checks for each query type against the built trie.
test_basic_operations(trie, keys, others);
test_prefix_search(trie, keys, others);
test_predictive_search(trie, keys, others);
test_enumerate(trie, keys);
}
#endif

View file

@ -1,3 +1,5 @@
#include <chrono>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
@ -28,22 +30,21 @@ int build(const cmd_line_parser::parser& p) {
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
}
essentials::timer<essentials::clock_type, std::chrono::seconds> timer;
timer.start();
const auto trie = Trie::build(keys);
timer.stop();
const auto start_tp = std::chrono::high_resolution_clock::now();
const Trie trie(keys);
const auto stop_tp = std::chrono::high_resolution_clock::now();
const double construction_time_in_sec = timer.average();
const double memory_in_bytes = trie.memory_in_bytes();
const double time_in_sec = std::chrono::duration_cast<std::chrono::seconds>(stop_tp - start_tp).count();
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
tfm::printfln("construction_time_in_sec: %g", construction_time_in_sec);
tfm::printfln("time_in_sec: %g", time_in_sec);
tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / essentials::MiB);
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
tfm::printfln("number_of_keys: %d", trie.num_keys());
tfm::printfln("alphabet_size: %d", trie.alphabet_size());
tfm::printfln("max_length: %d", trie.max_length());
trie.save(output_idx);
xcdat::save(trie, output_idx);
return 0;
}

View file

@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
template <class Trie>
int decode(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto trie = Trie::load(input_idx);
const auto trie = xcdat::load<Trie>(input_idx);
for (std::uint64_t id; std::cin >> id;) {
const auto dec = trie.decode(id);

View file

@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
template <class Trie>
int enumerate(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto trie = Trie::load(input_idx);
const auto trie = xcdat::load<Trie>(input_idx);
trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); });

View file

@ -13,7 +13,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
template <class Trie>
int lookup(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto trie = Trie::load(input_idx);
const auto trie = xcdat::load<Trie>(input_idx);
for (std::string str; std::getline(std::cin, str);) {
const auto id = trie.lookup(str);

View file

@ -16,13 +16,12 @@ int predictive_search(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto max_num_results = p.get<std::uint64_t>("max_num_results", 10);
const auto trie = Trie::load(input_idx);
const auto trie = xcdat::load<Trie>(input_idx);
struct result_type {
std::uint64_t id;
std::string str;
};
std::vector<result_type> results;
results.reserve(1ULL << 10);

View file

@ -13,7 +13,8 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
template <class Trie>
int prefix_search(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto trie = Trie::load(input_idx);
const auto trie = xcdat::load<Trie>(input_idx);
struct result_type {
std::uint64_t id;