rename
This commit is contained in:
parent
dd1860e792
commit
acce0ba187
|
@ -11,4 +11,26 @@ namespace xcdat {
|
||||||
using trie_7_type = trie<bc_vector_7>;
|
using trie_7_type = trie<bc_vector_7>;
|
||||||
using trie_8_type = trie<bc_vector_8>;
|
using trie_8_type = trie<bc_vector_8>;
|
||||||
|
|
||||||
|
template <class Trie, class Strings>
|
||||||
|
static Trie build(const Strings& keys, bool bin_mode = false) {
|
||||||
|
return Trie(trie_builder(keys, Trie::bc_vector_type::l1_bits, bin_mode));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
static Trie load(std::string_view filename) {
|
||||||
|
Trie trie;
|
||||||
|
essentials::load(trie, filename.data());
|
||||||
|
return trie;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
static std::uint64_t save(Trie& trie, std::string_view filename) {
|
||||||
|
return essentials::save(trie, filename.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
static std::uint64_t get_memory_in_bytes(Trie& trie) {
|
||||||
|
return essentials::visit<Trie, essentials::sizer>(trie, "");
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace xcdat
|
} // namespace xcdat
|
||||||
|
|
|
@ -17,11 +17,11 @@ class bc_vector_7 {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::uint64_t m_num_frees = 0;
|
std::uint64_t m_num_frees = 0;
|
||||||
mm_vector<std::uint8_t> m_ints_l1;
|
vector_wrapper<std::uint8_t> m_ints_l1;
|
||||||
mm_vector<std::uint16_t> m_ints_l2;
|
vector_wrapper<std::uint16_t> m_ints_l2;
|
||||||
mm_vector<std::uint32_t> m_ints_l3;
|
vector_wrapper<std::uint32_t> m_ints_l3;
|
||||||
mm_vector<std::uint64_t> m_ints_l4;
|
vector_wrapper<std::uint64_t> m_ints_l4;
|
||||||
std::array<mm_vector<std::uint64_t>, max_levels - 1> m_ranks;
|
std::array<vector_wrapper<std::uint64_t>, max_levels - 1> m_ranks;
|
||||||
compact_vector m_links;
|
compact_vector m_links;
|
||||||
bit_vector m_leaves;
|
bit_vector m_leaves;
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ class bc_vector_8 {
|
||||||
private:
|
private:
|
||||||
std::uint32_t m_num_levels = 0;
|
std::uint32_t m_num_levels = 0;
|
||||||
std::uint64_t m_num_frees = 0;
|
std::uint64_t m_num_frees = 0;
|
||||||
std::array<mm_vector<std::uint8_t>, max_levels> m_bytes;
|
std::array<vector_wrapper<std::uint8_t>, max_levels> m_bytes;
|
||||||
std::array<bit_vector, max_levels - 1> m_nexts;
|
std::array<bit_vector, max_levels - 1> m_nexts;
|
||||||
compact_vector m_links;
|
compact_vector m_links;
|
||||||
bit_vector m_leaves;
|
bit_vector m_leaves;
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#include "essentials/essentials.hpp"
|
#include "essentials/essentials.hpp"
|
||||||
|
|
||||||
#include "bit_tools.hpp"
|
#include "bit_tools.hpp"
|
||||||
#include "mm_vector.hpp"
|
#include "vector_wrapper.hpp"
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
|
@ -77,9 +77,9 @@ class bit_vector {
|
||||||
private:
|
private:
|
||||||
std::uint64_t m_size = 0;
|
std::uint64_t m_size = 0;
|
||||||
std::uint64_t m_num_ones = 0;
|
std::uint64_t m_num_ones = 0;
|
||||||
mm_vector<std::uint64_t> m_bits;
|
vector_wrapper<std::uint64_t> m_bits;
|
||||||
mm_vector<std::uint64_t> m_rank_hints;
|
vector_wrapper<std::uint64_t> m_rank_hints;
|
||||||
mm_vector<std::uint64_t> m_select_hints;
|
vector_wrapper<std::uint64_t> m_select_hints;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bit_vector() = default;
|
bit_vector() = default;
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
|
||||||
#include "mm_vector.hpp"
|
#include "vector_wrapper.hpp"
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ class code_table {
|
||||||
private:
|
private:
|
||||||
std::uint64_t m_max_length = 0;
|
std::uint64_t m_max_length = 0;
|
||||||
std::array<std::uint8_t, 512> m_table;
|
std::array<std::uint8_t, 512> m_table;
|
||||||
mm_vector<std::uint8_t> m_alphabet;
|
vector_wrapper<std::uint8_t> m_alphabet;
|
||||||
|
|
||||||
struct counter_type {
|
struct counter_type {
|
||||||
std::uint8_t ch;
|
std::uint8_t ch;
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
#include "bit_tools.hpp"
|
#include "bit_tools.hpp"
|
||||||
#include "exception.hpp"
|
#include "exception.hpp"
|
||||||
#include "mm_vector.hpp"
|
#include "vector_wrapper.hpp"
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ class compact_vector {
|
||||||
std::uint64_t m_size = 0;
|
std::uint64_t m_size = 0;
|
||||||
std::uint64_t m_bits = 0;
|
std::uint64_t m_bits = 0;
|
||||||
std::uint64_t m_mask = 0;
|
std::uint64_t m_mask = 0;
|
||||||
mm_vector<std::uint64_t> m_chunks;
|
vector_wrapper<std::uint64_t> m_chunks;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
compact_vector() = default;
|
compact_vector() = default;
|
||||||
|
|
|
@ -1,177 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <type_traits>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <unistd.h> // close(fd)
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace mm {
|
|
||||||
|
|
||||||
namespace advice {
|
|
||||||
static const int normal = POSIX_MADV_NORMAL;
|
|
||||||
static const int random = POSIX_MADV_RANDOM;
|
|
||||||
static const int sequential = POSIX_MADV_SEQUENTIAL;
|
|
||||||
} // namespace advice
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
struct file {
|
|
||||||
file() {
|
|
||||||
init();
|
|
||||||
}
|
|
||||||
|
|
||||||
~file() {
|
|
||||||
close();
|
|
||||||
}
|
|
||||||
|
|
||||||
file(file const&) = delete; // non construction-copyable
|
|
||||||
file& operator=(file const&) = delete; // non copyable
|
|
||||||
|
|
||||||
bool is_open() const {
|
|
||||||
return m_fd != -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
void close() {
|
|
||||||
if (is_open()) {
|
|
||||||
if (munmap((char*)m_data, m_size) == -1) {
|
|
||||||
throw std::runtime_error("munmap failed when closing file");
|
|
||||||
}
|
|
||||||
::close(m_fd);
|
|
||||||
init();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t bytes() const {
|
|
||||||
return m_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t size() const {
|
|
||||||
return m_size / sizeof(T);
|
|
||||||
}
|
|
||||||
|
|
||||||
T* data() const {
|
|
||||||
return m_data;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct iterator {
|
|
||||||
iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}
|
|
||||||
|
|
||||||
T operator*() {
|
|
||||||
return *m_ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void operator++() {
|
|
||||||
++m_ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool operator==(iterator const& rhs) const {
|
|
||||||
return m_ptr == rhs.m_ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool operator!=(iterator const& rhs) const {
|
|
||||||
return !((*this) == rhs);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
T* m_ptr;
|
|
||||||
};
|
|
||||||
|
|
||||||
iterator begin() const {
|
|
||||||
return iterator(m_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
iterator end() const {
|
|
||||||
return iterator(m_data, size());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
int m_fd;
|
|
||||||
size_t m_size;
|
|
||||||
T* m_data;
|
|
||||||
|
|
||||||
void init() {
|
|
||||||
m_fd = -1;
|
|
||||||
m_size = 0;
|
|
||||||
m_data = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void check_fd() {
|
|
||||||
if (m_fd == -1) throw std::runtime_error("cannot open file");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Pointer>
|
|
||||||
Pointer mmap(int fd, size_t size, int prot) {
|
|
||||||
static const size_t offset = 0;
|
|
||||||
Pointer p =
|
|
||||||
static_cast<Pointer>(::mmap(NULL, size, prot, MAP_SHARED, fd, offset));
|
|
||||||
if (p == MAP_FAILED) throw std::runtime_error("mmap failed");
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
struct file_source : public file<T const> {
|
|
||||||
typedef file<T const> base;
|
|
||||||
|
|
||||||
file_source() {}
|
|
||||||
|
|
||||||
file_source(std::string const& path, int adv = advice::normal) {
|
|
||||||
open(path, adv);
|
|
||||||
}
|
|
||||||
|
|
||||||
void open(std::string const& path, int adv = advice::normal) {
|
|
||||||
base::m_fd = ::open(path.c_str(), O_RDONLY);
|
|
||||||
base::check_fd();
|
|
||||||
struct stat fs;
|
|
||||||
if (fstat(base::m_fd, &fs) == -1) {
|
|
||||||
throw std::runtime_error("cannot stat file");
|
|
||||||
}
|
|
||||||
base::m_size = fs.st_size;
|
|
||||||
base::m_data = mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
|
|
||||||
if (posix_madvise((void*)base::m_data, base::m_size, adv)) {
|
|
||||||
throw std::runtime_error("madvise failed");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
struct file_sink : public file<T> {
|
|
||||||
typedef file<T> base;
|
|
||||||
|
|
||||||
file_sink() {}
|
|
||||||
|
|
||||||
file_sink(std::string const& path) {
|
|
||||||
open(path);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_sink(std::string const& path, size_t n) {
|
|
||||||
open(path, n);
|
|
||||||
}
|
|
||||||
|
|
||||||
void open(std::string const& path) {
|
|
||||||
static const mode_t mode = 0600; // read/write
|
|
||||||
base::m_fd = ::open(path.c_str(), O_RDWR, mode);
|
|
||||||
base::check_fd();
|
|
||||||
struct stat fs;
|
|
||||||
if (fstat(base::m_fd, &fs) == -1) {
|
|
||||||
throw std::runtime_error("cannot stat file");
|
|
||||||
}
|
|
||||||
base::m_size = fs.st_size;
|
|
||||||
base::m_data =
|
|
||||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
|
||||||
}
|
|
||||||
|
|
||||||
void open(std::string const& path, size_t n) {
|
|
||||||
static const mode_t mode = 0600; // read/write
|
|
||||||
base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
|
|
||||||
base::check_fd();
|
|
||||||
base::m_size = n * sizeof(T);
|
|
||||||
ftruncate(base::m_fd,
|
|
||||||
base::m_size); // truncate the file at the new size
|
|
||||||
base::m_data =
|
|
||||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace mm
|
|
|
@ -8,7 +8,7 @@
|
||||||
|
|
||||||
#include "bit_vector.hpp"
|
#include "bit_vector.hpp"
|
||||||
#include "exception.hpp"
|
#include "exception.hpp"
|
||||||
#include "mm_vector.hpp"
|
#include "vector_wrapper.hpp"
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
|
@ -116,7 +116,7 @@ class tail_vector {
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
mm_vector<char> m_chars;
|
vector_wrapper<char> m_chars;
|
||||||
bit_vector m_terms;
|
bit_vector m_terms;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -219,6 +219,12 @@ class tail_vector {
|
||||||
inline std::uint64_t size() const {
|
inline std::uint64_t size() const {
|
||||||
return m_chars.size();
|
return m_chars.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class Visitor>
|
||||||
|
void visit(Visitor& visitor) {
|
||||||
|
visitor.visit(m_chars);
|
||||||
|
visitor.visit(m_terms);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace xcdat
|
} // namespace xcdat
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
#include <optional>
|
#include <optional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "essentials/essentials.hpp"
|
||||||
#include "trie_builder.hpp"
|
#include "trie_builder.hpp"
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
@ -51,6 +52,11 @@ class trie {
|
||||||
//! Move assignment operator
|
//! Move assignment operator
|
||||||
trie& operator=(trie&&) noexcept = default;
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
|
template <class Strings>
|
||||||
|
explicit trie(trie_builder<Strings>&& b)
|
||||||
|
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
|
||||||
|
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build the trie dictionary from the input keywords.
|
* Build the trie dictionary from the input keywords.
|
||||||
* @param[in] key The query keyword.
|
* @param[in] key The query keyword.
|
||||||
|
@ -265,11 +271,6 @@ class trie {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <class Strings>
|
|
||||||
explicit trie(trie_builder<Strings>&& b)
|
|
||||||
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
|
|
||||||
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
|
|
||||||
|
|
||||||
template <class String>
|
template <class String>
|
||||||
static constexpr String get_suffix(const String& s, std::uint64_t i) {
|
static constexpr String get_suffix(const String& s, std::uint64_t i) {
|
||||||
assert(i <= s.size());
|
assert(i <= s.size());
|
||||||
|
|
|
@ -5,33 +5,36 @@
|
||||||
|
|
||||||
namespace xcdat {
|
namespace xcdat {
|
||||||
|
|
||||||
//! A memory-mappable vector.
|
|
||||||
template <class T>
|
template <class T>
|
||||||
class mm_vector {
|
class vector_wrapper {
|
||||||
private:
|
private:
|
||||||
std::vector<T> m_vec;
|
std::vector<T> m_vec;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
mm_vector() = default;
|
vector_wrapper() = default;
|
||||||
virtual ~mm_vector() = default;
|
virtual ~vector_wrapper() = default;
|
||||||
|
|
||||||
mm_vector(const mm_vector&) = delete;
|
vector_wrapper(const vector_wrapper&) = delete;
|
||||||
mm_vector& operator=(const mm_vector&) = delete;
|
vector_wrapper& operator=(const vector_wrapper&) = delete;
|
||||||
|
|
||||||
mm_vector(mm_vector&&) noexcept = default;
|
vector_wrapper(vector_wrapper&&) noexcept = default;
|
||||||
mm_vector& operator=(mm_vector&&) noexcept = default;
|
vector_wrapper& operator=(vector_wrapper&&) noexcept = default;
|
||||||
|
|
||||||
explicit mm_vector(std::vector<T>&& vec) {
|
explicit vector_wrapper(std::vector<T>&& vec) {
|
||||||
steal(vec);
|
steal(vec);
|
||||||
}
|
}
|
||||||
|
|
||||||
void steal(std::vector<T>& vec) {
|
void steal(std::vector<T>& vec) {
|
||||||
m_vec.swap(vec);
|
if (vec.size() != 0) {
|
||||||
|
m_vec = std::move(vec);
|
||||||
m_vec.shrink_to_fit();
|
m_vec.shrink_to_fit();
|
||||||
|
} else {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
m_vec.clear();
|
*this = vector_wrapper<T>();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::uint64_t size() const {
|
inline std::uint64_t size() const {
|
|
@ -13,7 +13,7 @@ int main() {
|
||||||
std::sort(keys.begin(), keys.end());
|
std::sort(keys.begin(), keys.end());
|
||||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||||
|
|
||||||
auto trie = xcdat::trie_8_type::build(keys);
|
auto trie = xcdat::build<xcdat::trie_8_type>(keys);
|
||||||
|
|
||||||
std::cout << "Basic operations" << std::endl;
|
std::cout << "Basic operations" << std::endl;
|
||||||
{
|
{
|
||||||
|
@ -41,5 +41,14 @@ int main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string index_filename = "tmp.idx";
|
||||||
|
std::cout << "mem: " << xcdat::save(trie, index_filename) << std::endl;
|
||||||
|
|
||||||
|
{
|
||||||
|
auto ohter = xcdat::load<xcdat::trie_8_type>(index_filename);
|
||||||
|
std::cout << "num_keys:" << ohter.num_keys() << std::endl;
|
||||||
|
std::cout << "mem: " << xcdat::get_memory_in_bytes(ohter) << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue