This commit is contained in:
Shunsuke Kanda 2021-06-27 23:37:38 +09:00
parent dd1860e792
commit acce0ba187
11 changed files with 76 additions and 212 deletions

View file

@ -11,4 +11,26 @@ namespace xcdat {
using trie_7_type = trie<bc_vector_7>; using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>; using trie_8_type = trie<bc_vector_8>;
//! Build a trie of type Trie from the input keys.
//!  - keys: the key collection, handed to trie_builder as-is.
//!  - bin_mode: forwarded to trie_builder unchanged (false by default).
//! The builder additionally receives Trie::bc_vector_type::l1_bits.
template <class Trie, class Strings>
static Trie build(const Strings& keys, bool bin_mode = false) {
    const auto l1_bits = Trie::bc_vector_type::l1_bits;
    return Trie(trie_builder(keys, l1_bits, bin_mode));
}
//! Load a trie of type Trie from the file named `filename`.
//!  - filename: path of the index file; it does not need to be null-terminated.
//! Returns the default-constructed Trie after essentials::load has filled it.
template <class Trie>
static Trie load(std::string_view filename) {
    // essentials::load receives only a `const char*` (no length), so it needs
    // a null-terminated string. std::string_view::data() does not guarantee a
    // terminator, hence the copy into std::string.
    const std::string path(filename);
    Trie trie;
    essentials::load(trie, path.c_str());
    return trie;
}
//! Save `trie` to the file named `filename`.
//!  - trie: the trie to write (taken by non-const reference, as essentials::save is called with it).
//!  - filename: path of the output file; it does not need to be null-terminated.
//! Returns the value reported by essentials::save
//! (NOTE(review): presumably the number of bytes written — confirm).
template <class Trie>
static std::uint64_t save(Trie& trie, std::string_view filename) {
    // essentials::save receives only a `const char*` (no length), so it needs
    // a null-terminated string. std::string_view::data() does not guarantee a
    // terminator, hence the copy into std::string.
    const std::string path(filename);
    return essentials::save(trie, path.c_str());
}
template <class Trie>
static std::uint64_t get_memory_in_bytes(Trie& trie) {
return essentials::visit<Trie, essentials::sizer>(trie, "");
}
} // namespace xcdat } // namespace xcdat

View file

@ -17,11 +17,11 @@ class bc_vector_7 {
private: private:
std::uint64_t m_num_frees = 0; std::uint64_t m_num_frees = 0;
mm_vector<std::uint8_t> m_ints_l1; vector_wrapper<std::uint8_t> m_ints_l1;
mm_vector<std::uint16_t> m_ints_l2; vector_wrapper<std::uint16_t> m_ints_l2;
mm_vector<std::uint32_t> m_ints_l3; vector_wrapper<std::uint32_t> m_ints_l3;
mm_vector<std::uint64_t> m_ints_l4; vector_wrapper<std::uint64_t> m_ints_l4;
std::array<mm_vector<std::uint64_t>, max_levels - 1> m_ranks; std::array<vector_wrapper<std::uint64_t>, max_levels - 1> m_ranks;
compact_vector m_links; compact_vector m_links;
bit_vector m_leaves; bit_vector m_leaves;

View file

@ -15,7 +15,7 @@ class bc_vector_8 {
private: private:
std::uint32_t m_num_levels = 0; std::uint32_t m_num_levels = 0;
std::uint64_t m_num_frees = 0; std::uint64_t m_num_frees = 0;
std::array<mm_vector<std::uint8_t>, max_levels> m_bytes; std::array<vector_wrapper<std::uint8_t>, max_levels> m_bytes;
std::array<bit_vector, max_levels - 1> m_nexts; std::array<bit_vector, max_levels - 1> m_nexts;
compact_vector m_links; compact_vector m_links;
bit_vector m_leaves; bit_vector m_leaves;

View file

@ -7,7 +7,7 @@
#include "essentials/essentials.hpp" #include "essentials/essentials.hpp"
#include "bit_tools.hpp" #include "bit_tools.hpp"
#include "mm_vector.hpp" #include "vector_wrapper.hpp"
namespace xcdat { namespace xcdat {
@ -77,9 +77,9 @@ class bit_vector {
private: private:
std::uint64_t m_size = 0; std::uint64_t m_size = 0;
std::uint64_t m_num_ones = 0; std::uint64_t m_num_ones = 0;
mm_vector<std::uint64_t> m_bits; vector_wrapper<std::uint64_t> m_bits;
mm_vector<std::uint64_t> m_rank_hints; vector_wrapper<std::uint64_t> m_rank_hints;
mm_vector<std::uint64_t> m_select_hints; vector_wrapper<std::uint64_t> m_select_hints;
public: public:
bit_vector() = default; bit_vector() = default;

View file

@ -3,7 +3,7 @@
#include <array> #include <array>
#include <string_view> #include <string_view>
#include "mm_vector.hpp" #include "vector_wrapper.hpp"
namespace xcdat { namespace xcdat {
@ -11,7 +11,7 @@ class code_table {
private: private:
std::uint64_t m_max_length = 0; std::uint64_t m_max_length = 0;
std::array<std::uint8_t, 512> m_table; std::array<std::uint8_t, 512> m_table;
mm_vector<std::uint8_t> m_alphabet; vector_wrapper<std::uint8_t> m_alphabet;
struct counter_type { struct counter_type {
std::uint8_t ch; std::uint8_t ch;

View file

@ -4,7 +4,7 @@
#include "bit_tools.hpp" #include "bit_tools.hpp"
#include "exception.hpp" #include "exception.hpp"
#include "mm_vector.hpp" #include "vector_wrapper.hpp"
namespace xcdat { namespace xcdat {
@ -14,7 +14,7 @@ class compact_vector {
std::uint64_t m_size = 0; std::uint64_t m_size = 0;
std::uint64_t m_bits = 0; std::uint64_t m_bits = 0;
std::uint64_t m_mask = 0; std::uint64_t m_mask = 0;
mm_vector<std::uint64_t> m_chunks; vector_wrapper<std::uint64_t> m_chunks;
public: public:
compact_vector() = default; compact_vector() = default;

View file

@ -1,177 +0,0 @@
#pragma once
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd)

#include <cstddef>
#include <stdexcept>
#include <string>
#include <type_traits>
namespace mm {
// Access-pattern hints accepted by posix_madvise(), re-exported under short names.
namespace advice {
static constexpr int normal{POSIX_MADV_NORMAL};
static constexpr int random{POSIX_MADV_RANDOM};
static constexpr int sequential{POSIX_MADV_SEQUENTIAL};
}  // namespace advice
// Common state of a memory-mapped file viewed as an array of T: the file
// descriptor, the mapped length in bytes and the mapped address. Derived
// classes fill the protected members; this class releases them.
template <typename T>
struct file {
    file() {
        init();
    }

    // Releases the mapping and the descriptor. Destructors are implicitly
    // noexcept, so a munmap error is deliberately swallowed here instead of
    // being thrown (which would terminate the program).
    ~file() {
        release();
    }

    file(file const&) = delete;             // non construction-copyable
    file& operator=(file const&) = delete;  // non copyable

    // True while a descriptor is held.
    bool is_open() const {
        return m_fd != -1;
    }

    // Unmaps the data (if any), closes the descriptor and resets the state.
    // No-op when nothing is open. Throws std::runtime_error if munmap
    // reported an error; the descriptor is closed and the state reset even
    // in that case, so nothing leaks and the object can be reused.
    void close() {
        if (!release()) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    // Mapped length in bytes.
    size_t bytes() const {
        return m_size;
    }

    // Number of whole T elements that fit into the mapped length.
    size_t size() const {
        return m_size / sizeof(T);
    }

    // Start of the mapped data, or nullptr when nothing is mapped.
    T* data() const {
        return m_data;
    }

    // Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        T operator*() {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

    private:
        T* m_ptr;
    };

    iterator begin() const {
        return iterator(m_data);
    }

    iterator end() const {
        return iterator(m_data, size());
    }

protected:
    int m_fd;
    size_t m_size;
    T* m_data;

    // Puts the object into the "nothing open" state.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    // Throws std::runtime_error when no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }

private:
    // Non-throwing teardown shared by close() and the destructor.
    // Returns false only when munmap reported an error.
    bool release() noexcept {
        if (!is_open()) return true;
        bool unmapped = true;
        // A descriptor may be held without a mapping (e.g. when mapping
        // failed after the file was opened); munmap(nullptr, 0) would fail.
        if (m_data != nullptr) {
            void* const addr = const_cast<void*>(static_cast<void const*>(m_data));
            unmapped = (munmap(addr, m_size) != -1);
        }
        ::close(m_fd);
        init();
        return unmapped;
    }
};
// Maps `size` bytes of descriptor `fd`, starting at offset 0, as a shared
// mapping with protection `prot`, and returns the address cast to Pointer.
// Throws std::runtime_error("mmap failed") when ::mmap returns MAP_FAILED.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    void* const addr = ::mmap(nullptr, size, prot, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        throw std::runtime_error("mmap failed");
    }
    return static_cast<Pointer>(addr);
}
// Read-only memory-mapped view of an existing file as an array of T const.
template <typename T>
struct file_source : public file<T const> {
    using base = file<T const>;

    file_source() {}

    // Opens and maps `path`; see open().
    file_source(std::string const& path, int adv = advice::normal) {
        open(path, adv);
    }

    // Maps the whole file at `path` read-only and passes `adv` to
    // posix_madvise for the mapped range.
    // Any file previously held by this object is released first.
    // Throws std::runtime_error when opening, fstat, mmap or posix_madvise
    // fails. On failure every resource acquired here is released and the
    // object is left in the closed state, so its destructor has nothing
    // inconsistent to clean up.
    void open(std::string const& path, int adv = advice::normal) {
        // Re-opening used to overwrite the members and leak the old
        // descriptor and mapping.
        base::close();

        int const fd = ::open(path.c_str(), O_RDONLY);
        if (fd == -1) throw std::runtime_error("cannot open file");

        struct stat fs;
        if (fstat(fd, &fs) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot stat file");
        }
        size_t const length = static_cast<size_t>(fs.st_size);

        T const* data = nullptr;
        try {
            data = mmap<T const*>(fd, length, PROT_READ);
        } catch (...) {
            ::close(fd);
            throw;
        }

        void* const addr = const_cast<void*>(static_cast<void const*>(data));
        if (posix_madvise(addr, length, adv)) {
            munmap(addr, length);
            ::close(fd);
            throw std::runtime_error("madvise failed");
        }

        // Commit only once everything succeeded.
        base::m_fd = fd;
        base::m_size = length;
        base::m_data = data;
    }
};
// Read/write memory-mapped view of a file as an array of T.
template <typename T>
struct file_sink : public file<T> {
    using base = file<T>;

    file_sink() {}

    // Maps an existing file; see open(path).
    file_sink(std::string const& path) {
        open(path);
    }

    // Creates/truncates a file holding n elements; see open(path, n).
    file_sink(std::string const& path, size_t n) {
        open(path, n);
    }

    // Maps the whole existing file at `path` for reading and writing.
    // Any file previously held by this object is released first.
    // Throws std::runtime_error when opening, fstat or mmap fails; the
    // object is left in the closed state in that case.
    void open(std::string const& path) {
        base::close();

        int const fd = ::open(path.c_str(), O_RDWR);
        if (fd == -1) throw std::runtime_error("cannot open file");

        struct stat fs;
        if (fstat(fd, &fs) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot stat file");
        }
        map_and_adopt(fd, static_cast<size_t>(fs.st_size));
    }

    // Creates `path` (or truncates it if it exists), resizes it to
    // n * sizeof(T) bytes and maps it for reading and writing.
    // Any file previously held by this object is released first.
    // Throws std::runtime_error when opening, resizing or mmap fails; the
    // object is left in the closed state in that case.
    void open(std::string const& path, size_t n) {
        static const mode_t mode = 0600;  // owner read/write

        base::close();

        int const fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
        if (fd == -1) throw std::runtime_error("cannot open file");

        size_t const length = n * sizeof(T);
        // The file must really have the new size before it is mapped;
        // the result of ftruncate used to be ignored.
        if (ftruncate(fd, static_cast<off_t>(length)) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot truncate file");
        }
        map_and_adopt(fd, length);
    }

private:
    // Maps `length` bytes of `fd` read/write and stores the result in the
    // base members. Closes `fd` and rethrows if the mapping fails.
    void map_and_adopt(int fd, size_t length) {
        T* data = nullptr;
        try {
            data = mmap<T*>(fd, length, PROT_READ | PROT_WRITE);
        } catch (...) {
            ::close(fd);
            throw;
        }
        base::m_fd = fd;
        base::m_size = length;
        base::m_data = data;
    }
};
} // namespace mm

View file

@ -8,7 +8,7 @@
#include "bit_vector.hpp" #include "bit_vector.hpp"
#include "exception.hpp" #include "exception.hpp"
#include "mm_vector.hpp" #include "vector_wrapper.hpp"
namespace xcdat { namespace xcdat {
@ -116,7 +116,7 @@ class tail_vector {
}; };
private: private:
mm_vector<char> m_chars; vector_wrapper<char> m_chars;
bit_vector m_terms; bit_vector m_terms;
public: public:
@ -219,6 +219,12 @@ class tail_vector {
inline std::uint64_t size() const { inline std::uint64_t size() const {
return m_chars.size(); return m_chars.size();
} }
//! Hand the data members to `visitor`: first m_chars, then m_terms.
//! NOTE(review): presumably this is the hook used by the essentials
//! save/load/size visitors, in which case the call order is part of the
//! serialized layout and must not be changed — confirm.
template <class Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_chars);
visitor.visit(m_terms);
}
}; };
} // namespace xcdat } // namespace xcdat

View file

@ -4,6 +4,7 @@
#include <optional> #include <optional>
#include <string> #include <string>
#include "essentials/essentials.hpp"
#include "trie_builder.hpp" #include "trie_builder.hpp"
namespace xcdat { namespace xcdat {
@ -51,6 +52,11 @@ class trie {
//! Move assignment operator //! Move assignment operator
trie& operator=(trie&&) noexcept = default; trie& operator=(trie&&) noexcept = default;
//! Construct the trie from the structures prepared by a trie_builder.
//! The number of keys is read from b.m_keys.size(). b.m_table, b.m_leaves and
//! b.m_suffixes are passed on as rvalues (std::move); b.m_terms and b.m_units
//! are passed as lvalues.
//! NOTE(review): the two `true` arguments given to m_terms presumably enable
//! auxiliary indexes of bit_vector — confirm against its constructor.
template <class Strings>
explicit trie(trie_builder<Strings>&& b)
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
/** /**
 * Build the trie dictionary from the input keywords. * Build the trie dictionary from the input keywords.
* @param[in] key The query keyword. * @param[in] key The query keyword.
@ -265,11 +271,6 @@ class trie {
} }
private: private:
//! Construct the trie from the structures prepared by a trie_builder.
//! The number of keys is read from b.m_keys.size(). b.m_table, b.m_leaves and
//! b.m_suffixes are passed on as rvalues (std::move); b.m_terms and b.m_units
//! are passed as lvalues.
//! NOTE(review): the two `true` arguments given to m_terms presumably enable
//! auxiliary indexes of bit_vector — confirm against its constructor.
template <class Strings>
explicit trie(trie_builder<Strings>&& b)
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
template <class String> template <class String>
static constexpr String get_suffix(const String& s, std::uint64_t i) { static constexpr String get_suffix(const String& s, std::uint64_t i) {
assert(i <= s.size()); assert(i <= s.size());

View file

@ -5,33 +5,36 @@
namespace xcdat { namespace xcdat {
//! A memory-mappable vector.
template <class T> template <class T>
class mm_vector { class vector_wrapper {
private: private:
std::vector<T> m_vec; std::vector<T> m_vec;
public: public:
mm_vector() = default; vector_wrapper() = default;
virtual ~mm_vector() = default; virtual ~vector_wrapper() = default;
mm_vector(const mm_vector&) = delete; vector_wrapper(const vector_wrapper&) = delete;
mm_vector& operator=(const mm_vector&) = delete; vector_wrapper& operator=(const vector_wrapper&) = delete;
mm_vector(mm_vector&&) noexcept = default; vector_wrapper(vector_wrapper&&) noexcept = default;
mm_vector& operator=(mm_vector&&) noexcept = default; vector_wrapper& operator=(vector_wrapper&&) noexcept = default;
explicit mm_vector(std::vector<T>&& vec) { explicit vector_wrapper(std::vector<T>&& vec) {
steal(vec); steal(vec);
} }
void steal(std::vector<T>& vec) { void steal(std::vector<T>& vec) {
m_vec.swap(vec); if (vec.size() != 0) {
m_vec = std::move(vec);
m_vec.shrink_to_fit(); m_vec.shrink_to_fit();
} else {
clear();
}
} }
void clear() { void clear() {
m_vec.clear(); *this = vector_wrapper<T>();
} }
inline std::uint64_t size() const { inline std::uint64_t size() const {

View file

@ -13,7 +13,7 @@ int main() {
std::sort(keys.begin(), keys.end()); std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
auto trie = xcdat::trie_8_type::build(keys); auto trie = xcdat::build<xcdat::trie_8_type>(keys);
std::cout << "Basic operations" << std::endl; std::cout << "Basic operations" << std::endl;
{ {
@ -41,5 +41,14 @@ int main() {
} }
} }
std::string index_filename = "tmp.idx";
std::cout << "mem: " << xcdat::save(trie, index_filename) << std::endl;
{
auto ohter = xcdat::load<xcdat::trie_8_type>(index_filename);
std::cout << "num_keys:" << ohter.num_keys() << std::endl;
std::cout << "mem: " << xcdat::get_memory_in_bytes(ohter) << std::endl;
}
return 0; return 0;
} }