minor
This commit is contained in:
parent
6a7c00d07a
commit
db944cff22
|
@ -42,5 +42,7 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_
|
|||
|
||||
# Install the library
|
||||
file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
|
||||
file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
|
||||
install(FILES include/xcdat.hpp DESTINATION include)
|
||||
install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
|
||||
install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)
|
||||
|
|
73
README.md
73
README.md
|
@ -1,6 +1,6 @@
|
|||
# Xcdat: Fast compressed trie dictionary library
|
||||
|
||||
**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on the improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available at [here](https://kampersanda.github.io/pdf/KAIS2017.pdf).
|
||||
**Xcdat** is a C++17 header-only library implementing a fast compressed string dictionary based on an improved double-array trie structure described in the paper [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017 (PDF available [here](https://kampersanda.github.io/pdf/KAIS2017.pdf)).
|
||||
|
||||
## Table of contents
|
||||
|
||||
|
@ -17,8 +17,8 @@
|
|||
## Features
|
||||
|
||||
- **Compressed string dictionary.** Xcdat implements a (static) *compressed string dictionary* that stores a set of strings (or keywords) in a compressed space while supporting several search operations [1,2]. For example, Xcdat can store an entire set of English Wikipedia titles at half the size of the raw data.
|
||||
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest data structure for trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches.
|
||||
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes into strings, resulting in reducing random access and improving locality of references.
|
||||
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3], known as the fastest trie implementation. However, the double-array trie relies on many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2], which represents the double-array trie in a compressed format while maintaining fast searches.
|
||||
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes with strings to reduce random access and improve locality of reference.
|
||||
- **Dictionary encoding.** Xcdat maps `N` distinct keywords into unique IDs from `[0,N-1]`, and supports the two symmetric operations: `lookup` returns the ID corresponding to a given keyword; `decode` returns the keyword associated with a given ID. This mapping is the so-called *dictionary encoding* (or *domain encoding*) and is fundamental in many DB applications, as described by Martínez-Prieto et al. [1] or Müller et al. [5].
|
||||
- **Prefix search operations.** Xcdat supports prefix search operations realized by trie search algorithms: `prefix_search` returns all the keywords contained as prefixes of a given string; `predictive_search` returns all the keywords starting with a given string. These are useful in many NLP applications such as auto-completion [6], stemmed searches [7], or input method editors [8].
|
||||
- **64-bit support.** As mentioned before, since the double array is a pointer-based data structure, most double-array libraries use 32-bit pointers to reduce memory consumption, which limits the scale of the input dataset. In contrast, the XCDA method allows Xcdat to represent 64-bit pointers without sacrificing memory efficiency.
|
||||
|
@ -50,11 +50,11 @@ The library considers a 64-bit operating system. The code has been tested only o
|
|||
|
||||
## Command line tools
|
||||
|
||||
Xcdat provides command line tools to build the index and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`.
|
||||
Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). Each tool prints its command line options when given the parameter `-h`.
|
||||
|
||||
### `xcdat_build`
|
||||
|
||||
It builds the trie index from a given dataset consisting of keywords separated by newlines. The following command builds the trie index from dataset `enwiki-titles.txt` and writes the index into file `idx.bin`.
|
||||
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
|
||||
|
||||
```
|
||||
$ xcdat_build enwiki-titles.txt idx.bin
|
||||
|
@ -67,7 +67,7 @@ Memory usage in MiB: 162.714
|
|||
|
||||
### `xcdat_lookup`
|
||||
|
||||
It tests the `lookup` operation for a given index. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
|
||||
It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
|
||||
|
||||
```
|
||||
$ xcdat_lookup idx.bin
|
||||
|
@ -79,7 +79,7 @@ Double_Array
|
|||
|
||||
### `xcdat_decode`
|
||||
|
||||
It tests the `decode` operation for a given index. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
|
||||
It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
|
||||
|
||||
```
|
||||
$ xcdat_decode idx.bin
|
||||
|
@ -89,7 +89,7 @@ $ xcdat_decode idx.bin
|
|||
|
||||
### `xcdat_prefix_search`
|
||||
|
||||
It tests the `prefix_search` operation for a given index. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
|
||||
It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
|
||||
|
||||
```
|
||||
$ xcdat_prefix_search idx.bin
|
||||
|
@ -105,7 +105,7 @@ Algorithmic
|
|||
|
||||
### `xcdat_predictive_search`
|
||||
|
||||
It tests the `predictive_search` operation for a given index. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
|
||||
It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
|
||||
|
||||
```
|
||||
$ xcdat_predictive_search idx.bin -n 3
|
||||
|
@ -118,7 +118,7 @@ Algorithm
|
|||
|
||||
### `xcdat_enumerate`
|
||||
|
||||
It prints all the keywords stored in a given index.
|
||||
It prints all the keywords stored in a given dictionary.
|
||||
|
||||
```
|
||||
$ xcdat_enumerate idx.bin | head -3
|
||||
|
@ -151,16 +151,17 @@ Decode time in microsec/query: 1.2341
|
|||
|
||||
## Sample usage
|
||||
|
||||
`sample/sample.cpp` provides a sample usage.
|
||||
`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file; mm_file is installed together with Xcdat by `make install`.
|
||||
|
||||
```c++
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
// Input keys
|
||||
// Dataset
|
||||
std::vector<std::string> keys = {
|
||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
|
@ -170,27 +171,32 @@ int main() {
|
|||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
const char* index_filename = "tmp.idx";
|
||||
|
||||
// The trie index type
|
||||
// The trie dictionary type
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
|
||||
// Build and save the trie index.
|
||||
// The dictionary filename
|
||||
const char* tmp_filename = "dic.bin";
|
||||
|
||||
// Build and save the trie dictionary.
|
||||
{
|
||||
const trie_type trie(keys);
|
||||
xcdat::save(trie, index_filename);
|
||||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Load the trie index.
|
||||
const auto trie = xcdat::load<trie_type>(index_filename);
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "NumberKeys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "MaxLength: " << trie.max_length() << std::endl;
|
||||
std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl;
|
||||
std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl;
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
|
||||
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
|
||||
|
||||
// Lookup IDs from keys
|
||||
// Lookup the ID for a query key.
|
||||
{
|
||||
const auto id = trie.lookup("Mac_Pro");
|
||||
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
|
@ -200,7 +206,7 @@ int main() {
|
|||
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
|
||||
// Decode keys from IDs
|
||||
// Decode the key for a query ID.
|
||||
{
|
||||
const auto dec = trie.decode(4);
|
||||
std::cout << "Decode(4) = " << dec << std::endl;
|
||||
|
@ -236,7 +242,8 @@ int main() {
|
|||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
std::remove(index_filename);
|
||||
std::remove(tmp_filename);
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
@ -244,10 +251,10 @@ int main() {
|
|||
The output will be
|
||||
|
||||
```
|
||||
NumberKeys: 12
|
||||
MaxLength: 11
|
||||
AlphabetSize: 20
|
||||
Memory: 1762 bytes
|
||||
Number of keys: 12
|
||||
Number of trie nodes: 28
|
||||
Number of DA units: 256
|
||||
Memory usage in bytes: 1766
|
||||
Lookup(Mac_Pro) = 7
|
||||
Lookup(Google_Pixel) = 18446744073709551615
|
||||
Decode(4) = MacBook_Air
|
||||
|
@ -451,15 +458,15 @@ class trie {
|
|||
template <class Trie>
|
||||
Trie mmap(const char* address);
|
||||
|
||||
//! Load the trie index from the file.
|
||||
//! Load the trie dictionary from the file.
|
||||
template <class Trie>
|
||||
Trie load(std::string_view filepath);
|
||||
|
||||
//! Save the trie index to the file and returns the file size in bytes.
|
||||
//! Save the trie dictionary to the file and returns the file size in bytes.
|
||||
template <class Trie>
|
||||
std::uint64_t save(const Trie& idx, std::string_view filepath);
|
||||
|
||||
//! Get the index size in bytes.
|
||||
//! Get the dictionary size in bytes.
|
||||
template <class Trie>
|
||||
std::uint64_t memory_in_bytes(const Trie& idx);
|
||||
|
||||
|
|
|
@ -20,28 +20,28 @@ template <class Trie>
|
|||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different.");
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
return idx;
|
||||
}
|
||||
|
||||
//! Load the trie index from the file.
|
||||
//! Load the trie dictionary from the file.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] Trie load(std::string_view filepath) {
|
||||
load_visitor visitor(filepath);
|
||||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different.");
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
return idx;
|
||||
}
|
||||
|
||||
//! Save the trie index to the file and returns the file size in bytes.
|
||||
//! Save the trie dictionary to the file and returns the file size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
||||
save_visitor visitor(filepath);
|
||||
|
@ -50,7 +50,7 @@ template <class Trie>
|
|||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the index size in bytes.
|
||||
//! Get the dictionary size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||
size_visitor visitor;
|
||||
|
@ -59,7 +59,7 @@ template <class Trie>
|
|||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the flag indicating the trie type, embedded by the function 'save'.
|
||||
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
|
||||
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
|
||||
std::ifstream ifs(filepath);
|
||||
|
|
|
@ -87,6 +87,9 @@ class trie_builder {
|
|||
// Build the BC units
|
||||
arrange(0, m_keys.size(), 0, 0);
|
||||
|
||||
// Finish
|
||||
finish();
|
||||
|
||||
// Build the TAIL vector
|
||||
m_suffixes.complete(m_bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { m_units[npos].base = tpos; });
|
||||
}
|
||||
|
@ -161,6 +164,13 @@ class trie_builder {
|
|||
}
|
||||
}
|
||||
|
||||
// Finalization step run after arrange() has built the BC units.
// Repeats while the unit at taboo_npos has a base different from taboo_npos:
// the block index is that base divided by 256, and the block is handed to
// close_block().
// NOTE(review): presumably blocks are 256 units wide and close_block() moves
// m_units[taboo_npos].base forward, otherwise this loop would not terminate —
// confirm against close_block().
void finish() {
|
||||
while (m_units[taboo_npos].base != taboo_npos) {
|
||||
// Index of the block that the current base falls into.
auto bpos = m_units[taboo_npos].base / 256;
|
||||
close_block(bpos);
|
||||
}
|
||||
}
|
||||
|
||||
void arrange(std::uint64_t beg, std::uint64_t end, std::uint64_t kpos, std::uint64_t npos) {
|
||||
if (m_keys[beg].size() == kpos) {
|
||||
m_terms.set_bit(npos, true);
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
// Input keys
|
||||
// Dataset of keywords
|
||||
std::vector<std::string> keys = {
|
||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
|
@ -14,27 +15,32 @@ int main() {
|
|||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
const char* index_filename = "tmp.idx";
|
||||
|
||||
// The trie index type
|
||||
// The trie dictionary type
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
|
||||
// Build and save the trie index.
|
||||
// The dictionary filename
|
||||
const char* tmp_filename = "dic.bin";
|
||||
|
||||
// Build and save the trie dictionary.
|
||||
{
|
||||
const trie_type trie(keys);
|
||||
xcdat::save(trie, index_filename);
|
||||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Load the trie index.
|
||||
const auto trie = xcdat::load<trie_type>(index_filename);
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "NumberKeys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "MaxLength: " << trie.max_length() << std::endl;
|
||||
std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl;
|
||||
std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl;
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
|
||||
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
|
||||
|
||||
// Lookup IDs from keys
|
||||
// Lookup the ID for a query key.
|
||||
{
|
||||
const auto id = trie.lookup("Mac_Pro");
|
||||
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
|
@ -44,7 +50,7 @@ int main() {
|
|||
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
|
||||
// Decode keys from IDs
|
||||
// Decode the key for a query ID.
|
||||
{
|
||||
const auto dec = trie.decode(4);
|
||||
std::cout << "Decode(4) = " << dec << std::endl;
|
||||
|
@ -80,6 +86,7 @@ int main() {
|
|||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
std::remove(index_filename);
|
||||
std::remove(tmp_filename);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,177 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd)

#include <cstddef>
#include <stdexcept>
#include <string>
#include <type_traits>
|
||||
|
||||
namespace mm {
|
||||
|
||||
// Access-pattern hints for a mapping. Each constant is the corresponding
// POSIX_MADV_* value; file_source passes the chosen one to posix_madvise().
namespace advice {
|
||||
// No particular access pattern (the default used by file_source).
static const int normal = POSIX_MADV_NORMAL;
|
||||
// Random access pattern.
static const int random = POSIX_MADV_RANDOM;
|
||||
// Sequential access pattern.
static const int sequential = POSIX_MADV_SEQUENTIAL;
|
||||
} // namespace advice
|
||||
|
||||
// Base class for a memory-mapped file viewed as an array of T.
//
// It stores the file descriptor, the mapping size in bytes and the mapped
// address. Derived classes fill these protected members when they open a
// file; this class is only responsible for releasing them.
template <typename T>
struct file {
    file() {
        init();
    }

    // Releases the mapping and the descriptor without throwing. Destructors
    // are implicitly noexcept, so letting close() throw from here would call
    // std::terminate (in particular while unwinding from a failed open()).
    ~file() {
        release();
    }

    file(file const&) = delete;             // non construction-copyable
    file& operator=(file const&) = delete;  // non copyable

    // True when a file descriptor is currently held.
    bool is_open() const {
        return m_fd != -1;
    }

    // Unmaps the file and closes the descriptor; a no-op when nothing is open.
    // The object is always reset to the closed state, even on failure.
    // Throws std::runtime_error if munmap() reported an error.
    void close() {
        if (!release()) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    // Size of the mapping in bytes.
    size_t bytes() const {
        return m_size;
    }

    // Number of whole T elements in the mapping.
    size_t size() const {
        return m_size / sizeof(T);
    }

    // Address of the first mapped element (nullptr when nothing is mapped).
    T* data() const {
        return m_data;
    }

    // Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        T operator*() {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

      private:
        T* m_ptr;
    };

    iterator begin() const {
        return iterator(m_data);
    }

    iterator end() const {
        return iterator(m_data, size());
    }

  protected:
    int m_fd;
    size_t m_size;
    T* m_data;

    // Resets the members to the closed state without releasing anything.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    // Throws std::runtime_error if no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }

  private:
    // Releases whatever is held and resets the object. Returns false if
    // munmap() failed. The mapping is only unmapped when one exists, so a
    // descriptor opened without a mapping is closed cleanly instead of
    // failing on munmap(nullptr, 0).
    bool release() noexcept {
        bool unmapped = true;
        if (m_data != nullptr) {
            void* const addr = const_cast<void*>(static_cast<void const volatile*>(m_data));
            unmapped = (::munmap(addr, m_size) != -1);
        }
        if (m_fd != -1) {
            ::close(m_fd);
        }
        init();
        return unmapped;
    }
};
|
||||
|
||||
// Maps the first `size` bytes of the file behind `fd` as a shared mapping
// with protection `prot`, and returns the address converted to `Pointer`.
// Throws std::runtime_error if the underlying ::mmap call fails.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    // Let the kernel choose the address; always map from file offset 0.
    void* const addr = ::mmap(nullptr, size, prot, MAP_SHARED, fd, 0);
    // Compare the raw result with MAP_FAILED before converting it.
    if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");
    return static_cast<Pointer>(addr);
}
|
||||
|
||||
template <typename T>
|
||||
struct file_source : public file<T const> {
|
||||
typedef file<T const> base;
|
||||
|
||||
file_source() {}
|
||||
|
||||
file_source(std::string const& path, int adv = advice::normal) {
|
||||
open(path, adv);
|
||||
}
|
||||
|
||||
void open(std::string const& path, int adv = advice::normal) {
|
||||
base::m_fd = ::open(path.c_str(), O_RDONLY);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data = mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
|
||||
if (posix_madvise((void*)base::m_data, base::m_size, adv)) {
|
||||
throw std::runtime_error("madvise failed");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct file_sink : public file<T> {
|
||||
typedef file<T> base;
|
||||
|
||||
file_sink() {}
|
||||
|
||||
file_sink(std::string const& path) {
|
||||
open(path);
|
||||
}
|
||||
|
||||
file_sink(std::string const& path, size_t n) {
|
||||
open(path, n);
|
||||
}
|
||||
|
||||
void open(std::string const& path) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR, mode);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
|
||||
void open(std::string const& path, size_t n) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
|
||||
base::check_fd();
|
||||
base::m_size = n * sizeof(T);
|
||||
ftruncate(base::m_fd,
|
||||
base::m_size); // truncate the file at the new size
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace mm
|
|
@ -1,5 +1,3 @@
|
|||
#include <chrono>
|
||||
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
|
@ -7,8 +5,8 @@
|
|||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_keys", "Input filepath of data keys");
|
||||
p.add("output_idx", "Output filepath of trie index");
|
||||
p.add("input_keys", "Input filepath of keywords");
|
||||
p.add("output_dic", "Output filepath of trie dictionary");
|
||||
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
|
||||
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
|
||||
return p;
|
||||
|
@ -17,7 +15,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
|||
template <class Trie>
|
||||
int build(const cmd_line_parser::parser& p) {
|
||||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto output_idx = p.get<std::string>("output_idx");
|
||||
const auto output_dic = p.get<std::string>("output_dic");
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
|
@ -37,7 +35,7 @@ int build(const cmd_line_parser::parser& p) {
|
|||
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
|
||||
xcdat::save(trie, output_idx);
|
||||
xcdat::save(trie, output_dic);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_idx", "Input filepath of trie index");
|
||||
p.add("input_dic", "Input filepath of trie dictionary");
|
||||
return p;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
int decode(const cmd_line_parser::parser& p) {
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
for (std::uint64_t id; std::cin >> id;) {
|
||||
|
@ -36,8 +36,8 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto flag = xcdat::get_flag(input_idx);
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_idx", "Input filepath of trie index");
|
||||
p.add("input_dic", "Input filepath of trie dictionary");
|
||||
return p;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
int enumerate(const cmd_line_parser::parser& p) {
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); });
|
||||
|
@ -33,8 +33,8 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto flag = xcdat::get_flag(input_idx);
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_idx", "Input filepath of trie index");
|
||||
p.add("input_dic", "Input filepath of trie dictionary");
|
||||
return p;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
int lookup(const cmd_line_parser::parser& p) {
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
for (std::string str; std::getline(std::cin, str);) {
|
||||
|
@ -40,8 +40,8 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto flag = xcdat::get_flag(input_idx);
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
|
|
|
@ -1,22 +1,22 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_idx", "Input filepath of trie index");
|
||||
p.add("input_dic", "Input filepath of trie dictionary");
|
||||
p.add("max_num_results", "The max number of results (default=10)", "-n", false);
|
||||
return p;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
int predictive_search(const cmd_line_parser::parser& p) {
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto max_num_results = p.get<std::uint64_t>("max_num_results", 10);
|
||||
|
||||
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
struct result_type {
|
||||
|
@ -53,8 +53,8 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto flag = xcdat::get_flag(input_idx);
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
cmd_line_parser::parser p(argc, argv);
|
||||
p.add("input_idx", "Input filepath of trie index");
|
||||
p.add("input_dic", "Input filepath of trie dictionary");
|
||||
return p;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
int prefix_search(const cmd_line_parser::parser& p) {
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
struct result_type {
|
||||
|
@ -49,8 +49,8 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const auto input_idx = p.get<std::string>("input_idx");
|
||||
const auto flag = xcdat::get_flag(input_idx);
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
|
|
Loading…
Reference in a new issue