This commit is contained in:
Shunsuke Kanda 2021-07-02 21:50:10 +09:00
parent 6a7c00d07a
commit db944cff22
13 changed files with 114 additions and 267 deletions

View file

@ -42,5 +42,7 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_
# Install the library
file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
install(FILES include/xcdat.hpp DESTINATION include)
install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)

View file

@ -1,6 +1,6 @@
# Xcdat: Fast compressed trie dictionary library
**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on the improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available at [here](https://kampersanda.github.io/pdf/KAIS2017.pdf).
**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on an improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available at [here](https://kampersanda.github.io/pdf/KAIS2017.pdf).
## Table of contents
@ -17,8 +17,8 @@
## Features
- **Compressed string dictionary.** Xcdat implements a (static) *compressed string dictionary* that stores a set of strings (or keywords) in a compressed space while supporting several search operations [1,2]. For example, Xcdat can store an entire set of English Wikipedia titles at half the size of the raw data.
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest data structure for trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches.
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes into strings, resulting in reducing random access and improving locality of references.
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches.
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes with strings to reduce random access and improve locality of reference.
- **Dictionary encoding.** Xcdat maps `N` distinct keywords into unique IDs from `[0,N-1]`, and supports the two symmetric operations: `lookup` returns the ID corresponding to a given keyword; `decode` returns the keyword associated with a given ID. The mapping is so-called *dictionary encoding* (or *domain encoding*) and is fundamental in many DB applications as described by Martínez-Prieto et al. [1] or Müller et al. [5].
- **Prefix search operations.** Xcdat supports prefix search operations realized by trie search algorithms: `prefix_search` returns all the keywords contained as prefixes of a given string; `predictive_search` returns all the keywords starting with a given string. These will be useful in many NLP applications such as auto completions [6], stemmed searches [7], or input method editors [8].
- **64-bit support.** As mentioned before, since the double array is a pointer-based data structure, most double-array libraries use 32-bit pointers to reduce memory consumption, resulting in limiting the scale of the input dataset. On the other hand, the XCDA method allows Xcdat to represent 64-bit pointers without sacrificing memory efficiency.
@ -50,11 +50,11 @@ The library considers a 64-bit operating system. The code has been tested only o
## Command line tools
Xcdat provides command line tools to build the index and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`.
Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). Each tool prints its command line options when run with the `-h` option.
### `xcdat_build`
It builds the trie index from a given dataset consisting of keywords separated by newlines. The following command builds the trie index from dataset `enwiki-titles.txt` and writes the index into file `idx.bin`.
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
```
$ xcdat_build enwiki-titles.txt idx.bin
@ -67,7 +67,7 @@ Memory usage in MiB: 162.714
### `xcdat_lookup`
It tests the `lookup` operation for a given index. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
```
$ xcdat_lookup idx.bin
@ -79,7 +79,7 @@ Double_Array
### `xcdat_decode`
It tests the `decode` operation for a given index. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
```
$ xcdat_decode idx.bin
@ -89,7 +89,7 @@ $ xcdat_decode idx.bin
### `xcdat_prefix_search`
It tests the `prefix_search` operation for a given index. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
```
$ xcdat_prefix_search idx.bin
@ -105,7 +105,7 @@ Algorithmic
### `xcdat_predictive_search`
It tests the `predictive_search` operation for a given index. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
```
$ xcdat_predictive_search idx.bin -n 3
@ -118,7 +118,7 @@ Algorithm
### `xcdat_enumerate`
It prints all the keywords stored in a given index.
It prints all the keywords stored in a given dictionary.
```
$ xcdat_enumerate idx.bin | head -3
@ -151,16 +151,17 @@ Decode time in microsec/query: 1.2341
## Sample usage
`sample/sample.cpp` provides a sample usage.
`sample/sample.cpp` provides a sample usage. It uses the external library [mm_file](https://github.com/jermp/mm_file) to memory-map the dictionary file; the mm_file headers are installed together with Xcdat by `make install`.
```c++
#include <iostream>
#include <string>
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
int main() {
// Input keys
// Dataset
std::vector<std::string> keys = {
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
@ -170,27 +171,32 @@ int main() {
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
const char* index_filename = "tmp.idx";
// The trie index type
// The trie dictionary type
using trie_type = xcdat::trie_8_type;
// Build and save the trie index.
// The dictionary filename
const char* tmp_filename = "dic.bin";
// Build and save the trie dictionary.
{
const trie_type trie(keys);
xcdat::save(trie, index_filename);
xcdat::save(trie, tmp_filename);
}
// Load the trie index.
const auto trie = xcdat::load<trie_type>(index_filename);
// Memory-map the trie dictionary.
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
const auto trie = xcdat::mmap<trie_type>(fin.data());
// Or, load the trie dictionary on memory.
// const auto trie = xcdat::load<trie_type>(tmp_filename);
// Basic statistics
std::cout << "NumberKeys: " << trie.num_keys() << std::endl;
std::cout << "MaxLength: " << trie.max_length() << std::endl;
std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl;
std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl;
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
// Lookup IDs from keys
// Lookup the ID for a query key.
{
const auto id = trie.lookup("Mac_Pro");
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
@ -200,7 +206,7 @@ int main() {
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
}
// Decode keys from IDs
// Decode the key for a query ID.
{
const auto dec = trie.decode(4);
std::cout << "Decode(4) = " << dec << std::endl;
@ -236,7 +242,8 @@ int main() {
std::cout << "}" << std::endl;
}
std::remove(index_filename);
std::remove(tmp_filename);
return 0;
}
```
@ -244,10 +251,10 @@ int main() {
The output will be
```
NumberKeys: 12
MaxLength: 11
AlphabetSize: 20
Memory: 1762 bytes
Number of keys: 12
Number of trie nodes: 28
Number of DA units: 256
Memory usage in bytes: 1766
Lookup(Mac_Pro) = 7
Lookup(Google_Pixel) = 18446744073709551615
Decode(4) = MacBook_Air
@ -451,15 +458,15 @@ class trie {
template <class Trie>
Trie mmap(const char* address);
//! Load the trie index from the file.
//! Load the trie dictionary from the file.
template <class Trie>
Trie load(std::string_view filepath);
//! Save the trie index to the file and returns the file size in bytes.
//! Save the trie dictionary to the file and return the file size in bytes.
template <class Trie>
std::uint64_t save(const Trie& idx, std::string_view filepath);
//! Get the index size in bytes.
//! Get the dictionary size in bytes.
template <class Trie>
std::uint64_t memory_in_bytes(const Trie& idx);

View file

@ -20,28 +20,28 @@ template <class Trie>
std::uint32_t flag;
visitor.visit(flag);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different.");
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
Trie idx;
visitor.visit(idx);
return idx;
}
//! Load the trie index from the file.
//! Load the trie dictionary from the file.
template <class Trie>
[[maybe_unused]] Trie load(std::string_view filepath) {
load_visitor visitor(filepath);
std::uint32_t flag;
visitor.visit(flag);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different.");
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
Trie idx;
visitor.visit(idx);
return idx;
}
//! Save the trie index to the file and returns the file size in bytes.
//! Save the trie dictionary to the file and return the file size in bytes.
template <class Trie>
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
save_visitor visitor(filepath);
@ -50,7 +50,7 @@ template <class Trie>
return visitor.bytes();
}
//! Get the index size in bytes.
//! Get the dictionary size in bytes.
template <class Trie>
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
size_visitor visitor;
@ -59,7 +59,7 @@ template <class Trie>
return visitor.bytes();
}
//! Get the flag indicating the trie type, embedded by the function 'save'.
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
std::ifstream ifs(filepath);

View file

@ -87,6 +87,9 @@ class trie_builder {
// Build the BC units
arrange(0, m_keys.size(), 0, 0);
// Finish
finish();
// Build the TAIL vector
m_suffixes.complete(m_bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { m_units[npos].base = tpos; });
}
@ -161,6 +164,13 @@ class trie_builder {
}
}
//! Close every block that is still reachable from the taboo unit.
//! Repeats until the 'base' field of the unit at 'taboo_npos' equals 'taboo_npos' itself;
//! on each round the block with index 'base / 256' is passed to close_block().
//! NOTE(review): 256 is presumably the number of units per block, and close_block() is
//! presumably what advances the taboo unit's 'base' -- confirm against close_block().
void finish() {
for (;;) {
const auto head = m_units[taboo_npos].base;
if (head == taboo_npos) {
break;
}
close_block(head / 256);
}
}
void arrange(std::uint64_t beg, std::uint64_t end, std::uint64_t kpos, std::uint64_t npos) {
if (m_keys[beg].size() == kpos) {
m_terms.set_bit(npos, true);

View file

@ -1,10 +1,11 @@
#include <iostream>
#include <string>
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
int main() {
// Input keys
// Dataset of keywords
std::vector<std::string> keys = {
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
@ -14,27 +15,32 @@ int main() {
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
const char* index_filename = "tmp.idx";
// The trie index type
// The trie dictionary type
using trie_type = xcdat::trie_8_type;
// Build and save the trie index.
// The dictionary filename
const char* tmp_filename = "dic.bin";
// Build and save the trie dictionary.
{
const trie_type trie(keys);
xcdat::save(trie, index_filename);
xcdat::save(trie, tmp_filename);
}
// Load the trie index.
const auto trie = xcdat::load<trie_type>(index_filename);
// Memory-map the trie dictionary.
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
const auto trie = xcdat::mmap<trie_type>(fin.data());
// Or, load the trie dictionary on memory.
// const auto trie = xcdat::load<trie_type>(tmp_filename);
// Basic statistics
std::cout << "NumberKeys: " << trie.num_keys() << std::endl;
std::cout << "MaxLength: " << trie.max_length() << std::endl;
std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl;
std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl;
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
// Lookup IDs from keys
// Lookup the ID for a query key.
{
const auto id = trie.lookup("Mac_Pro");
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
@ -44,7 +50,7 @@ int main() {
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
}
// Decode keys from IDs
// Decode the key for a query ID.
{
const auto dec = trie.decode(4);
std::cout << "Decode(4) = " << dec << std::endl;
@ -80,6 +86,7 @@ int main() {
std::cout << "}" << std::endl;
}
std::remove(index_filename);
std::remove(tmp_filename);
return 0;
}

View file

@ -1,177 +0,0 @@
#pragma once
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd)
#include <stdexcept>
#include <string>
#include <type_traits>
namespace mm {

//! Access-pattern hints forwarded to posix_madvise() by file_source.
namespace advice {
static const int normal = POSIX_MADV_NORMAL;
static const int random = POSIX_MADV_RANDOM;
static const int sequential = POSIX_MADV_SEQUENTIAL;
}  // namespace advice

//! Common state of a memory-mapped file: a descriptor, the mapped size in bytes,
//! and a typed pointer to the mapping. The object owns both the descriptor and
//! the mapping and releases them on close() or destruction.
template <typename T>
struct file {
    file() {
        init();
    }

    //! Releases the mapping and the descriptor. Never throws: a destructor is
    //! implicitly noexcept, so a failing munmap() is ignored here instead of
    //! terminating the process. Call close() explicitly to observe the error.
    ~file() {
        release();
    }

    file(file const&) = delete;             // non construction-copyable
    file& operator=(file const&) = delete;  // non copyable

    //! True while a file descriptor is held.
    bool is_open() const {
        return m_fd != -1;
    }

    //! Unmaps the file and closes the descriptor; a no-op if nothing is open.
    //! The object is always reset to the closed state, even on failure.
    //! @throws std::runtime_error if munmap() reports an error.
    void close() {
        if (!is_open()) {
            return;
        }
        if (!release()) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    //! Size of the mapping in bytes.
    size_t bytes() const {
        return m_size;
    }

    //! Number of whole elements of type T in the mapping.
    size_t size() const {
        return m_size / sizeof(T);
    }

    //! Pointer to the first mapped element (nullptr when nothing is mapped).
    T* data() const {
        return m_data;
    }

    //! Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        T operator*() {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

      private:
        T* m_ptr;
    };

    iterator begin() const {
        return iterator(m_data);
    }

    iterator end() const {
        return iterator(m_data, size());
    }

  protected:
    int m_fd;
    size_t m_size;
    T* m_data;

    //! Resets the members to the closed state without releasing anything.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    //! @throws std::runtime_error if no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }

    //! Unmaps (only if a mapping exists) and closes the descriptor (only if one
    //! is held), then resets the members. Safe to call on a half-opened object,
    //! i.e. when the descriptor is set but the mapping was never created.
    //! @return false if munmap() failed, true otherwise.
    bool release() noexcept {
        bool unmapped = true;
        if (m_data != nullptr) {
            // T may be const-qualified (file_source); munmap() takes a plain void*.
            void* addr = const_cast<void*>(static_cast<void const*>(m_data));
            unmapped = (::munmap(addr, m_size) != -1);
        }
        if (m_fd != -1) {
            ::close(m_fd);
        }
        init();
        return unmapped;
    }
};

//! Maps the first 'size' bytes of 'fd' as a shared mapping with protection 'prot'.
//! @throws std::runtime_error if the system call fails (this includes size == 0).
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    static const size_t offset = 0;
    // Compare the raw result with MAP_FAILED before converting to the typed pointer.
    void* const addr = ::mmap(nullptr, size, prot, MAP_SHARED, fd, offset);
    if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");
    return static_cast<Pointer>(addr);
}

//! Read-only memory-mapped view of an existing file.
template <typename T>
struct file_source : public file<T const> {
    typedef file<T const> base;

    file_source() {}

    file_source(std::string const& path, int adv = advice::normal) {
        open(path, adv);
    }

    //! Maps the whole file at 'path' read-only and applies the advice 'adv'.
    //! Anything previously opened by this object is closed first. On failure
    //! the object is left closed and no descriptor is leaked.
    //! @throws std::runtime_error if the file cannot be opened, stat'ed, mapped
    //!         or advised.
    void open(std::string const& path, int adv = advice::normal) {
        base::close();  // do not leak an earlier descriptor/mapping
        base::m_fd = ::open(path.c_str(), O_RDONLY);
        base::check_fd();
        try {
            struct stat fs;
            if (fstat(base::m_fd, &fs) == -1) {
                throw std::runtime_error("cannot stat file");
            }
            base::m_size = static_cast<size_t>(fs.st_size);
            base::m_data = mm::mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
            void* addr = const_cast<void*>(static_cast<void const*>(base::m_data));
            if (posix_madvise(addr, base::m_size, adv)) {
                throw std::runtime_error("madvise failed");
            }
        } catch (...) {
            base::release();
            throw;
        }
    }
};

//! Read/write memory-mapped view of a file.
template <typename T>
struct file_sink : public file<T> {
    typedef file<T> base;

    file_sink() {}

    file_sink(std::string const& path) {
        open(path);
    }

    file_sink(std::string const& path, size_t n) {
        open(path, n);
    }

    //! Maps the whole existing file at 'path' for reading and writing.
    //! Anything previously opened by this object is closed first. On failure
    //! the object is left closed and no descriptor is leaked.
    //! @throws std::runtime_error if the file cannot be opened, stat'ed or mapped.
    void open(std::string const& path) {
        base::close();  // do not leak an earlier descriptor/mapping
        // No O_CREAT here, so no creation mode is needed.
        base::m_fd = ::open(path.c_str(), O_RDWR);
        base::check_fd();
        try {
            struct stat fs;
            if (fstat(base::m_fd, &fs) == -1) {
                throw std::runtime_error("cannot stat file");
            }
            base::m_size = static_cast<size_t>(fs.st_size);
            base::m_data = mm::mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
        } catch (...) {
            base::release();
            throw;
        }
    }

    //! Creates (or truncates) the file at 'path', resizes it to hold 'n'
    //! elements of type T, and maps it for reading and writing.
    //! Anything previously opened by this object is closed first. On failure
    //! the object is left closed and no descriptor is leaked.
    //! @throws std::runtime_error if the file cannot be opened, resized or mapped.
    void open(std::string const& path, size_t n) {
        static const mode_t mode = 0600;  // read/write
        base::close();  // do not leak an earlier descriptor/mapping
        base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
        base::check_fd();
        try {
            base::m_size = n * sizeof(T);
            // The file must really have the new size before it is mapped;
            // touching a mapping beyond the end of the file raises SIGBUS.
            if (ftruncate(base::m_fd, static_cast<off_t>(base::m_size)) == -1) {
                throw std::runtime_error("cannot truncate file");
            }
            base::m_data = mm::mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
        } catch (...) {
            base::release();
            throw;
        }
    }
};

}  // namespace mm

View file

@ -1,5 +1,3 @@
#include <chrono>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
@ -7,8 +5,8 @@
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_keys", "Input filepath of data keys");
p.add("output_idx", "Output filepath of trie index");
p.add("input_keys", "Input filepath of keywords");
p.add("output_dic", "Output filepath of trie dictionary");
p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
p.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
return p;
@ -17,7 +15,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
template <class Trie>
int build(const cmd_line_parser::parser& p) {
const auto input_keys = p.get<std::string>("input_keys");
const auto output_idx = p.get<std::string>("output_idx");
const auto output_dic = p.get<std::string>("output_dic");
const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys);
@ -37,7 +35,7 @@ int build(const cmd_line_parser::parser& p) {
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
xcdat::save(trie, output_idx);
xcdat::save(trie, output_dic);
return 0;
}

View file

@ -1,20 +1,20 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "mm_file/mm_file.hpp"
#include "tinyformat/tinyformat.h"
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_idx", "Input filepath of trie index");
p.add("input_dic", "Input filepath of trie dictionary");
return p;
}
template <class Trie>
int decode(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto input_dic = p.get<std::string>("input_dic");
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
const auto trie = xcdat::mmap<Trie>(fin.data());
for (std::uint64_t id; std::cin >> id;) {
@ -36,8 +36,8 @@ int main(int argc, char** argv) {
return 1;
}
const auto input_idx = p.get<std::string>("input_idx");
const auto flag = xcdat::get_flag(input_idx);
const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic);
switch (flag) {
case 7:

View file

@ -1,20 +1,20 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "mm_file/mm_file.hpp"
#include "tinyformat/tinyformat.h"
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_idx", "Input filepath of trie index");
p.add("input_dic", "Input filepath of trie dictionary");
return p;
}
template <class Trie>
int enumerate(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto input_dic = p.get<std::string>("input_dic");
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
const auto trie = xcdat::mmap<Trie>(fin.data());
trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); });
@ -33,8 +33,8 @@ int main(int argc, char** argv) {
return 1;
}
const auto input_idx = p.get<std::string>("input_idx");
const auto flag = xcdat::get_flag(input_idx);
const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic);
switch (flag) {
case 7:

View file

@ -1,20 +1,20 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "mm_file/mm_file.hpp"
#include "tinyformat/tinyformat.h"
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_idx", "Input filepath of trie index");
p.add("input_dic", "Input filepath of trie dictionary");
return p;
}
template <class Trie>
int lookup(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto input_dic = p.get<std::string>("input_dic");
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
const auto trie = xcdat::mmap<Trie>(fin.data());
for (std::string str; std::getline(std::cin, str);) {
@ -40,8 +40,8 @@ int main(int argc, char** argv) {
return 1;
}
const auto input_idx = p.get<std::string>("input_idx");
const auto flag = xcdat::get_flag(input_idx);
const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic);
switch (flag) {
case 7:

View file

@ -1,22 +1,22 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "mm_file/mm_file.hpp"
#include "tinyformat/tinyformat.h"
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_idx", "Input filepath of trie index");
p.add("input_dic", "Input filepath of trie dictionary");
p.add("max_num_results", "The max number of results (default=10)", "-n", false);
return p;
}
template <class Trie>
int predictive_search(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto input_dic = p.get<std::string>("input_dic");
const auto max_num_results = p.get<std::uint64_t>("max_num_results", 10);
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
const auto trie = xcdat::mmap<Trie>(fin.data());
struct result_type {
@ -53,8 +53,8 @@ int main(int argc, char** argv) {
return 1;
}
const auto input_idx = p.get<std::string>("input_idx");
const auto flag = xcdat::get_flag(input_idx);
const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic);
switch (flag) {
case 7:

View file

@ -1,20 +1,20 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "mm_file/mm_file.hpp"
#include "tinyformat/tinyformat.h"
cmd_line_parser::parser make_parser(int argc, char** argv) {
cmd_line_parser::parser p(argc, argv);
p.add("input_idx", "Input filepath of trie index");
p.add("input_dic", "Input filepath of trie dictionary");
return p;
}
template <class Trie>
int prefix_search(const cmd_line_parser::parser& p) {
const auto input_idx = p.get<std::string>("input_idx");
const auto input_dic = p.get<std::string>("input_dic");
const mm::file_source<char> fin(input_idx.c_str(), mm::advice::sequential);
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
const auto trie = xcdat::mmap<Trie>(fin.data());
struct result_type {
@ -49,8 +49,8 @@ int main(int argc, char** argv) {
return 1;
}
const auto input_idx = p.get<std::string>("input_idx");
const auto flag = xcdat::get_flag(input_idx);
const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic);
switch (flag) {
case 7: