add
This commit is contained in:
parent
2955cab72c
commit
178bbb46d3
|
@ -42,7 +42,5 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_
|
|||
|
||||
# Install the library
|
||||
file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
|
||||
file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
|
||||
install(FILES include/xcdat.hpp DESTINATION include)
|
||||
install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
|
||||
install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)
|
||||
|
|
80
README.md
80
README.md
|
@ -52,17 +52,19 @@ The library considers a 64-bit operating system. The code has been tested only o
|
|||
|
||||
Xcdat provides command-line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). Every tool prints its command-line options when run with the parameter `-h`.
|
||||
|
||||
The tools employ the external libraries [cmd_line_parser](https://github.com/jermp/cmd_line_parser), [mm_file](https://github.com/jermp/mm_file), and [tinyformat](https://github.com/c42f/tinyformat), which are contained in the repository.
|
||||
|
||||
### `xcdat_build`
|
||||
|
||||
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
|
||||
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `dic.bin`.
|
||||
|
||||
```
|
||||
$ xcdat_build enwiki-titles.txt idx.bin
|
||||
$ xcdat_build enwiki-titles.txt dic.bin
|
||||
Number of keys: 15955763
|
||||
Number of trie nodes: 36441058
|
||||
Number of DA units: 36520704
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
Number of trie nodes: 36439320
|
||||
Number of DA units: 36515840
|
||||
Memory usage in bytes: 1.64104e+08
|
||||
Memory usage in MiB: 156.502
|
||||
```
|
||||
|
||||
### `xcdat_lookup`
|
||||
|
@ -70,7 +72,7 @@ Memory usage in MiB: 162.714
|
|||
It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
|
||||
|
||||
```
|
||||
$ xcdat_lookup idx.bin
|
||||
$ xcdat_lookup dic.bin
|
||||
Algorithm
|
||||
1255938 Algorithm
|
||||
Double_Array
|
||||
|
@ -82,7 +84,7 @@ Double_Array
|
|||
It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
|
||||
|
||||
```
|
||||
$ xcdat_decode idx.bin
|
||||
$ xcdat_decode dic.bin
|
||||
1255938
|
||||
1255938 Algorithm
|
||||
```
|
||||
|
@ -92,7 +94,7 @@ $ xcdat_decode idx.bin
|
|||
It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
|
||||
|
||||
```
|
||||
$ xcdat_prefix_search idx.bin
|
||||
$ xcdat_prefix_search dic.bin
|
||||
Algorithmic
|
||||
6 found
|
||||
57 A
|
||||
|
@ -108,7 +110,7 @@ Algorithmic
|
|||
It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
|
||||
|
||||
```
|
||||
$ xcdat_predictive_search idx.bin -n 3
|
||||
$ xcdat_predictive_search dic.bin -n 3
|
||||
Algorithm
|
||||
263 found
|
||||
1255938 Algorithm
|
||||
|
@ -121,7 +123,7 @@ Algorithm
|
|||
It prints all the keywords stored in a given dictionary.
|
||||
|
||||
```
|
||||
$ xcdat_enumerate idx.bin | head -3
|
||||
$ xcdat_enumerate dic.bin | head -3
|
||||
0 !
|
||||
107 !!
|
||||
138 !!!
|
||||
|
@ -137,31 +139,44 @@ $ xcdat_benchmark enwiki-titles.txt
|
|||
Number of keys: 15955763
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
Construction time in seconds: 12.907
|
||||
Lookup time in microsec/query: 0.4674
|
||||
Decode time in microsec/query: 0.8722
|
||||
Construction time in seconds: 13.501
|
||||
Lookup time in microsec/query: 0.5708
|
||||
Decode time in microsec/query: 1.0846
|
||||
** xcdat::trie_8_type **
|
||||
Number of keys: 15955763
|
||||
Memory usage in bytes: 1.64104e+08
|
||||
Memory usage in MiB: 156.502
|
||||
Construction time in seconds: 13.442
|
||||
Lookup time in microsec/query: 0.7593
|
||||
Decode time in microsec/query: 1.2341
|
||||
Construction time in seconds: 13.626
|
||||
Lookup time in microsec/query: 0.6391
|
||||
Decode time in microsec/query: 1.0531
|
||||
** xcdat::trie_15_type **
|
||||
Number of keys: 15955763
|
||||
Memory usage in bytes: 2.05737e+08
|
||||
Memory usage in MiB: 196.206
|
||||
Construction time in seconds: 13.425
|
||||
Lookup time in microsec/query: 0.3613
|
||||
Decode time in microsec/query: 0.7044
|
||||
** xcdat::trie_16_type **
|
||||
Number of keys: 15955763
|
||||
Memory usage in bytes: 2.15935e+08
|
||||
Memory usage in MiB: 205.932
|
||||
Construction time in seconds: 13.704
|
||||
Lookup time in microsec/query: 0.3832
|
||||
Decode time in microsec/query: 0.8362
|
||||
```
|
||||
|
||||
## Sample usage
|
||||
|
||||
`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file; the library is installed together with Xcdat by `make install`.
|
||||
`sample/sample.cpp` provides a sample usage.
|
||||
|
||||
```c++
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
// Dataset
|
||||
// Dataset of keywords
|
||||
std::vector<std::string> keys = {
|
||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
|
@ -183,12 +198,8 @@ int main() {
|
|||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
// Load the trie dictionary on memory.
|
||||
const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
|
@ -293,10 +304,21 @@ Xcdat can be used by including only the header `xcdat.hpp`.
|
|||
|
||||
### Trie dictionary types
|
||||
|
||||
Two dictionary types are defined as specializations of class `xcdat::trie`:
|
||||
Four dictionary types are defined as specializations of class `xcdat::trie`. The first two types are based on standard DACs by Brisaboa et al. [9]. The last two types are based on pointer-based DACs by Kanda et al. [2].
|
||||
|
||||
- `xcdat::trie_8_type` is the trie dictionary using standard DACs [9] using 8-bit integers for elements.
|
||||
- `xcdat::trie_7_type` is the trie dictionary using pointer-based DACs [2] using 7-bit integers for elements.
|
||||
```c++
|
||||
//! The trie type with standard DACs using 8-bit integers
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
|
||||
//! The trie type with standard DACs using 16-bit integers
|
||||
using trie_16_type = trie<bc_vector_16>;
|
||||
|
||||
//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer)
|
||||
using trie_7_type = trie<bc_vector_7>;
|
||||
|
||||
//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer)
|
||||
using trie_15_type = trie<bc_vector_15>;
|
||||
```
|
||||
|
||||
### Trie dictionary class
|
||||
|
||||
|
|
|
@ -12,10 +12,16 @@
|
|||
|
||||
namespace xcdat {
|
||||
|
||||
//! The trie type with standard DACs using 8-bit integers
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
|
||||
//! The trie type with standard DACs using 16-bit integers
|
||||
using trie_16_type = trie<bc_vector_16>;
|
||||
|
||||
//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer)
|
||||
using trie_7_type = trie<bc_vector_7>;
|
||||
|
||||
//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer)
|
||||
using trie_15_type = trie<bc_vector_15>;
|
||||
|
||||
//! Set the continuous memory block to a new trie instance (for a memory-mapped file).
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
|
@ -27,12 +26,8 @@ int main() {
|
|||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
// Load the trie dictionary on memory.
|
||||
const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
|
|
|
@ -135,6 +135,10 @@ void test_io(const trie_type& trie, const std::vector<std::string>& keys, const
|
|||
REQUIRE_EQ(trie.num_keys(), loaded.num_keys());
|
||||
REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size());
|
||||
REQUIRE_EQ(trie.max_length(), loaded.max_length());
|
||||
REQUIRE_EQ(trie.num_nodes(), loaded.num_nodes());
|
||||
REQUIRE_EQ(trie.num_units(), loaded.num_units());
|
||||
REQUIRE_EQ(trie.num_free_units(), loaded.num_free_units());
|
||||
REQUIRE_EQ(trie.tail_length(), loaded.tail_length());
|
||||
REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded));
|
||||
test_basic_operations(loaded, keys, others);
|
||||
}
|
||||
|
@ -146,6 +150,10 @@ void test_io(const trie_type& trie, const std::vector<std::string>& keys, const
|
|||
REQUIRE_EQ(trie.num_keys(), mapped.num_keys());
|
||||
REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size());
|
||||
REQUIRE_EQ(trie.max_length(), mapped.max_length());
|
||||
REQUIRE_EQ(trie.num_nodes(), mapped.num_nodes());
|
||||
REQUIRE_EQ(trie.num_units(), mapped.num_units());
|
||||
REQUIRE_EQ(trie.num_free_units(), mapped.num_free_units());
|
||||
REQUIRE_EQ(trie.tail_length(), mapped.tail_length());
|
||||
REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped));
|
||||
test_basic_operations(mapped, keys, others);
|
||||
}
|
||||
|
|
177
tools/mm_file/mm_file.hpp
Normal file
177
tools/mm_file/mm_file.hpp
Normal file
|
@ -0,0 +1,177 @@
|
|||
#pragma once
|
||||
|
||||
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd)

#include <stdexcept>  // std::runtime_error
#include <string>
#include <type_traits>
|
||||
|
||||
namespace mm {
|
||||
|
||||
// Access-pattern hints for a mapping. Each constant is a plain alias of the
// corresponding POSIX_MADV_* value; file_source::open() forwards the chosen
// value unchanged to posix_madvise().
namespace advice {
|
||||
// Alias of POSIX_MADV_NORMAL (the default used by file_source).
static const int normal = POSIX_MADV_NORMAL;
|
||||
// Alias of POSIX_MADV_RANDOM.
static const int random = POSIX_MADV_RANDOM;
|
||||
// Alias of POSIX_MADV_SEQUENTIAL.
static const int sequential = POSIX_MADV_SEQUENTIAL;
|
||||
} // namespace advice
|
||||
|
||||
/// Base class that owns a file descriptor together with the memory mapping
/// created from it. Derived classes fill in `m_fd`, `m_size` and `m_data`;
/// this class exposes read access to the mapped elements and releases both
/// resources on close()/destruction.
///
/// @tparam T element type the mapped bytes are viewed as (may be const).
template <typename T>
struct file {
    /// Creates a closed object: no descriptor and no mapping.
    file() {
        init();
    }

    /// Releases the mapping and the descriptor.
    ///
    /// Failures are deliberately ignored here: destructors are implicitly
    /// noexcept, so letting close() throw would call std::terminate. Call
    /// close() explicitly beforehand if the munmap error has to be observed.
    ~file() {
        if (is_open()) {
            unmap();
            ::close(m_fd);
        }
    }

    file(file const&) = delete;             // not copy-constructible
    file& operator=(file const&) = delete;  // not copy-assignable

    /// @return true while a file descriptor is held.
    bool is_open() const {
        return m_fd != -1;
    }

    /// Unmaps the data, closes the descriptor and resets to the closed state.
    /// Does nothing when the object is already closed.
    ///
    /// The descriptor is always closed and the state always reset, even when
    /// unmapping fails, so the object never keeps a half-released resource.
    ///
    /// @throws std::runtime_error if munmap reports a failure.
    void close() {
        if (!is_open()) return;
        const bool unmapped = unmap();
        ::close(m_fd);
        init();
        if (!unmapped) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    /// @return size of the mapping in bytes.
    size_t bytes() const {
        return m_size;
    }

    /// @return number of whole elements of type T in the mapping.
    size_t size() const {
        return m_size / sizeof(T);
    }

    /// @return pointer to the first mapped element (nullptr when closed).
    T* data() const {
        return m_data;
    }

    /// Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        /// @return a copy of the element currently pointed to.
        T operator*() const {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

    private:
        T* m_ptr;
    };

    /// @return iterator to the first mapped element.
    iterator begin() const {
        return iterator(m_data);
    }

    /// @return iterator one past the last whole mapped element.
    iterator end() const {
        return iterator(m_data, size());
    }

protected:
    int m_fd;       // owned descriptor, -1 when closed
    size_t m_size;  // length of the mapping in bytes
    T* m_data;      // start of the mapping, nullptr when nothing is mapped

    /// Resets all members to the closed state without releasing anything.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    /// @throws std::runtime_error if no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }

private:
    /// Removes the mapping, if there is one.
    /// A descriptor may be held without a mapping (a derived open() failed
    /// half-way); munmap must not be called with a null address then.
    /// @return false only if munmap itself failed.
    bool unmap() noexcept {
        if (m_data == nullptr) return true;
        void* const addr =
            const_cast<void*>(static_cast<void const volatile*>(m_data));
        return ::munmap(addr, m_size) != -1;
    }
};
|
||||
|
||||
/// Creates a shared mapping of the first `size` bytes of the file behind `fd`
/// and returns its address converted to `Pointer`.
///
/// @tparam Pointer pointer type the mapped address is cast to.
/// @param fd   descriptor of the file to map.
/// @param size length of the mapping in bytes.
/// @param prot protection flags handed to ::mmap (PROT_READ, PROT_WRITE, ...).
/// @return address of the new mapping.
/// @throws std::runtime_error if ::mmap returns MAP_FAILED.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    // Always map from the beginning of the file (offset 0).
    void* const addr = ::mmap(nullptr, size, prot, MAP_SHARED, fd, 0);
    // Test the raw result before converting it to the caller's pointer type.
    if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");
    return static_cast<Pointer>(addr);
}
|
||||
|
||||
template <typename T>
|
||||
struct file_source : public file<T const> {
|
||||
typedef file<T const> base;
|
||||
|
||||
file_source() {}
|
||||
|
||||
file_source(std::string const& path, int adv = advice::normal) {
|
||||
open(path, adv);
|
||||
}
|
||||
|
||||
void open(std::string const& path, int adv = advice::normal) {
|
||||
base::m_fd = ::open(path.c_str(), O_RDONLY);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data = mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
|
||||
if (posix_madvise((void*)base::m_data, base::m_size, adv)) {
|
||||
throw std::runtime_error("madvise failed");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct file_sink : public file<T> {
|
||||
typedef file<T> base;
|
||||
|
||||
file_sink() {}
|
||||
|
||||
file_sink(std::string const& path) {
|
||||
open(path);
|
||||
}
|
||||
|
||||
file_sink(std::string const& path, size_t n) {
|
||||
open(path, n);
|
||||
}
|
||||
|
||||
void open(std::string const& path) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR, mode);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
|
||||
void open(std::string const& path, size_t n) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
|
||||
base::check_fd();
|
||||
base::m_size = n * sizeof(T);
|
||||
ftruncate(base::m_fd,
|
||||
base::m_size); // truncate the file at the new size
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace mm
|
|
@ -1,7 +1,7 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
|
@ -44,6 +44,10 @@ int main(int argc, char** argv) {
|
|||
return decode<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return decode<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return decode<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return decode<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
|
@ -41,6 +41,10 @@ int main(int argc, char** argv) {
|
|||
return enumerate<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return enumerate<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return enumerate<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return enumerate<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
|
@ -48,6 +48,10 @@ int main(int argc, char** argv) {
|
|||
return lookup<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return lookup<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return lookup<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return lookup<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
|
@ -61,6 +61,10 @@ int main(int argc, char** argv) {
|
|||
return predictive_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return predictive_search<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return predictive_search<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return predictive_search<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||
|
@ -57,6 +57,10 @@ int main(int argc, char** argv) {
|
|||
return prefix_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return prefix_search<xcdat::trie_8_type>(p);
|
||||
case 15:
|
||||
return prefix_search<xcdat::trie_15_type>(p);
|
||||
case 16:
|
||||
return prefix_search<xcdat::trie_16_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue