diff --git a/CMakeLists.txt b/CMakeLists.txt index 4665e59..4106204 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,5 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_ # Install the library file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp) -file(GLOB MM_HEADER_FILES include/mm_file/*.hpp) install(FILES include/xcdat.hpp DESTINATION include) install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat) -install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file) diff --git a/README.md b/README.md index 365cd2c..ed6d442 100644 --- a/README.md +++ b/README.md @@ -52,17 +52,19 @@ The library considers a 64-bit operating system. The code has been tested only o Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`. +The tools employ the external libraries [cmd_line_parser](https://github.com/jermp/cmd_line_parser), [mm_file](https://github.com/jermp/mm_file), and [tinyformat](https://github.com/c42f/tinyformat), which are contained in the repository. + ### `xcdat_build` -It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`. +It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `dic.bin`. 
``` -$ xcdat_build enwiki-titles.txt idx.bin +$ xcdat_build enwiki-titles.txt dic.bin Number of keys: 15955763 -Number of trie nodes: 36441058 -Number of DA units: 36520704 -Memory usage in bytes: 1.70618e+08 -Memory usage in MiB: 162.714 +Number of trie nodes: 36439320 +Number of DA units: 36515840 +Memory usage in bytes: 1.64104e+08 +Memory usage in MiB: 156.502 ``` ### `xcdat_lookup` @@ -70,7 +72,7 @@ Memory usage in MiB: 162.714 It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise. ``` -$ xcdat_lookup idx.bin +$ xcdat_lookup dic.bin Algorithm 1255938 Algorithm Double_Array @@ -82,7 +84,7 @@ Double_Array It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords. ``` -$ xcdat_decode idx.bin +$ xcdat_decode dic.bin 1255938 1255938 Algorithm ``` @@ -92,7 +94,7 @@ $ xcdat_decode idx.bin It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string. ``` -$ xcdat_prefix_search idx.bin +$ xcdat_prefix_search dic.bin Algorithmic 6 found 57 A @@ -108,7 +110,7 @@ Algorithmic It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters. ``` -$ xcdat_predictive_search idx.bin -n 3 +$ xcdat_predictive_search dic.bin -n 3 Algorithm 263 found 1255938 Algorithm @@ -121,7 +123,7 @@ Algorithm It prints all the keywords stored in a given dictionary. ``` -$ xcdat_enumerate idx.bin | head -3 +$ xcdat_enumerate dic.bin | head -3 0 ! 107 !! 138 !!! 
@@ -137,31 +139,44 @@ $ xcdat_benchmark enwiki-titles.txt Number of keys: 15955763 Memory usage in bytes: 1.70618e+08 Memory usage in MiB: 162.714 -Construction time in seconds: 12.907 -Lookup time in microsec/query: 0.4674 -Decode time in microsec/query: 0.8722 +Construction time in seconds: 13.501 +Lookup time in microsec/query: 0.5708 +Decode time in microsec/query: 1.0846 ** xcdat::trie_8_type ** Number of keys: 15955763 Memory usage in bytes: 1.64104e+08 Memory usage in MiB: 156.502 -Construction time in seconds: 13.442 -Lookup time in microsec/query: 0.7593 -Decode time in microsec/query: 1.2341 +Construction time in seconds: 13.626 +Lookup time in microsec/query: 0.6391 +Decode time in microsec/query: 1.0531 +** xcdat::trie_15_type ** +Number of keys: 15955763 +Memory usage in bytes: 2.05737e+08 +Memory usage in MiB: 196.206 +Construction time in seconds: 13.425 +Lookup time in microsec/query: 0.3613 +Decode time in microsec/query: 0.7044 +** xcdat::trie_16_type ** +Number of keys: 15955763 +Memory usage in bytes: 2.15935e+08 +Memory usage in MiB: 205.932 +Construction time in seconds: 13.704 +Lookup time in microsec/query: 0.3832 +Decode time in microsec/query: 0.8362 ``` ## Sample usage -`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file, which will be installed by `make install` together. +`sample/sample.cpp` provides a sample usage. ```c++ #include #include -#include #include int main() { - // Dataset + // Dataset of keywords std::vector keys = { "AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", @@ -183,12 +198,8 @@ int main() { xcdat::save(trie, tmp_filename); } - // Memory-map the trie dictionary. - const mm::file_source fin(tmp_filename, mm::advice::sequential); - const auto trie = xcdat::mmap(fin.data()); - - // Or, load the trie dictionary on memory. 
- // const auto trie = xcdat::load(tmp_filename); + // Load the trie dictionary on memory. + const auto trie = xcdat::load(tmp_filename); // Basic statistics std::cout << "Number of keys: " << trie.num_keys() << std::endl; @@ -293,10 +304,21 @@ Xcdat can be used by including only the header `xcdat.hpp`. ### Trie dictionary types -The two dictionary types of specialization of class `xcdat::trie` are difined: +The four dictionary types of specialization of class `xcdat::trie` are defined. The first two types are based on standard DACs by Brisaboa et al. [9]. The last two types are based on pointer-based DACs by Kanda et al. [2]. -- `xcdat::trie_8_type` is the trie dictionary using standard DACs [9] using 8-bit integers for elements. -- `xcdat::trie_7_type` is the trie dictionary using pointer-based DACs [2] using 7-bit integers for elements. +```c++ +//! The trie type with standard DACs using 8-bit integers +using trie_8_type = trie; + +//! The trie type with standard DACs using 16-bit integers +using trie_16_type = trie; + +//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer) +using trie_7_type = trie; + +//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer) +using trie_15_type = trie; +``` ### Trie dictionary class diff --git a/include/xcdat.hpp b/include/xcdat.hpp index 6b5843d..22a7823 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -12,10 +12,16 @@ namespace xcdat { +//! The trie type with standard DACs using 8-bit integers using trie_8_type = trie; + +//! The trie type with standard DACs using 16-bit integers using trie_16_type = trie; +//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer) using trie_7_type = trie; + +//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer) using trie_15_type = trie; //! Set the continuous memory block to a new trie instance (for a memory-mapped file). 
diff --git a/sample/sample.cpp b/sample/sample.cpp index 9c5b1e3..c6788e4 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -1,7 +1,6 @@ #include #include -#include #include int main() { @@ -27,12 +26,8 @@ int main() { xcdat::save(trie, tmp_filename); } - // Memory-map the trie dictionary. - const mm::file_source fin(tmp_filename, mm::advice::sequential); - const auto trie = xcdat::mmap(fin.data()); - - // Or, load the trie dictionary on memory. - // const auto trie = xcdat::load(tmp_filename); + // Load the trie dictionary on memory. + const auto trie = xcdat::load(tmp_filename); // Basic statistics std::cout << "Number of keys: " << trie.num_keys() << std::endl; diff --git a/include/mm_file/mm_file.hpp b/tests/mm_file/mm_file.hpp similarity index 100% rename from include/mm_file/mm_file.hpp rename to tests/mm_file/mm_file.hpp diff --git a/tests/test_trie.cpp b/tests/test_trie.cpp index b59ddf8..60410d4 100644 --- a/tests/test_trie.cpp +++ b/tests/test_trie.cpp @@ -135,6 +135,10 @@ void test_io(const trie_type& trie, const std::vector& keys, const REQUIRE_EQ(trie.num_keys(), loaded.num_keys()); REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size()); REQUIRE_EQ(trie.max_length(), loaded.max_length()); + REQUIRE_EQ(trie.num_nodes(), loaded.num_nodes()); + REQUIRE_EQ(trie.num_units(), loaded.num_units()); + REQUIRE_EQ(trie.num_free_units(), loaded.num_free_units()); + REQUIRE_EQ(trie.tail_length(), loaded.tail_length()); REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded)); test_basic_operations(loaded, keys, others); } @@ -146,6 +150,10 @@ void test_io(const trie_type& trie, const std::vector& keys, const REQUIRE_EQ(trie.num_keys(), mapped.num_keys()); REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size()); REQUIRE_EQ(trie.max_length(), mapped.max_length()); + REQUIRE_EQ(trie.num_nodes(), mapped.num_nodes()); + REQUIRE_EQ(trie.num_units(), mapped.num_units()); + REQUIRE_EQ(trie.num_free_units(), mapped.num_free_units()); + REQUIRE_EQ(trie.tail_length(), 
mapped.tail_length()); REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped)); test_basic_operations(mapped, keys, others); } diff --git a/tools/mm_file/mm_file.hpp b/tools/mm_file/mm_file.hpp new file mode 100644 index 0000000..b95031d --- /dev/null +++ b/tools/mm_file/mm_file.hpp @@ -0,0 +1,177 @@ +#pragma once + +#include +#include +#include +#include +#include // close(fd) +#include + +namespace mm { + +namespace advice { +static const int normal = POSIX_MADV_NORMAL; +static const int random = POSIX_MADV_RANDOM; +static const int sequential = POSIX_MADV_SEQUENTIAL; +} // namespace advice + +template +struct file { + file() { + init(); + } + + ~file() { + close(); + } + + file(file const&) = delete; // non construction-copyable + file& operator=(file const&) = delete; // non copyable + + bool is_open() const { + return m_fd != -1; + } + + void close() { + if (is_open()) { + if (munmap((char*)m_data, m_size) == -1) { + throw std::runtime_error("munmap failed when closing file"); + } + ::close(m_fd); + init(); + } + } + + size_t bytes() const { + return m_size; + } + + size_t size() const { + return m_size / sizeof(T); + } + + T* data() const { + return m_data; + } + + struct iterator { + iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {} + + T operator*() { + return *m_ptr; + } + + void operator++() { + ++m_ptr; + } + + bool operator==(iterator const& rhs) const { + return m_ptr == rhs.m_ptr; + } + + bool operator!=(iterator const& rhs) const { + return !((*this) == rhs); + } + + private: + T* m_ptr; + }; + + iterator begin() const { + return iterator(m_data); + } + + iterator end() const { + return iterator(m_data, size()); + } + +protected: + int m_fd; + size_t m_size; + T* m_data; + + void init() { + m_fd = -1; + m_size = 0; + m_data = nullptr; + } + + void check_fd() { + if (m_fd == -1) throw std::runtime_error("cannot open file"); + } +}; + +template +Pointer mmap(int fd, size_t size, int prot) { + static const size_t offset = 0; + Pointer p = + 
static_cast(::mmap(NULL, size, prot, MAP_SHARED, fd, offset)); + if (p == MAP_FAILED) throw std::runtime_error("mmap failed"); + return p; +} + +template +struct file_source : public file { + typedef file base; + + file_source() {} + + file_source(std::string const& path, int adv = advice::normal) { + open(path, adv); + } + + void open(std::string const& path, int adv = advice::normal) { + base::m_fd = ::open(path.c_str(), O_RDONLY); + base::check_fd(); + struct stat fs; + if (fstat(base::m_fd, &fs) == -1) { + throw std::runtime_error("cannot stat file"); + } + base::m_size = fs.st_size; + base::m_data = mmap(base::m_fd, base::m_size, PROT_READ); + if (posix_madvise((void*)base::m_data, base::m_size, adv)) { + throw std::runtime_error("madvise failed"); + } + } +}; + +template +struct file_sink : public file { + typedef file base; + + file_sink() {} + + file_sink(std::string const& path) { + open(path); + } + + file_sink(std::string const& path, size_t n) { + open(path, n); + } + + void open(std::string const& path) { + static const mode_t mode = 0600; // read/write + base::m_fd = ::open(path.c_str(), O_RDWR, mode); + base::check_fd(); + struct stat fs; + if (fstat(base::m_fd, &fs) == -1) { + throw std::runtime_error("cannot stat file"); + } + base::m_size = fs.st_size; + base::m_data = + mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); + } + + void open(std::string const& path, size_t n) { + static const mode_t mode = 0600; // read/write + base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode); + base::check_fd(); + base::m_size = n * sizeof(T); + ftruncate(base::m_fd, + base::m_size); // truncate the file at the new size + base::m_data = + mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); + } +}; + +} // namespace mm \ No newline at end of file diff --git a/tools/xcdat_decode.cpp b/tools/xcdat_decode.cpp index 24aa1b8..3178335 100644 --- a/tools/xcdat_decode.cpp +++ b/tools/xcdat_decode.cpp @@ -1,7 +1,7 @@ -#include #include #include 
"cmd_line_parser/parser.hpp" +#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { @@ -44,6 +44,10 @@ int main(int argc, char** argv) { return decode(p); case 8: return decode(p); + case 15: + return decode(p); + case 16: + return decode(p); default: break; } diff --git a/tools/xcdat_enumerate.cpp b/tools/xcdat_enumerate.cpp index 8050c62..6fdd4a2 100644 --- a/tools/xcdat_enumerate.cpp +++ b/tools/xcdat_enumerate.cpp @@ -1,7 +1,7 @@ -#include #include #include "cmd_line_parser/parser.hpp" +#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { @@ -41,6 +41,10 @@ int main(int argc, char** argv) { return enumerate(p); case 8: return enumerate(p); + case 15: + return enumerate(p); + case 16: + return enumerate(p); default: break; } diff --git a/tools/xcdat_lookup.cpp b/tools/xcdat_lookup.cpp index 18650ac..8599816 100644 --- a/tools/xcdat_lookup.cpp +++ b/tools/xcdat_lookup.cpp @@ -1,7 +1,7 @@ -#include #include #include "cmd_line_parser/parser.hpp" +#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { @@ -48,6 +48,10 @@ int main(int argc, char** argv) { return lookup(p); case 8: return lookup(p); + case 15: + return lookup(p); + case 16: + return lookup(p); default: break; } diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp index 21fd271..0c0ceee 100644 --- a/tools/xcdat_predictive_search.cpp +++ b/tools/xcdat_predictive_search.cpp @@ -1,7 +1,7 @@ -#include #include #include "cmd_line_parser/parser.hpp" +#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { @@ -61,6 +61,10 @@ int main(int argc, char** argv) { return predictive_search(p); case 8: return predictive_search(p); + case 15: + return predictive_search(p); + case 16: + return 
predictive_search(p); default: break; } diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp index 80752e7..05ab65e 100644 --- a/tools/xcdat_prefix_search.cpp +++ b/tools/xcdat_prefix_search.cpp @@ -1,7 +1,7 @@ -#include #include #include "cmd_line_parser/parser.hpp" +#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { @@ -57,6 +57,10 @@ int main(int argc, char** argv) { return prefix_search(p); case 8: return prefix_search(p); + case 15: + return prefix_search(p); + case 16: + return prefix_search(p); default: break; }