diff --git a/CMakeLists.txt b/CMakeLists.txt index 03700a8..50be20d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,5 +42,7 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_ # Install the library file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp) +file(GLOB MM_HEADER_FILES include/mm_file/*.hpp) install(FILES include/xcdat.hpp DESTINATION include) install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat) +install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file) diff --git a/README.md b/README.md index 9863efd..0aac029 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Xcdat: Fast compressed trie dictionary library -**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on the improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available at [here](https://kampersanda.github.io/pdf/KAIS2017.pdf). +**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on an improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available at [here](https://kampersanda.github.io/pdf/KAIS2017.pdf). ## Table of contents @@ -17,8 +17,8 @@ ## Features - **Compressed string dictionary.** Xcdat implements a (static) *compressed string dictioanry* that stores a set of strings (or keywords) in a compressed space while supporting several search operations [1,2]. For example, Xcdat can store an entire set of English Wikipedia titles at half the size of the raw data. -- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest data structure for trie implementation. 
However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches. -- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes into strings, resulting in reducing random access and improving locality of references. +- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches. +- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes with strings to reduce random access and to improve locality of reference. - **Dictionary encoding.** Xcdat maps `N` distinct keywords into unique IDs from `[0,N-1]`, and supports the two symmetric operations: `lookup` returns the ID corresponding to a given keyword; `decode` returns the keyword associated with a given ID. The mapping is so-called *dictionary encoding* (or *domain encoding*) and is fundamental in many DB applications as described by Martínez-Prieto et al [1] or Müller et al. [5]. - **Prefix search operations.** Xcdat supports prefix search operations realized by trie search algorithms: `prefix_search` returns all the keywords contained as prefixes of a given string; `predictive search` returns all the keywords starting with a given string. These will be useful in many NLP applications such as auto completions [6], stemmed searches [7], or input method editors [8]. 
- **64-bit support.** As mentioned before, since the double array is a pointer-based data structure, most double-array libraries use 32-bit pointers to reduce memory consumption, resulting in limiting the scale of the input dataset. On the other hand, the XCDA method allows Xcdat to represent 64-bit pointers without sacrificing memory efficiency. @@ -50,11 +50,11 @@ The library considers a 64-bit operating system. The code has been tested only o ## Command line tools - Xcdat provides command line tools to build the index and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`. + Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`. ### `xcdat_build` -It builds the trie index from a given dataset consisting of keywords separated by newlines. The following command builds the trie index from dataset `enwiki-titles.txt` and writes the index into file `idx.bin`. +It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`. ``` $ xcdat_build enwiki-titles.txt idx.bin @@ -67,7 +67,7 @@ Memory usage in MiB: 162.714 ### `xcdat_lookup` -It tests the `lookup` operation for a given index. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise. +It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise. ``` $ xcdat_lookup idx.bin @@ -79,7 +79,7 @@ Double_Array ### `xcdat_decode` -It tests the `decode` operation for a given index. 
Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords. +It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords. ``` $ xcdat_decode idx.bin @@ -89,7 +89,7 @@ $ xcdat_decode idx.bin ### `xcdat_prefix_search` -It tests the `prefix_search` operation for a given index. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string. +It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string. ``` $ xcdat_prefix_search idx.bin @@ -105,7 +105,7 @@ Algorithmic ### `xcdat_predictive_search` -It tests the `predictive_search` operation for a given index. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters. +It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters. ``` $ xcdat_predictive_search idx.bin -n 3 @@ -118,7 +118,7 @@ Algorithm ### `xcdat_enumerate` -It prints all the keywords stored in a given index. +It prints all the keywords stored in a given dictionary. ``` $ xcdat_enumerate idx.bin | head -3 @@ -151,16 +151,17 @@ Decode time in microsec/query: 1.2341 ## Sample usage -`sample/sample.cpp` provides a sample usage. +`sample/sample.cpp` provides a sample usage. It uses the external library [mm_file](https://github.com/jermp/mm_file) to memory-map the dictionary file; the mm_file header is installed together with Xcdat by `make install`. 
```c++ #include #include +#include #include int main() { - // Input keys + // Dataset std::vector keys = { "AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", @@ -170,27 +171,32 @@ int main() { std::sort(keys.begin(), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - const char* index_filename = "tmp.idx"; - - // The trie index type + // The trie dictionary type using trie_type = xcdat::trie_8_type; - // Build and save the trie index. + // The dictionary filename + const char* tmp_filename = "dic.bin"; + + // Build and save the trie dictionary. { const trie_type trie(keys); - xcdat::save(trie, index_filename); + xcdat::save(trie, tmp_filename); } - // Load the trie index. - const auto trie = xcdat::load(index_filename); + // Memory-map the trie dictionary. + const mm::file_source fin(tmp_filename, mm::advice::sequential); + const auto trie = xcdat::mmap(fin.data()); + + // Or, load the trie dictionary into memory. + // const auto trie = xcdat::load(tmp_filename); // Basic statistics - std::cout << "NumberKeys: " << trie.num_keys() << std::endl; - std::cout << "MaxLength: " << trie.max_length() << std::endl; - std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl; - std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl; + std::cout << "Number of keys: " << trie.num_keys() << std::endl; + std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl; + std::cout << "Number of DA units: " << trie.num_units() << std::endl; + std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl; - // Lookup IDs from keys + // Look up the ID for a query key. 
{ const auto id = trie.lookup("Mac_Pro"); std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl; @@ -200,7 +206,7 @@ int main() { std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl; } - // Decode keys from IDs + // Decode the key for a query ID. { const auto dec = trie.decode(4); std::cout << "Decode(4) = " << dec << std::endl; @@ -236,7 +242,8 @@ int main() { std::cout << "}" << std::endl; } - std::remove(index_filename); + std::remove(tmp_filename); + return 0; } ``` @@ -244,10 +251,10 @@ int main() { The output will be ``` -NumberKeys: 12 -MaxLength: 11 -AlphabetSize: 20 -Memory: 1762 bytes +Number of keys: 12 +Number of trie nodes: 28 +Number of DA units: 256 +Memory usage in bytes: 1766 Lookup(Mac_Pro) = 7 Lookup(Google_Pixel) = 18446744073709551615 Decode(4) = MacBook_Air @@ -451,15 +458,15 @@ class trie { template Trie mmap(const char* address); -//! Load the trie index from the file. +//! Load the trie dictionary from the file. template Trie load(std::string_view filepath); -//! Save the trie index to the file and returns the file size in bytes. +//! Save the trie dictionary to the file and return the file size in bytes. template std::uint64_t save(const Trie& idx, std::string_view filepath); -//! Get the index size in bytes. +//! Get the dictionary size in bytes. template std::uint64_t memory_in_bytes(const Trie& idx); diff --git a/tests/mm_file/mm_file.hpp b/include/mm_file/mm_file.hpp similarity index 100% rename from tests/mm_file/mm_file.hpp rename to include/mm_file/mm_file.hpp diff --git a/include/xcdat.hpp b/include/xcdat.hpp index dc0f389..2c9ca62 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -20,28 +20,28 @@ template std::uint32_t flag; visitor.visit(flag); - XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different."); + XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); Trie idx; visitor.visit(idx); return idx; } -//! 
Load the trie index from the file. +//! Load the trie dictionary from the file. template [[maybe_unused]] Trie load(std::string_view filepath) { load_visitor visitor(filepath); std::uint32_t flag; visitor.visit(flag); - XCDAT_THROW_IF(flag != Trie::l1_bits, "The input index type is different."); + XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); Trie idx; visitor.visit(idx); return idx; } -//! Save the trie index to the file and returns the file size in bytes. +//! Save the trie dictionary to the file and return the file size in bytes. template [[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) { save_visitor visitor(filepath); @@ -50,7 +50,7 @@ template return visitor.bytes(); } -//! Get the index size in bytes. +//! Get the dictionary size in bytes. template [[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) { size_visitor visitor; @@ -59,7 +59,7 @@ template return visitor.bytes(); } -//! Get the flag indicating the trie type, embedded by the function 'save'. +//! Get the flag indicating the trie dictionary type, embedded by the function 'save'. //! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file. 
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) { std::ifstream ifs(filepath); diff --git a/include/xcdat/trie_builder.hpp b/include/xcdat/trie_builder.hpp index 1dfe724..2b009e2 100644 --- a/include/xcdat/trie_builder.hpp +++ b/include/xcdat/trie_builder.hpp @@ -87,6 +87,9 @@ class trie_builder { // Build the BC units arrange(0, m_keys.size(), 0, 0); + // Finish + finish(); + // Build the TAIL vector m_suffixes.complete(m_bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { m_units[npos].base = tpos; }); } @@ -161,6 +164,13 @@ class trie_builder { } } + void finish() { + while (m_units[taboo_npos].base != taboo_npos) { + auto bpos = m_units[taboo_npos].base / 256; + close_block(bpos); + } + } + void arrange(std::uint64_t beg, std::uint64_t end, std::uint64_t kpos, std::uint64_t npos) { if (m_keys[beg].size() == kpos) { m_terms.set_bit(npos, true); diff --git a/sample/sample.cpp b/sample/sample.cpp index 7654296..9c5b1e3 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -1,10 +1,11 @@ #include #include +#include #include int main() { - // Input keys + // Dataset of keywords std::vector keys = { "AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", @@ -14,27 +15,32 @@ int main() { std::sort(keys.begin(), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - const char* index_filename = "tmp.idx"; - - // The trie index type + // The trie dictionary type using trie_type = xcdat::trie_8_type; - // Build and save the trie index. + // The dictionary filename + const char* tmp_filename = "dic.bin"; + + // Build and save the trie dictionary. { const trie_type trie(keys); - xcdat::save(trie, index_filename); + xcdat::save(trie, tmp_filename); } - // Load the trie index. - const auto trie = xcdat::load(index_filename); + // Memory-map the trie dictionary. 
+ const mm::file_source fin(tmp_filename, mm::advice::sequential); + const auto trie = xcdat::mmap(fin.data()); + + // Or, load the trie dictionary into memory. + // const auto trie = xcdat::load(tmp_filename); // Basic statistics - std::cout << "NumberKeys: " << trie.num_keys() << std::endl; - std::cout << "MaxLength: " << trie.max_length() << std::endl; - std::cout << "AlphabetSize: " << trie.alphabet_size() << std::endl; - std::cout << "Memory: " << xcdat::memory_in_bytes(trie) << " bytes" << std::endl; + std::cout << "Number of keys: " << trie.num_keys() << std::endl; + std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl; + std::cout << "Number of DA units: " << trie.num_units() << std::endl; + std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl; - // Lookup IDs from keys + // Look up the ID for a query key. { const auto id = trie.lookup("Mac_Pro"); std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl; @@ -44,7 +50,7 @@ int main() { std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl; } - // Decode keys from IDs + // Decode the key for a query ID. 
{ const auto dec = trie.decode(4); std::cout << "Decode(4) = " << dec << std::endl; @@ -80,6 +86,7 @@ int main() { std::cout << "}" << std::endl; } - std::remove(index_filename); + std::remove(tmp_filename); + return 0; } diff --git a/tools/mm_file/mm_file.hpp b/tools/mm_file/mm_file.hpp deleted file mode 100644 index b95031d..0000000 --- a/tools/mm_file/mm_file.hpp +++ /dev/null @@ -1,177 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include // close(fd) -#include - -namespace mm { - -namespace advice { -static const int normal = POSIX_MADV_NORMAL; -static const int random = POSIX_MADV_RANDOM; -static const int sequential = POSIX_MADV_SEQUENTIAL; -} // namespace advice - -template -struct file { - file() { - init(); - } - - ~file() { - close(); - } - - file(file const&) = delete; // non construction-copyable - file& operator=(file const&) = delete; // non copyable - - bool is_open() const { - return m_fd != -1; - } - - void close() { - if (is_open()) { - if (munmap((char*)m_data, m_size) == -1) { - throw std::runtime_error("munmap failed when closing file"); - } - ::close(m_fd); - init(); - } - } - - size_t bytes() const { - return m_size; - } - - size_t size() const { - return m_size / sizeof(T); - } - - T* data() const { - return m_data; - } - - struct iterator { - iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {} - - T operator*() { - return *m_ptr; - } - - void operator++() { - ++m_ptr; - } - - bool operator==(iterator const& rhs) const { - return m_ptr == rhs.m_ptr; - } - - bool operator!=(iterator const& rhs) const { - return !((*this) == rhs); - } - - private: - T* m_ptr; - }; - - iterator begin() const { - return iterator(m_data); - } - - iterator end() const { - return iterator(m_data, size()); - } - -protected: - int m_fd; - size_t m_size; - T* m_data; - - void init() { - m_fd = -1; - m_size = 0; - m_data = nullptr; - } - - void check_fd() { - if (m_fd == -1) throw std::runtime_error("cannot open file"); - } -}; - 
-template -Pointer mmap(int fd, size_t size, int prot) { - static const size_t offset = 0; - Pointer p = - static_cast(::mmap(NULL, size, prot, MAP_SHARED, fd, offset)); - if (p == MAP_FAILED) throw std::runtime_error("mmap failed"); - return p; -} - -template -struct file_source : public file { - typedef file base; - - file_source() {} - - file_source(std::string const& path, int adv = advice::normal) { - open(path, adv); - } - - void open(std::string const& path, int adv = advice::normal) { - base::m_fd = ::open(path.c_str(), O_RDONLY); - base::check_fd(); - struct stat fs; - if (fstat(base::m_fd, &fs) == -1) { - throw std::runtime_error("cannot stat file"); - } - base::m_size = fs.st_size; - base::m_data = mmap(base::m_fd, base::m_size, PROT_READ); - if (posix_madvise((void*)base::m_data, base::m_size, adv)) { - throw std::runtime_error("madvise failed"); - } - } -}; - -template -struct file_sink : public file { - typedef file base; - - file_sink() {} - - file_sink(std::string const& path) { - open(path); - } - - file_sink(std::string const& path, size_t n) { - open(path, n); - } - - void open(std::string const& path) { - static const mode_t mode = 0600; // read/write - base::m_fd = ::open(path.c_str(), O_RDWR, mode); - base::check_fd(); - struct stat fs; - if (fstat(base::m_fd, &fs) == -1) { - throw std::runtime_error("cannot stat file"); - } - base::m_size = fs.st_size; - base::m_data = - mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); - } - - void open(std::string const& path, size_t n) { - static const mode_t mode = 0600; // read/write - base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode); - base::check_fd(); - base::m_size = n * sizeof(T); - ftruncate(base::m_fd, - base::m_size); // truncate the file at the new size - base::m_data = - mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); - } -}; - -} // namespace mm \ No newline at end of file diff --git a/tools/xcdat_build.cpp b/tools/xcdat_build.cpp index a71e07b..6930cfd 100644 
--- a/tools/xcdat_build.cpp +++ b/tools/xcdat_build.cpp @@ -1,5 +1,3 @@ -#include - #include #include "cmd_line_parser/parser.hpp" @@ -7,8 +5,8 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_keys", "Input filepath of data keys"); - p.add("output_idx", "Output filepath of trie index"); + p.add("input_keys", "Input filepath of keywords"); + p.add("output_dic", "Output filepath of trie dictionary"); p.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false); p.add("binary_mode", "Is binary mode? (default=0)", "-b", false); return p; @@ -17,7 +15,7 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { template int build(const cmd_line_parser::parser& p) { const auto input_keys = p.get("input_keys"); - const auto output_idx = p.get("output_idx"); + const auto output_dic = p.get("output_dic"); const auto binary_mode = p.get("binary_mode", false); auto keys = xcdat::load_strings(input_keys); @@ -37,7 +35,7 @@ int build(const cmd_line_parser::parser& p) { tfm::printfln("Memory usage in bytes: %d", memory_in_bytes); tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0)); - xcdat::save(trie, output_idx); + xcdat::save(trie, output_dic); return 0; } diff --git a/tools/xcdat_decode.cpp b/tools/xcdat_decode.cpp index 25d77d0..24aa1b8 100644 --- a/tools/xcdat_decode.cpp +++ b/tools/xcdat_decode.cpp @@ -1,20 +1,20 @@ +#include #include #include "cmd_line_parser/parser.hpp" -#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_idx", "Input filepath of trie index"); + p.add("input_dic", "Input filepath of trie dictionary"); return p; } template int decode(const cmd_line_parser::parser& p) { - const auto input_idx = p.get("input_idx"); + const auto input_dic = p.get("input_dic"); - const mm::file_source fin(input_idx.c_str(), mm::advice::sequential); 
+ const mm::file_source fin(input_dic.c_str(), mm::advice::sequential); const auto trie = xcdat::mmap(fin.data()); for (std::uint64_t id; std::cin >> id;) { @@ -36,8 +36,8 @@ int main(int argc, char** argv) { return 1; } - const auto input_idx = p.get("input_idx"); - const auto flag = xcdat::get_flag(input_idx); + const auto input_dic = p.get("input_dic"); + const auto flag = xcdat::get_flag(input_dic); switch (flag) { case 7: diff --git a/tools/xcdat_enumerate.cpp b/tools/xcdat_enumerate.cpp index 9316211..8050c62 100644 --- a/tools/xcdat_enumerate.cpp +++ b/tools/xcdat_enumerate.cpp @@ -1,20 +1,20 @@ +#include #include #include "cmd_line_parser/parser.hpp" -#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_idx", "Input filepath of trie index"); + p.add("input_dic", "Input filepath of trie dictionary"); return p; } template int enumerate(const cmd_line_parser::parser& p) { - const auto input_idx = p.get("input_idx"); + const auto input_dic = p.get("input_dic"); - const mm::file_source fin(input_idx.c_str(), mm::advice::sequential); + const mm::file_source fin(input_dic.c_str(), mm::advice::sequential); const auto trie = xcdat::mmap(fin.data()); trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); }); @@ -33,8 +33,8 @@ int main(int argc, char** argv) { return 1; } - const auto input_idx = p.get("input_idx"); - const auto flag = xcdat::get_flag(input_idx); + const auto input_dic = p.get("input_dic"); + const auto flag = xcdat::get_flag(input_dic); switch (flag) { case 7: diff --git a/tools/xcdat_lookup.cpp b/tools/xcdat_lookup.cpp index ed68298..18650ac 100644 --- a/tools/xcdat_lookup.cpp +++ b/tools/xcdat_lookup.cpp @@ -1,20 +1,20 @@ +#include #include #include "cmd_line_parser/parser.hpp" -#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int 
argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_idx", "Input filepath of trie index"); + p.add("input_dic", "Input filepath of trie dictionary"); return p; } template int lookup(const cmd_line_parser::parser& p) { - const auto input_idx = p.get("input_idx"); + const auto input_dic = p.get("input_dic"); - const mm::file_source fin(input_idx.c_str(), mm::advice::sequential); + const mm::file_source fin(input_dic.c_str(), mm::advice::sequential); const auto trie = xcdat::mmap(fin.data()); for (std::string str; std::getline(std::cin, str);) { @@ -40,8 +40,8 @@ int main(int argc, char** argv) { return 1; } - const auto input_idx = p.get("input_idx"); - const auto flag = xcdat::get_flag(input_idx); + const auto input_dic = p.get("input_dic"); + const auto flag = xcdat::get_flag(input_dic); switch (flag) { case 7: diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp index efaeda8..21fd271 100644 --- a/tools/xcdat_predictive_search.cpp +++ b/tools/xcdat_predictive_search.cpp @@ -1,22 +1,22 @@ +#include #include #include "cmd_line_parser/parser.hpp" -#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_idx", "Input filepath of trie index"); + p.add("input_dic", "Input filepath of trie dictionary"); p.add("max_num_results", "The max number of results (default=10)", "-n", false); return p; } template int predictive_search(const cmd_line_parser::parser& p) { - const auto input_idx = p.get("input_idx"); + const auto input_dic = p.get("input_dic"); const auto max_num_results = p.get("max_num_results", 10); - const mm::file_source fin(input_idx.c_str(), mm::advice::sequential); + const mm::file_source fin(input_dic.c_str(), mm::advice::sequential); const auto trie = xcdat::mmap(fin.data()); struct result_type { @@ -53,8 +53,8 @@ int main(int argc, char** argv) { return 1; } - const auto input_idx = 
p.get("input_idx"); - const auto flag = xcdat::get_flag(input_idx); + const auto input_dic = p.get("input_dic"); + const auto flag = xcdat::get_flag(input_dic); switch (flag) { case 7: diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp index 32abcc7..80752e7 100644 --- a/tools/xcdat_prefix_search.cpp +++ b/tools/xcdat_prefix_search.cpp @@ -1,20 +1,20 @@ +#include #include #include "cmd_line_parser/parser.hpp" -#include "mm_file/mm_file.hpp" #include "tinyformat/tinyformat.h" cmd_line_parser::parser make_parser(int argc, char** argv) { cmd_line_parser::parser p(argc, argv); - p.add("input_idx", "Input filepath of trie index"); + p.add("input_dic", "Input filepath of trie dictionary"); return p; } template int prefix_search(const cmd_line_parser::parser& p) { - const auto input_idx = p.get("input_idx"); + const auto input_dic = p.get("input_dic"); - const mm::file_source fin(input_idx.c_str(), mm::advice::sequential); + const mm::file_source fin(input_dic.c_str(), mm::advice::sequential); const auto trie = xcdat::mmap(fin.data()); struct result_type { @@ -49,8 +49,8 @@ int main(int argc, char** argv) { return 1; } - const auto input_idx = p.get("input_idx"); - const auto flag = xcdat::get_flag(input_idx); + const auto input_dic = p.get("input_dic"); + const auto flag = xcdat::get_flag(input_dic); switch (flag) { case 7: