From 178bbb46d379dedb3de0042bb930277cc427cc12 Mon Sep 17 00:00:00 2001
From: Shunsuke Kanda <shnsk.knd@gmail.com>
Date: Sat, 3 Jul 2021 09:46:04 +0900
Subject: [PATCH] add

---
 CMakeLists.txt                         |   2 -
 README.md                              |  80 +++++++----
 include/xcdat.hpp                      |   6 +
 sample/sample.cpp                      |   9 +-
 {include => tests}/mm_file/mm_file.hpp |   0
 tests/test_trie.cpp                    |   8 ++
 tools/mm_file/mm_file.hpp              | 177 +++++++++++++++++++++++++
 tools/xcdat_decode.cpp                 |   6 +-
 tools/xcdat_enumerate.cpp              |   6 +-
 tools/xcdat_lookup.cpp                 |   6 +-
 tools/xcdat_predictive_search.cpp      |   6 +-
 tools/xcdat_prefix_search.cpp          |   6 +-
 12 files changed, 269 insertions(+), 43 deletions(-)
 rename {include => tests}/mm_file/mm_file.hpp (100%)
 create mode 100644 tools/mm_file/mm_file.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4665e59..4106204 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,5 @@ file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_
 
 # Install the library
 file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
-file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
 install(FILES include/xcdat.hpp DESTINATION include)
 install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
-install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)
diff --git a/README.md b/README.md
index 365cd2c..ed6d442 100644
--- a/README.md
+++ b/README.md
@@ -52,17 +52,19 @@ The library considers a 64-bit operating system. The code has been tested only o
 
  Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`.
 
+The tools employ the external libraries [cmd_line_parser](https://github.com/jermp/cmd_line_parser), [mm_file](https://github.com/jermp/mm_file), and [tinyformat](https://github.com/c42f/tinyformat), which are contained in the repository.
+
 ### `xcdat_build`
 
-It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
+It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `dic.bin`.
 
 ```
-$ xcdat_build enwiki-titles.txt idx.bin
+$ xcdat_build enwiki-titles.txt dic.bin
 Number of keys: 15955763
-Number of trie nodes: 36441058
-Number of DA units: 36520704
-Memory usage in bytes: 1.70618e+08
-Memory usage in MiB: 162.714
+Number of trie nodes: 36439320
+Number of DA units: 36515840
+Memory usage in bytes: 1.64104e+08
+Memory usage in MiB: 156.502
 ```
 
 ### `xcdat_lookup`
@@ -70,7 +72,7 @@ Memory usage in MiB: 162.714
 It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
 
 ```
-$ xcdat_lookup idx.bin
+$ xcdat_lookup dic.bin
 Algorithm
 1255938	Algorithm
 Double_Array
@@ -82,7 +84,7 @@ Double_Array
 It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
 
 ```
-$ xcdat_decode idx.bin
+$ xcdat_decode dic.bin
 1255938
 1255938	Algorithm
 ```
@@ -92,7 +94,7 @@ $ xcdat_decode idx.bin
 It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
 
 ```
-$ xcdat_prefix_search idx.bin
+$ xcdat_prefix_search dic.bin
 Algorithmic
 6 found
 57	A
@@ -108,7 +110,7 @@ Algorithmic
 It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
 
 ```
-$ xcdat_predictive_search idx.bin -n 3
+$ xcdat_predictive_search dic.bin -n 3
 Algorithm
 263 found
 1255938	Algorithm
@@ -121,7 +123,7 @@ Algorithm
 It prints all the keywords stored in a given dictionary.
 
 ```
-$ xcdat_enumerate idx.bin | head -3
+$ xcdat_enumerate dic.bin | head -3
 0	!
 107	!!
 138	!!!
@@ -137,31 +139,44 @@ $ xcdat_benchmark enwiki-titles.txt
 Number of keys: 15955763
 Memory usage in bytes: 1.70618e+08
 Memory usage in MiB: 162.714
-Construction time in seconds: 12.907
-Lookup time in microsec/query: 0.4674
-Decode time in microsec/query: 0.8722
+Construction time in seconds: 13.501
+Lookup time in microsec/query: 0.5708
+Decode time in microsec/query: 1.0846
 ** xcdat::trie_8_type **
 Number of keys: 15955763
 Memory usage in bytes: 1.64104e+08
 Memory usage in MiB: 156.502
-Construction time in seconds: 13.442
-Lookup time in microsec/query: 0.7593
-Decode time in microsec/query: 1.2341
+Construction time in seconds: 13.626
+Lookup time in microsec/query: 0.6391
+Decode time in microsec/query: 1.0531
+** xcdat::trie_15_type **
+Number of keys: 15955763
+Memory usage in bytes: 2.05737e+08
+Memory usage in MiB: 196.206
+Construction time in seconds: 13.425
+Lookup time in microsec/query: 0.3613
+Decode time in microsec/query: 0.7044
+** xcdat::trie_16_type **
+Number of keys: 15955763
+Memory usage in bytes: 2.15935e+08
+Memory usage in MiB: 205.932
+Construction time in seconds: 13.704
+Lookup time in microsec/query: 0.3832
+Decode time in microsec/query: 0.8362
 ```
 
 ## Sample usage
 
-`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file, which will be installed by `make install` together.
+`sample/sample.cpp` provides a sample usage.
 
 ```c++
 #include <iostream>
 #include <string>
 
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 int main() {
-    // Dataset
+    // Dataset of keywords
     std::vector<std::string> keys = {
         "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Air", "MacBook_Pro",
         "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
@@ -183,12 +198,8 @@ int main() {
         xcdat::save(trie, tmp_filename);
     }
 
-    // Memory-map the trie dictionary.
-    const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
-    const auto trie = xcdat::mmap<trie_type>(fin.data());
-
-    // Or, load the trie dictionary on memory.
-    // const auto trie = xcdat::load<trie_type>(tmp_filename);
+    // Load the trie dictionary on memory.
+    const auto trie = xcdat::load<trie_type>(tmp_filename);
 
     // Basic statistics
     std::cout << "Number of keys: " << trie.num_keys() << std::endl;
@@ -293,10 +304,21 @@ Xcdat can be used by including only the header `xcdat.hpp`.
 
 ### Trie dictionary types
 
-The two dictionary types of specialization of class `xcdat::trie` are difined:
+Four dictionary types are defined as specializations of class `xcdat::trie`. The first two types are based on standard DACs by Brisaboa et al. [9]. The last two types are based on pointer-based DACs by Kanda et al. [2].
 
-- `xcdat::trie_8_type` is the trie dictionary using standard DACs [9] using 8-bit integers for elements.
-- `xcdat::trie_7_type` is the trie dictionary using pointer-based DACs [2] using 7-bit integers for elements.
+```c++
+//! The trie type with standard DACs using 8-bit integers
+using trie_8_type = trie<bc_vector_8>;
+
+//! The trie type with standard DACs using 16-bit integers
+using trie_16_type = trie<bc_vector_16>;
+
+//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer)
+using trie_7_type = trie<bc_vector_7>;
+
+//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer)
+using trie_15_type = trie<bc_vector_15>;
+```
 
 ### Trie dictionary class
 
diff --git a/include/xcdat.hpp b/include/xcdat.hpp
index 6b5843d..22a7823 100644
--- a/include/xcdat.hpp
+++ b/include/xcdat.hpp
@@ -12,10 +12,16 @@
 
 namespace xcdat {
 
+//! The trie type with standard DACs using 8-bit integers
 using trie_8_type = trie<bc_vector_8>;
+
+//! The trie type with standard DACs using 16-bit integers
 using trie_16_type = trie<bc_vector_16>;
 
+//! The trie type with pointer-based DACs using 7-bit integers (for the 1st layer)
 using trie_7_type = trie<bc_vector_7>;
+
+//! The trie type with pointer-based DACs using 15-bit integers (for the 1st layer)
 using trie_15_type = trie<bc_vector_15>;
 
 //! Set the continuous memory block to a new trie instance (for a memory-mapped file).
diff --git a/sample/sample.cpp b/sample/sample.cpp
index 9c5b1e3..c6788e4 100644
--- a/sample/sample.cpp
+++ b/sample/sample.cpp
@@ -1,7 +1,6 @@
 #include <iostream>
 #include <string>
 
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 int main() {
@@ -27,12 +26,8 @@ int main() {
         xcdat::save(trie, tmp_filename);
     }
 
-    // Memory-map the trie dictionary.
-    const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
-    const auto trie = xcdat::mmap<trie_type>(fin.data());
-
-    // Or, load the trie dictionary on memory.
-    // const auto trie = xcdat::load<trie_type>(tmp_filename);
+    // Load the trie dictionary on memory.
+    const auto trie = xcdat::load<trie_type>(tmp_filename);
 
     // Basic statistics
     std::cout << "Number of keys: " << trie.num_keys() << std::endl;
diff --git a/include/mm_file/mm_file.hpp b/tests/mm_file/mm_file.hpp
similarity index 100%
rename from include/mm_file/mm_file.hpp
rename to tests/mm_file/mm_file.hpp
diff --git a/tests/test_trie.cpp b/tests/test_trie.cpp
index b59ddf8..60410d4 100644
--- a/tests/test_trie.cpp
+++ b/tests/test_trie.cpp
@@ -135,6 +135,10 @@ void test_io(const trie_type& trie, const std::vector<std::string>& keys, const
         REQUIRE_EQ(trie.num_keys(), loaded.num_keys());
         REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size());
         REQUIRE_EQ(trie.max_length(), loaded.max_length());
+        REQUIRE_EQ(trie.num_nodes(), loaded.num_nodes());
+        REQUIRE_EQ(trie.num_units(), loaded.num_units());
+        REQUIRE_EQ(trie.num_free_units(), loaded.num_free_units());
+        REQUIRE_EQ(trie.tail_length(), loaded.tail_length());
         REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded));
         test_basic_operations(loaded, keys, others);
     }
@@ -146,6 +150,10 @@ void test_io(const trie_type& trie, const std::vector<std::string>& keys, const
         REQUIRE_EQ(trie.num_keys(), mapped.num_keys());
         REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size());
         REQUIRE_EQ(trie.max_length(), mapped.max_length());
+        REQUIRE_EQ(trie.num_nodes(), mapped.num_nodes());
+        REQUIRE_EQ(trie.num_units(), mapped.num_units());
+        REQUIRE_EQ(trie.num_free_units(), mapped.num_free_units());
+        REQUIRE_EQ(trie.tail_length(), mapped.tail_length());
         REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped));
         test_basic_operations(mapped, keys, others);
     }
diff --git a/tools/mm_file/mm_file.hpp b/tools/mm_file/mm_file.hpp
new file mode 100644
index 0000000..b95031d
--- /dev/null
+++ b/tools/mm_file/mm_file.hpp
@@ -0,0 +1,177 @@
+#pragma once
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <type_traits>
+#include <fcntl.h>
+#include <unistd.h>  // close(fd)
+#include <string>
+
+namespace mm {
+
+namespace advice {  // access-pattern hints; file_source::open forwards the chosen value to posix_madvise
+static const int normal = POSIX_MADV_NORMAL;  // no special treatment
+static const int random = POSIX_MADV_RANDOM;  // expect random access
+static const int sequential = POSIX_MADV_SEQUENTIAL;  // expect sequential access
+}  // namespace advice
+
+template <typename T>
+struct file {  // common base of the mapped-file types: owns the descriptor and the mapping, viewed as an array of T
+    file() {
+        init();
+    }
+    // Never throws: an exception escaping this implicitly-noexcept destructor would call std::terminate.
+    ~file() {
+        try { close(); } catch (...) {}
+    }
+
+    file(file const&) = delete;             // non construction-copyable
+    file& operator=(file const&) = delete;  // non copyable
+    // True while m_fd holds a descriptor (anything other than -1).
+    bool is_open() const {
+        return m_fd != -1;
+    }
+    // Unmaps and closes the file; no-op when not open. Throws std::runtime_error if munmap fails.
+    void close() {
+        if (is_open()) {
+            // m_data is still null when the descriptor was opened but mapping failed: there is nothing to unmap.
+            const bool unmapped = m_data == nullptr || munmap((char*)m_data, m_size) != -1;
+            ::close(m_fd);  // release the descriptor even when unmapping failed
+            init();
+            if (!unmapped) throw std::runtime_error("munmap failed when closing file");
+        }
+    }
+    // Size of the mapping in bytes.
+    size_t bytes() const {
+        return m_size;
+    }
+    // Number of whole T elements that fit in the mapping.
+    size_t size() const {
+        return m_size / sizeof(T);
+    }
+    // Pointer to the first mapped element (nullptr when nothing is mapped).
+    T* data() const {
+        return m_data;
+    }
+    // Minimal cursor over the mapped elements: just the operations a range-based for loop needs.
+    struct iterator {
+        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}
+
+        T operator*() {
+            return *m_ptr;
+        }
+
+        void operator++() {
+            ++m_ptr;
+        }
+
+        bool operator==(iterator const& rhs) const {
+            return m_ptr == rhs.m_ptr;
+        }
+
+        bool operator!=(iterator const& rhs) const {
+            return !((*this) == rhs);
+        }
+
+    private:
+        T* m_ptr;
+    };
+
+    iterator begin() const {
+        return iterator(m_data);
+    }
+
+    iterator end() const {
+        return iterator(m_data, size());
+    }
+
+protected:
+    int m_fd;       // file descriptor, -1 when not open
+    size_t m_size;  // length of the mapping in bytes
+    T* m_data;      // start of the mapping, nullptr when nothing is mapped
+    // Resets every member to the "not open" state.
+    void init() {
+        m_fd = -1;
+        m_size = 0;
+        m_data = nullptr;
+    }
+    // Called by subclasses right after ::open to turn a failed open into an exception.
+    void check_fd() {
+        if (m_fd == -1) throw std::runtime_error("cannot open file");
+    }
+};
+
+// Maps the first `size` bytes of `fd` (MAP_SHARED, protection `prot`) and returns the address as `Pointer`.
+template <typename Pointer>
+Pointer mmap(int fd, size_t size, int prot) {
+    static const size_t file_offset = 0;
+    void* const addr = ::mmap(NULL, size, prot, MAP_SHARED, fd, file_offset);
+    if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");
+    return static_cast<Pointer>(addr);
+}
+
+template <typename T>
+struct file_source : public file<T const> {
+    using base = file<T const>;
+
+    file_source() {}
+    // Opens and maps `path` right away; see open() for the failure cases.
+    file_source(std::string const& path, int adv = advice::normal) {
+        open(path, adv);
+    }
+
+    // Maps the whole of `path` read-only and passes `adv` to posix_madvise for the mapped range.
+    // Throws std::runtime_error if the file cannot be opened, stat'ed, mapped or advised.
+    void open(std::string const& path, int adv = advice::normal) {
+        this->m_fd = ::open(path.c_str(), O_RDONLY);
+        this->check_fd();
+        struct stat info;
+        if (fstat(this->m_fd, &info) == -1) throw std::runtime_error("cannot stat file");
+        this->m_size = info.st_size;
+        this->m_data = mmap<T const*>(this->m_fd, this->m_size, PROT_READ);
+        // posix_madvise reports failure through a non-zero return value.
+        const int rc = posix_madvise((void*)this->m_data, this->m_size, adv);
+        if (rc != 0) throw std::runtime_error("madvise failed");
+    }
+};
+
+template <typename T>
+struct file_sink : public file<T> {  // read/write mapping of an existing file or of a freshly sized one
+    typedef file<T> base;
+
+    file_sink() {}
+    // Maps an existing file for reading and writing; see open(path).
+    file_sink(std::string const& path) {
+        open(path);
+    }
+    // Creates (or truncates) `path` to hold n elements of T and maps it; see open(path, n).
+    file_sink(std::string const& path, size_t n) {
+        open(path, n);
+    }
+    // Maps the whole existing file read/write. Throws std::runtime_error if it cannot be opened, stat'ed or mapped.
+    void open(std::string const& path) {
+        static const mode_t mode = 0600;  // read/write
+        base::m_fd = ::open(path.c_str(), O_RDWR, mode);
+        base::check_fd();
+        struct stat fs;
+        if (fstat(base::m_fd, &fs) == -1) {
+            throw std::runtime_error("cannot stat file");
+        }
+        base::m_size = fs.st_size;
+        base::m_data =
+            mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
+    }
+    // Creates or truncates `path`, resizes it to n * sizeof(T) bytes and maps it read/write.
+    void open(std::string const& path, size_t n) {
+        static const mode_t mode = 0600;  // read/write
+        base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
+        base::check_fd();
+        base::m_size = n * sizeof(T);
+        // The file must really have the mapped size: touching pages past end-of-file raises SIGBUS.
+        if (ftruncate(base::m_fd, base::m_size) == -1) throw std::runtime_error("cannot resize file");
+        base::m_data =
+            mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
+    }
+};
+
+}  // namespace mm
\ No newline at end of file
diff --git a/tools/xcdat_decode.cpp b/tools/xcdat_decode.cpp
index 24aa1b8..3178335 100644
--- a/tools/xcdat_decode.cpp
+++ b/tools/xcdat_decode.cpp
@@ -1,7 +1,7 @@
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 #include "cmd_line_parser/parser.hpp"
+#include "mm_file/mm_file.hpp"
 #include "tinyformat/tinyformat.h"
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
@@ -44,6 +44,10 @@ int main(int argc, char** argv) {
             return decode<xcdat::trie_7_type>(p);
         case 8:
             return decode<xcdat::trie_8_type>(p);
+        case 15:
+            return decode<xcdat::trie_15_type>(p);
+        case 16:
+            return decode<xcdat::trie_16_type>(p);
         default:
             break;
     }
diff --git a/tools/xcdat_enumerate.cpp b/tools/xcdat_enumerate.cpp
index 8050c62..6fdd4a2 100644
--- a/tools/xcdat_enumerate.cpp
+++ b/tools/xcdat_enumerate.cpp
@@ -1,7 +1,7 @@
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 #include "cmd_line_parser/parser.hpp"
+#include "mm_file/mm_file.hpp"
 #include "tinyformat/tinyformat.h"
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
@@ -41,6 +41,10 @@ int main(int argc, char** argv) {
             return enumerate<xcdat::trie_7_type>(p);
         case 8:
             return enumerate<xcdat::trie_8_type>(p);
+        case 15:
+            return enumerate<xcdat::trie_15_type>(p);
+        case 16:
+            return enumerate<xcdat::trie_16_type>(p);
         default:
             break;
     }
diff --git a/tools/xcdat_lookup.cpp b/tools/xcdat_lookup.cpp
index 18650ac..8599816 100644
--- a/tools/xcdat_lookup.cpp
+++ b/tools/xcdat_lookup.cpp
@@ -1,7 +1,7 @@
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 #include "cmd_line_parser/parser.hpp"
+#include "mm_file/mm_file.hpp"
 #include "tinyformat/tinyformat.h"
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
@@ -48,6 +48,10 @@ int main(int argc, char** argv) {
             return lookup<xcdat::trie_7_type>(p);
         case 8:
             return lookup<xcdat::trie_8_type>(p);
+        case 15:
+            return lookup<xcdat::trie_15_type>(p);
+        case 16:
+            return lookup<xcdat::trie_16_type>(p);
         default:
             break;
     }
diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp
index 21fd271..0c0ceee 100644
--- a/tools/xcdat_predictive_search.cpp
+++ b/tools/xcdat_predictive_search.cpp
@@ -1,7 +1,7 @@
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 #include "cmd_line_parser/parser.hpp"
+#include "mm_file/mm_file.hpp"
 #include "tinyformat/tinyformat.h"
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
@@ -61,6 +61,10 @@ int main(int argc, char** argv) {
             return predictive_search<xcdat::trie_7_type>(p);
         case 8:
             return predictive_search<xcdat::trie_8_type>(p);
+        case 15:
+            return predictive_search<xcdat::trie_15_type>(p);
+        case 16:
+            return predictive_search<xcdat::trie_16_type>(p);
         default:
             break;
     }
diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp
index 80752e7..05ab65e 100644
--- a/tools/xcdat_prefix_search.cpp
+++ b/tools/xcdat_prefix_search.cpp
@@ -1,7 +1,7 @@
-#include <mm_file/mm_file.hpp>
 #include <xcdat.hpp>
 
 #include "cmd_line_parser/parser.hpp"
+#include "mm_file/mm_file.hpp"
 #include "tinyformat/tinyformat.h"
 
 cmd_line_parser::parser make_parser(int argc, char** argv) {
@@ -57,6 +57,10 @@ int main(int argc, char** argv) {
             return prefix_search<xcdat::trie_7_type>(p);
         case 8:
             return prefix_search<xcdat::trie_8_type>(p);
+        case 15:
+            return prefix_search<xcdat::trie_15_type>(p);
+        case 16:
+            return prefix_search<xcdat::trie_16_type>(p);
         default:
             break;
     }