diff --git a/include/xcdat.hpp b/include/xcdat.hpp index de8085b..25d55a4 100644 --- a/include/xcdat.hpp +++ b/include/xcdat.hpp @@ -32,9 +32,9 @@ template [[maybe_unused]] Trie mmap(const char* address) { mmap_visitor visitor(address); - std::uint32_t flag; - visitor.visit(flag); - XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); + std::uint32_t type_id; + visitor.visit(type_id); + XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different."); Trie idx; visitor.visit(idx); @@ -46,9 +46,9 @@ template [[maybe_unused]] Trie load(const std::string& filepath) { load_visitor visitor(filepath); - std::uint32_t flag; - visitor.visit(flag); - XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); + std::uint32_t type_id; + visitor.visit(type_id); + XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different."); Trie idx; visitor.visit(idx); @@ -56,10 +56,11 @@ template } //! Save the trie dictionary to the file and returns the file size in bytes. +//! The identifier of the trie type will be written in the first 4 bytes. template [[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) { save_visitor visitor(filepath); - visitor.visit(static_cast(Trie::l1_bits)); // flag + visitor.visit(static_cast(Trie::l1_bits)); // identifier visitor.visit(const_cast(idx)); return visitor.bytes(); } @@ -68,14 +69,14 @@ template template [[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) { size_visitor visitor; - visitor.visit(static_cast(Trie::l1_bits)); // flag + visitor.visit(static_cast(Trie::l1_bits)); // identifier visitor.visit(const_cast(idx)); return visitor.bytes(); } -//! Get the flag indicating the trie dictionary type, embedded by the function 'save'. -//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file. -[[maybe_unused]] std::uint32_t get_flag(const std::string& filepath) { +//! Get the identifier of the trie type embedded by the function 'save'. +//! The identifier corresponds to trie::l1_bits and will be used to detect the trie type. +[[maybe_unused]] std::uint32_t get_type_id(const std::string& filepath) { std::ifstream ifs(filepath); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); @@ -84,16 +85,4 @@ template return flag; } -//! Load the keywords from the file. -[[maybe_unused]] std::vector load_strings(const std::string& filepath, char delim = '\n') { - std::ifstream ifs(filepath); - XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); - - std::vector strs; - for (std::string str; std::getline(ifs, str, delim);) { - strs.push_back(str); - } - return strs; -} - } // namespace xcdat diff --git a/include/xcdat/trie.hpp b/include/xcdat/trie.hpp index c6694e5..4948e3d 100644 --- a/include/xcdat/trie.hpp +++ b/include/xcdat/trie.hpp @@ -17,6 +17,7 @@ class trie { using trie_type = trie; using bc_vector_type = BcVector; + //! The identifier of BC vector. static constexpr auto l1_bits = bc_vector_type::l1_bits; private: @@ -59,7 +60,7 @@ class trie { //! - end() returns the iterator to the end. //! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'. template - trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { + trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, bc_vector_type::l1_bits, bin_mode)) { static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); } diff --git a/sample/sample.cpp b/sample/sample.cpp index 0c181c5..f9688f5 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -10,15 +10,15 @@ int main() { "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", }; - // The input keys must be sorted and unique (although they have already satisfied in this case). + // The input keys must be sorted and unique (already satisfied in this case). std::sort(keys.begin(), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); - // The trie dictionary type - using trie_type = xcdat::trie_7_type; - // using trie_type = xcdat::trie_8_type; - // using trie_type = xcdat::trie_15_type; + // The trie dictionary type from the four types + using trie_type = xcdat::trie_8_type; // using trie_type = xcdat::trie_16_type; + // using trie_type = xcdat::trie_7_type; + // using trie_type = xcdat::trie_15_type; // The dictionary filename const char* tmp_filename = "dic.bin"; @@ -35,6 +35,9 @@ int main() { // Load the trie dictionary on memory. const auto trie = xcdat::load(tmp_filename); + // Or, you can set the continuous memory block via a memory-mapped file. + // const auto trie = xcdat::mmap(mapped_data); + // Basic statistics std::cout << "Number of keys: " << trie.num_keys() << std::endl; std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl; @@ -77,7 +80,7 @@ int main() { std::cout << "}" << std::endl; } - // Enumerate all the keys (in lex order). + // Enumerate all the keys (in lexicographical order). { std::cout << "Enumerate() = {" << std::endl; auto itr = trie.make_enumerative_iterator(); diff --git a/tests/test_trie.cpp b/tests/test_trie.cpp index 60410d4..a4fdef8 100644 --- a/tests/test_trie.cpp +++ b/tests/test_trie.cpp @@ -20,6 +20,17 @@ using trie_type = xcdat::trie_15_type; using trie_type = xcdat::trie_16_type; #endif +std::vector load_strings(const std::string& filepath, char delim = '\n') { + std::ifstream ifs(filepath); + XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); + + std::vector strs; + for (std::string str; std::getline(ifs, str, delim);) { + strs.push_back(str); + } + return strs; +} + void test_basic_operations(const trie_type& trie, const std::vector& keys, const std::vector& others) { REQUIRE_EQ(trie.num_keys(), keys.size()); @@ -209,7 +220,7 @@ TEST_CASE("Test trie_type (tiny)") { } TEST_CASE("Test trie_type (real)") { - auto keys = xcdat::test::to_unique_vec(xcdat::load_strings("keys.txt")); + auto keys = xcdat::test::to_unique_vec(load_strings("keys.txt")); auto others = xcdat::test::extract_keys(keys); trie_type trie(keys); diff --git a/tools/xcdat_benchmark.cpp b/tools/xcdat_benchmark.cpp index 9009e40..9c2f6e4 100644 --- a/tools/xcdat_benchmark.cpp +++ b/tools/xcdat_benchmark.cpp @@ -17,6 +17,17 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { return p; } +std::vector load_strings(const std::string& filepath, char delim = '\n') { + std::ifstream ifs(filepath); + XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); + + std::vector strs; + for (std::string str; std::getline(ifs, str, delim);) { + strs.push_back(str); + } + return strs; +} + std::vector sample_keys(const std::vector& keys, std::uint64_t num_samples, std::uint64_t random_seed) { std::mt19937_64 engine(random_seed); @@ -127,7 +138,7 @@ int main(int argc, char** argv) { const auto random_seed = p.get("random_seed", 13); const auto binary_mode = p.get("binary_mode", false); - auto keys = xcdat::load_strings(input_keys); + auto keys = load_strings(input_keys); if (keys.empty()) { tfm::errorfln("Error: The input dataset is empty."); return 1; diff --git a/tools/xcdat_build.cpp b/tools/xcdat_build.cpp index dd5a36d..4119c51 100644 --- a/tools/xcdat_build.cpp +++ b/tools/xcdat_build.cpp @@ -12,13 +12,24 @@ cmd_line_parser::parser make_parser(int argc, char** argv) { return p; } +std::vector load_strings(const std::string& filepath, char delim = '\n') { + std::ifstream ifs(filepath); + XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); + + std::vector strs; + for (std::string str; std::getline(ifs, str, delim);) { + strs.push_back(str); + } + return strs; +} + template int build(const cmd_line_parser::parser& p) { const auto input_keys = p.get("input_keys"); const auto output_dic = p.get("output_dic"); const auto binary_mode = p.get("binary_mode", false); - auto keys = xcdat::load_strings(input_keys); + auto keys = load_strings(input_keys); if (keys.empty()) { tfm::errorfln("Error: The input dataset is empty."); } diff --git a/tools/xcdat_decode.cpp b/tools/xcdat_decode.cpp index 3178335..a6cc6ab 100644 --- a/tools/xcdat_decode.cpp +++ b/tools/xcdat_decode.cpp @@ -37,9 +37,9 @@ int main(int argc, char** argv) { } const auto input_dic = p.get("input_dic"); - const auto flag = xcdat::get_flag(input_dic); + const auto type_id = xcdat::get_type_id(input_dic); - switch (flag) { + switch (type_id) { case 7: return decode(p); case 8: diff --git a/tools/xcdat_enumerate.cpp b/tools/xcdat_enumerate.cpp index 6fdd4a2..bb0f83e 100644 --- a/tools/xcdat_enumerate.cpp +++ b/tools/xcdat_enumerate.cpp @@ -34,9 +34,9 @@ int main(int argc, char** argv) { } const auto input_dic = p.get("input_dic"); - const auto flag = xcdat::get_flag(input_dic); + const auto type_id = xcdat::get_type_id(input_dic); - switch (flag) { + switch (type_id) { case 7: return enumerate(p); case 8: diff --git a/tools/xcdat_lookup.cpp b/tools/xcdat_lookup.cpp index 8599816..4ec3da3 100644 --- a/tools/xcdat_lookup.cpp +++ b/tools/xcdat_lookup.cpp @@ -41,9 +41,9 @@ int main(int argc, char** argv) { } const auto input_dic = p.get("input_dic"); - const auto flag = xcdat::get_flag(input_dic); + const auto type_id = xcdat::get_type_id(input_dic); - switch (flag) { + switch (type_id) { case 7: return lookup(p); case 8: diff --git a/tools/xcdat_predictive_search.cpp b/tools/xcdat_predictive_search.cpp index 0c0ceee..f21f40a 100644 --- a/tools/xcdat_predictive_search.cpp +++ b/tools/xcdat_predictive_search.cpp @@ -54,9 +54,9 @@ int main(int argc, char** argv) { } const auto input_dic = p.get("input_dic"); - const auto flag = xcdat::get_flag(input_dic); + const auto type_id = xcdat::get_type_id(input_dic); - switch (flag) { + switch (type_id) { case 7: return predictive_search(p); case 8: diff --git a/tools/xcdat_prefix_search.cpp b/tools/xcdat_prefix_search.cpp index 05ab65e..d927008 100644 --- a/tools/xcdat_prefix_search.cpp +++ b/tools/xcdat_prefix_search.cpp @@ -50,9 +50,9 @@ int main(int argc, char** argv) { } const auto input_dic = p.get("input_dic"); - const auto flag = xcdat::get_flag(input_dic); + const auto type_id = xcdat::get_type_id(input_dic); - switch (flag) { + switch (type_id) { case 7: return prefix_search(p); case 8: