diff --git a/.gitignore b/.gitignore index 7636d73..84ce414 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ build/ cmake-build-debug/ .idea/ .DS_Store +src/xcdat_config.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f939135..42e5fcd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,22 +1,74 @@ cmake_minimum_required(VERSION 2.8) project(XCDAT) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11") -if(NOT CMAKE_BUILD_TYPE) +if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) -endif() +endif () -if(NOT XCDAT64) - set(XCDAT64 OFF) -endif() +configure_file( + ${XCDAT_SOURCE_DIR}/xcdat_config.hpp.in + ${XCDAT_SOURCE_DIR}/src/xcdat_config.hpp +) -if(XCDAT64) - add_definitions(-DXCDAT64) -endif() +option(XCDAT_X64 + "Use 64-bit integers to represent nodes." + OFF) + +option(XCDAT_USE_POPCNT + "Use popcount intrinsic. Available on x86-64 since SSE4.2." + OFF) + +if (XCDAT_USE_POPCNT) + if (UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + endif () +endif () message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}") -message(STATUS "XCDAT64 is ${XCDAT64}") +message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}") +message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}") +message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}") +message(STATUS "XCDAT_X64 is ${XCDAT_X64}") +message(STATUS "XCDAT_USE_POPCNT is ${XCDAT_USE_POPCNT}") + +set(HEADER_FILES + src/BitVector.hpp + src/BitVectorBuilder.hpp + src/DacBc.hpp + src/FastDacBc.hpp + src/FitVector.hpp + src/Trie.hpp + src/TrieBuilder.hpp + src/Vector.hpp + src/xcdat_basics.hpp + src/xcdat_config.hpp + ) + +set(SOURCE_FILES + src/BitVector.cpp + src/DacBc.cpp + src/FitVector.cpp + src/TrieBuilder.cpp + src/FastDacBc.cpp + ) + +add_library(xcdat STATIC ${HEADER_FILES} ${SOURCE_FILES}) + +add_executable(xcdat-exe src/xcdat.cpp) +set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat) +target_link_libraries(xcdat-exe xcdat) enable_testing() -add_subdirectory(src) +file(GLOB TEST_SOURCES src/test*.cpp) +foreach(TEST_SOURCE ${TEST_SOURCES}) + get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE) + add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE}) + target_link_libraries(${TEST_SOURCE_NAME} xcdat) + add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME}) +endforeach() + +INSTALL(FILES ${HEADER_FILES} DESTINATION include/xcdat) +INSTALL(TARGETS xcdat ARCHIVE DESTINATION lib) +INSTALL(TARGETS xcdat-exe RUNTIME DESTINATION bin) \ No newline at end of file diff --git a/LICENSE b/LICENSE index 6eac522..d750e37 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016 Shunsuke Kanda +Copyright (c) 2017 Shunsuke Kanda Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 2873d39..d59ffa1 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Xcdat is a C++ library that implements static compressed dictionaries based on a The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double array (XCDA) methods described in -- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. _Knowledge and Information Systems_, Online first. [[doi](http://dx.doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://kamp78.github.io/pdf/KAIS16_preprint.pdf)] +- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. _Knowledge and Information Systems_, 51(3): 1023–1042, 2017. [[doi](http://dx.doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://sites.google.com/site/shnskknd/KAIS2016.pdf)] Therefore, Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage. @@ -16,9 +16,9 @@ In addition, the lookup speed is relatively fast in compressed data structures f - **Two compression versions.** There are two versions for compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2016). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions. -- **64-bit version.** Although Xcdat represents array addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining `XCDAT64`; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal. +- **64-bit version.** Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining `XCDAT_X64`; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal. -- **NULL character.** The dictionary can be constructed from keys including the NULL character by setting the second parameter of the [xcdat::Trie](https://github.com/kamp78/xcdat/blob/master/src/Trie.hpp) constructer to `true`. However, we can generally construct high-performance dictionaries without this setting. +- **NULL character.** The dictionary can be constructed from keys including the NULL character by setting the second parameter of `xcdat::TrieBuilder::build()` to `true`. - **Invertible dictionary coding.** Xcdat supports mapping N different strings to unique IDs in [0,N). That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string precessing and indexing, such as described in (Martínez-Prieto et al., 2016). @@ -35,13 +35,44 @@ $ git clone https://github.com/kamp78/xcdat.git $ cd xcdat $ mkdir build $ cd build -$ cmake .. -DCMAKE_BUILD_TYPE=Release -DXCDAT64=OFF +$ cmake .. $ make +$ make install ``` +If you want to use a 64-bit setting, please add `-DXCDAT_X64=ON` to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding `-DXCDAT_USE_POPCNT=ON` for Rank/Select operations. + +The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS. + + +## Command Line Tool + +`xcdat` is a general-purpose command line tool to provide three modes as follows: + +``` +$ xcdat +xcdat build + '1' for DACs; '2' for FDACs. + input file of a set of keys. + output file of the dictionary. +xcdat query + '1' for DACs; '2' for FDACs. + input file of the dictionary. + limit at lookup (default=10). +xcdat bench + '1' for DACs; '2' for FDACs. + input file of the dictionary. + input file of keys for benchmark. +``` + + ## API -Refer to the header comments of [xcdat::Trie](https://github.com/kamp78/xcdat/blob/master/src/Trie.hpp). +You can build a dictionary using `xcdat::TrieBuilder::build()`. +This static function receives a set of keywords and returns the resulting class object of `xcdat::Trie`. +For the usage, refer to the header comments of [xcdat::TrieBuilder.hpp](https://github.com/kamp78/xcdat/blob/master/src/TrieBuilder.hpp). +Also for the usage of `xcdat::Trie`, refer to the header comments of [xcdat::Trie](https://github.com/kamp78/xcdat/blob/master/src/Trie.hpp). +If you want to get specific usage examples, refer to the source code of [xcdat.cpp](https://github.com/kamp78/xcdat/blob/master/src/xcdat.cpp). ## Benchmark @@ -50,14 +81,14 @@ WIP ## Future work - Show benchmarks -- Implement faster operations +- Support faster operations - Clear up source codes -- Set install opetions +- Extend results returned from prefix operations ## References - J. Aoe. An efficient digital search algorithm by using a double-array structure. _IEEE Transactions on Software Engineering_, 15(9):1066–1077, 1989. - N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. _Information Processing & Management_, 49(1):392–404, 2013. -- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. _Knowledge and Information Systems_, Online first. +- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. _Knowledge and Information Systems_, 51(3): 1023–1042, 2017. - M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. _Information Systems_, 56:73–108, 2016. - S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. _Information Processing & Management_, 43(1):237–247, 2007. diff --git a/src/BitVector.cpp b/src/BitVector.cpp index 8efae81..4c3b751 100644 --- a/src/BitVector.cpp +++ b/src/BitVector.cpp @@ -1,3 +1,5 @@ +#include + #include "BitVector.hpp" namespace xcdat { @@ -169,12 +171,24 @@ constexpr uint8_t kSelectTable[9][256] = { }; uint32_t pop_count(uint32_t bits) { +#ifdef XCDAT_USE_POPCNT + return static_cast(_mm_popcnt_u32(bits)); +#else bits = ((bits & 0xAAAAAAAA) >> 1) + (bits & 0x55555555); bits = ((bits & 0xCCCCCCCC) >> 2) + (bits & 0x33333333); bits = ((bits >> 4) + bits) & 0x0F0F0F0F; bits += bits >> 8; bits += bits >> 16; return bits & 0x3F; +#endif +} + +BitVector::BitVector(std::istream& is) { + bits_ = Vector(is); + rank_tips_ = Vector(is); + select_tips_ = Vector(is); + size_ = read_value(is); + num_1s_ = read_value(is); } BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag) { @@ -182,7 +196,7 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag return; } - bits_.steal(builder.bits_); + bits_ = Vector(builder.bits_); size_ = builder.size_; num_1s_ = builder.num_1s_; @@ -201,7 +215,7 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag } } } - rank_tips_.steal(rank_tips); + rank_tips_ = Vector(rank_tips); } // builds select_tips_ @@ -215,7 +229,7 @@ BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag } } select_tips.push_back(static_cast(rank_tips_.size() - 1)); - select_tips_.steal(select_tips); + select_tips_ = Vector(select_tips); } } @@ -229,7 +243,7 @@ id_type BitVector::select(size_t i) const { id_type left = 0, right = static_cast(rank_tips_.size()); if (!select_tips_.is_empty()) { - id_type select_tip_id = i / kNum1sPerTip; + auto select_tip_id = static_cast(i / kNum1sPerTip); left = select_tips_[select_tip_id]; right = select_tips_[select_tip_id + 1] + 1; } @@ -296,20 +310,4 @@ void BitVector::write(std::ostream& os) const { write_value(num_1s_, os); } -void BitVector::read(std::istream& is) { - bits_.read(is); - rank_tips_.read(is); - select_tips_.read(is); - read_value(size_, is); - read_value(num_1s_, is); -} - -void BitVector::swap(BitVector& rhs) { - bits_.swap(rhs.bits_); - rank_tips_.swap(rhs.rank_tips_); - select_tips_.swap(rhs.select_tips_); - std::swap(size_, rhs.size_); - std::swap(num_1s_, rhs.num_1s_); -} - } //namespace - xcdat diff --git a/src/BitVector.hpp b/src/BitVector.hpp index 6702b26..c0ac054 100644 --- a/src/BitVector.hpp +++ b/src/BitVector.hpp @@ -6,13 +6,11 @@ namespace xcdat { -/* - * Bit vector supporting Rank/Select operations. - * */ +// Bit vector supporting Rank/Select operations. class BitVector { public: BitVector() {} - // builder.width_ is stolen. + BitVector(std::istream &is); BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag); ~BitVector() {} @@ -21,8 +19,10 @@ public: return (bits_[i / 32] & (1U << (i % 32))) != 0; } - id_type rank(size_t i) const; // the number of 1s in B[0,i). - id_type select(size_t i) const; // the position of the i+1 th occurrence. + // the number of 1s in B[0,i). + id_type rank(size_t i) const; + // the position of the i+1 th occurrence. + id_type select(size_t i) const; size_t num_1s() const { return num_1s_; @@ -31,19 +31,20 @@ public: return size_ - num_1s_; } - size_t size() const { // the number of bits + // the number of bits + size_t size() const { return size_; } size_t size_in_bytes() const; void write(std::ostream &os) const; - void read(std::istream &is); - - void swap(BitVector& rhs); BitVector(const BitVector&) = delete; BitVector& operator=(const BitVector&) = delete; + BitVector(BitVector&&) = default; + BitVector& operator=(BitVector&&) = default; + private: static constexpr id_type kBitsInR1 = 256; static constexpr id_type kBitsInR2 = 32; @@ -52,7 +53,7 @@ private: struct RankTip { id_type L1; - std::array L2; + uint8_t L2[kR1PerR2]; }; Vector bits_; diff --git a/src/BitVectorBuilder.hpp b/src/BitVectorBuilder.hpp index 85545b3..74ff448 100644 --- a/src/BitVectorBuilder.hpp +++ b/src/BitVectorBuilder.hpp @@ -1,27 +1,23 @@ #ifndef XCDAT_BIT_VECTOR_BUILDER_HPP_ #define XCDAT_BIT_VECTOR_BUILDER_HPP_ -#include "xcdatBasics.hpp" +#include "xcdat_basics.hpp" namespace xcdat { -/* - * Bit pool for building BitVector. - * */ +// Bit pool for building BitVector. class BitVectorBuilder { public: friend class BitVector; BitVectorBuilder() {} - BitVectorBuilder(size_t size) { - resize(size); - } + BitVectorBuilder(size_t size) { resize(size); } ~BitVectorBuilder() {} void push_back(bool bit) { if (size_ % 32 == 0) { - bits_.emplace_back(0); + bits_.push_back(0); } if (bit) { set_bit(size_, true); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index b0c9366..0000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -set(HEADER_FILES - BitVector.hpp - BitVectorBuilder.hpp - DacBc.hpp - FastDacBc.hpp - FitVector.hpp - Trie.hpp - TrieBuilder.hpp - Vector.hpp - xcdatBasics.hpp - ) - -set(SOURCE_FILES - ${HEADER_FILES} - BitVector.cpp - DacBc.cpp - FitVector.cpp - TrieBuilder.cpp - FastDacBc.cpp - ) - -add_library(xcdat STATIC ${SOURCE_FILES}) - -add_executable(xcdat-exe xcdat.cpp) -set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat) -target_link_libraries(xcdat-exe xcdat) - -enable_testing() -file(GLOB TEST_SOURCES test*.cpp) -foreach(TEST_SOURCE ${TEST_SOURCES}) - get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE) - add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE}) - target_link_libraries(${TEST_SOURCE_NAME} xcdat) - add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME}) -endforeach() diff --git a/src/DacBc.cpp b/src/DacBc.cpp index 86b0498..be81ab9 100644 --- a/src/DacBc.cpp +++ b/src/DacBc.cpp @@ -4,16 +4,29 @@ namespace xcdat { +DacBc::DacBc(std::istream& is) { + for (size_t i = 0; i < sizeof(id_type); ++i) { + values_[i] = Vector(is); + } + for (size_t i = 0; i < sizeof(id_type) - 1; ++i) { + flags_[i] = BitVector(is); + } + leaf_flags_ = BitVector(is); + links_ = FitVector(is); + max_level_ = read_value(is); + num_free_nodes_ = read_value(is); +} + DacBc::DacBc(const std::vector& bc, BitVectorBuilder& leaf_flags) { if (bc.empty()) { return; } - std::array, sizeof(id_type)> values; - std::array flags; + std::vector values[sizeof(id_type)]; + BitVectorBuilder flags[sizeof(id_type)]; std::vector links; - BitVector(leaf_flags, true, false).swap(leaf_flags_); + leaf_flags_ = BitVector(leaf_flags, true, false); values[0].reserve(bc.size() * 2); flags[0].reserve(bc.size() * 2); @@ -56,11 +69,11 @@ DacBc::DacBc(const std::vector& bc, BitVectorBuilder& leaf_flags) { // release for (uint8_t i = 0; i < max_level_; ++i) { - values_[i].steal(values[i]); - BitVector(flags[i], true, false).swap(flags_[i]); + values_[i] = Vector(values[i]); + flags_[i] = BitVector(flags[i], true, false); } - values_[max_level_].steal(values[max_level_]); - FitVector(links).swap(links_); + values_[max_level_] = Vector(values[max_level_]); + links_ = FitVector(links); } size_t DacBc::size_in_bytes() const { @@ -111,32 +124,6 @@ void DacBc::write(std::ostream& os) const { write_value(num_free_nodes_, os); } -void DacBc::read(std::istream& is) { - for (auto& values : values_) { - values.read(is); - } - for (auto& flags : flags_) { - flags.read(is); - } - leaf_flags_.read(is); - links_.read(is); - read_value(max_level_, is); - read_value(num_free_nodes_, is); -} - -void DacBc::swap(DacBc& rhs) { - for (uint32_t i = 0; i < values_.size(); ++i) { - values_[i].swap(rhs.values_[i]); - } - for (uint32_t i = 0; i < flags_.size(); ++i) { - flags_[i].swap(rhs.flags_[i]); - } - leaf_flags_.swap(rhs.leaf_flags_); - links_.swap(rhs.links_); - std::swap(max_level_, rhs.max_level_); - std::swap(num_free_nodes_, rhs.num_free_nodes_); -} - id_type DacBc::access_(id_type i) const { uint8_t level = 0; id_type value = values_[level][i]; diff --git a/src/DacBc.hpp b/src/DacBc.hpp index 548f789..465b62f 100644 --- a/src/DacBc.hpp +++ b/src/DacBc.hpp @@ -6,14 +6,13 @@ namespace xcdat { -/* - * BASE/CHECK representation using byte-oriented DACs. - * */ +// BASE/CHECK representation using byte-oriented DACs. class DacBc { public: static constexpr id_type kWidthL1 = 8; DacBc() {} + DacBc(std::istream &is); DacBc(const std::vector& bc, BitVectorBuilder& leaf_flags); ~DacBc() {} @@ -49,16 +48,16 @@ public: void show_stat(std::ostream &os) const; void write(std::ostream &os) const; - void read(std::istream &is); - - void swap(DacBc& rhs); DacBc(const DacBc&) = delete; DacBc& operator=(const DacBc&) = delete; + DacBc(DacBc&&) = default; + DacBc& operator=(DacBc&&) = default; + private: - std::array, sizeof(id_type)> values_; - std::array flags_; + Vector values_[sizeof(id_type)]; + BitVector flags_[sizeof(id_type) - 1]; BitVector leaf_flags_; FitVector links_; uint8_t max_level_ = 0; diff --git a/src/FastDacBc.cpp b/src/FastDacBc.cpp index 8562b71..c77e24a 100644 --- a/src/FastDacBc.cpp +++ b/src/FastDacBc.cpp @@ -2,6 +2,21 @@ namespace xcdat { +FastDacBc::FastDacBc(std::istream& is) { + values_L1_ = Vector(is); + values_L2_ = Vector(is); + values_L3_ = Vector(is); +#ifdef XCDAT_X64 + values_L4_ = Vector(is); +#endif + for (size_t i = 0; i < kLayers - 1; ++i) { + ranks_[i] = Vector(is); + } + leaf_flags_ = BitVector(is); + links_ = FitVector(is); + num_free_nodes_ = read_value(is); +} + FastDacBc::FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags) { if (bc.empty()) { return; @@ -10,13 +25,12 @@ FastDacBc::FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags std::vector values_L1; std::vector values_L2; std::vector values_L3; -#ifdef XCDAT64 +#ifdef XCDAT_X64 std::vector values_L4; #endif - std::array, kLayers - 1> ranks; - + std::vector ranks[kLayers - 1]; std::vector links; - BitVector(leaf_flags, true, false).swap(leaf_flags_); + leaf_flags_ = BitVector(leaf_flags, true, false); ranks[0].reserve((bc.size() * 2) / 128); @@ -43,7 +57,7 @@ FastDacBc::FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags values_L2.push_back(static_cast(1 | (pos << 1))); } -#ifdef XCDAT64 +#ifdef XCDAT_X64 if ((values_L3.size() % kBlockLenL3) == 0) { ranks[1].push_back(static_cast(values_L4.size())); } @@ -82,16 +96,16 @@ FastDacBc::FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags } // release - values_L1_.steal(values_L1); - values_L2_.steal(values_L2); - values_L3_.steal(values_L3); -#ifdef XCDAT64 - values_L4_.steal(values_L4); + values_L1_ = Vector(values_L1); + values_L2_ = Vector(values_L2); + values_L3_ = Vector(values_L3); +#ifdef XCDAT_X64 + values_L4_ = Vector(values_L4); #endif - for (uint8_t j = 0; j < ranks.size(); ++j) { - ranks_[j].steal(ranks[j]); + for (uint8_t j = 0; j < kLayers - 1; ++j) { + ranks_[j] = Vector(ranks[j]); } - FitVector(links).swap(links_); + links_ = FitVector(links); } size_t FastDacBc::size_in_bytes() const { @@ -99,7 +113,7 @@ size_t FastDacBc::size_in_bytes() const { ret += values_L1_.size_in_bytes(); ret += values_L2_.size_in_bytes(); ret += values_L3_.size_in_bytes(); -#ifdef XCDAT64 +#ifdef XCDAT_X64 ret += values_L4_.size_in_bytes(); #endif for (auto& ranks : ranks_) { @@ -120,12 +134,12 @@ void FastDacBc::show_stat(std::ostream& os) const { show_size_ratio("\tvalues_L1:", values_L1_.size_in_bytes(), total_size, os); show_size_ratio("\tvalues_L2:", values_L2_.size_in_bytes(), total_size, os); show_size_ratio("\tvalues_L3:", values_L3_.size_in_bytes(), total_size, os); -#ifdef XCDAT64 +#ifdef XCDAT_X64 show_size_ratio("\tvalues_L4:", values_L4_.size_in_bytes(), total_size, os); #endif show_size_ratio("\tranks_L1: ", ranks_[0].size_in_bytes(), total_size, os); show_size_ratio("\tranks_L2: ", ranks_[1].size_in_bytes(), total_size, os); -#ifdef XCDAT64 +#ifdef XCDAT_X64 show_size_ratio("\tranks_L3: ", ranks_[2].size_in_bytes(), total_size, os); #endif show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os); @@ -136,7 +150,7 @@ void FastDacBc::write(std::ostream& os) const { values_L1_.write(os); values_L2_.write(os); values_L3_.write(os); -#ifdef XCDAT64 +#ifdef XCDAT_X64 values_L4_.write(os); #endif for (auto& ranks : ranks_) { @@ -147,36 +161,6 @@ void FastDacBc::write(std::ostream& os) const { write_value(num_free_nodes_, os); } -void FastDacBc::read(std::istream& is) { - values_L1_.read(is); - values_L2_.read(is); - values_L3_.read(is); -#ifdef XCDAT64 - values_L4_.read(is); -#endif - for (auto& ranks : ranks_) { - ranks.read(is); - } - leaf_flags_.read(is); - links_.read(is); - read_value(num_free_nodes_, is); -} - -void FastDacBc::swap(FastDacBc& rhs) { - values_L1_.swap(rhs.values_L1_); - values_L2_.swap(rhs.values_L2_); - values_L3_.swap(rhs.values_L3_); -#ifdef XCDAT64 - values_L4_.swap(rhs.values_L4_); -#endif - for (uint32_t j = 0; j < ranks_.size(); ++j) { - ranks_[j].swap(rhs.ranks_[j]); - } - leaf_flags_.swap(rhs.leaf_flags_); - links_.swap(rhs.links_); - std::swap(num_free_nodes_, rhs.num_free_nodes_); -} - id_type FastDacBc::access_(id_type i) const { uint32_t value = values_L1_[i] >> 1; if ((values_L1_[i] & 1U) == 0) { @@ -188,7 +172,7 @@ id_type FastDacBc::access_(id_type i) const { return value; } i = ranks_[1][i / kBlockLenL2] + value; -#ifdef XCDAT64 +#ifdef XCDAT_X64 value = values_L3_[i] >> 1; if ((values_L3_[i] & 1U) == 0) { return value; diff --git a/src/FastDacBc.hpp b/src/FastDacBc.hpp index 6336355..ab17837 100644 --- a/src/FastDacBc.hpp +++ b/src/FastDacBc.hpp @@ -9,13 +9,11 @@ namespace xcdat { -/* - * BASE/CHECK representation using pointer-based byte-oriented DACs. - * */ +// BASE/CHECK representation using pointer-based byte-oriented DACs. class FastDacBc { public: static constexpr id_type kWidthL1 = 7; -#ifdef XCDAT64 +#ifdef XCDAT_X64 static constexpr uint8_t kLayers = 4; #else static constexpr uint8_t kLayers = 3; @@ -23,11 +21,12 @@ public: static constexpr id_type kBlockLenL1 = 1U << 7; static constexpr id_type kBlockLenL2 = 1U << 15; -#ifdef XCDAT64 +#ifdef XCDAT_X64 static constexpr id_type kBlockLenL3 = 1U << 31; #endif FastDacBc() {} + FastDacBc(std::istream& is); FastDacBc(const std::vector& bc, BitVectorBuilder& leaf_flags); ~FastDacBc() {} @@ -60,26 +59,23 @@ public: } size_t size_in_bytes() const; - void show_stat(std::ostream& os) const; - void write(std::ostream& os) const; - void read(std::istream& is); - - void swap(FastDacBc& rhs); FastDacBc(const FastDacBc&) = delete; FastDacBc& operator=(const FastDacBc&) = delete; + FastDacBc(FastDacBc&&) = default; + FastDacBc& operator=(FastDacBc&&) = default; + private: Vector values_L1_; Vector values_L2_; Vector values_L3_; -#ifdef XCDAT64 +#ifdef XCDAT_X64 Vector values_L4_; #endif - std::array, kLayers - 1> ranks_; - + Vector ranks_[kLayers - 1]; BitVector leaf_flags_; FitVector links_; size_t num_free_nodes_ = 0; diff --git a/src/FitVector.cpp b/src/FitVector.cpp index f0516f4..484af17 100644 --- a/src/FitVector.cpp +++ b/src/FitVector.cpp @@ -2,36 +2,43 @@ namespace xcdat { -FitVector::FitVector(const std::vector& integers) { - if (integers.empty()) { +FitVector::FitVector(std::istream& is) { + chunks_ = Vector(is); + size_= read_value(is); + width_ = read_value(is); + mask_ = read_value(is); +} + +FitVector::FitVector(const std::vector& values) { + if (values.empty()) { return; } width_ = 0; - auto max_value = *std::max_element(std::begin(integers), std::end(integers)); + auto max_value = *std::max_element(std::begin(values), std::end(values)); do { ++width_; max_value >>= 1; } while (max_value); - size_ = integers.size(); + size_ = values.size(); mask_ = (1U << width_) - 1; std::vector chunks(size_ * width_ / kChunkWidth + 1, 0); for (id_type i = 0; i < size_; ++i) { - const auto chunk_pos = i * width_ / kChunkWidth; - const auto offset = i * width_ % kChunkWidth; + const auto chunk_pos = static_cast(i * width_ / kChunkWidth); + const auto offset = static_cast(i * width_ % kChunkWidth); chunks[chunk_pos] &= ~(mask_ << offset); - chunks[chunk_pos] |= (integers[i] & mask_) << offset; + chunks[chunk_pos] |= (values[i] & mask_) << offset; if (kChunkWidth < offset + width_) { chunks[chunk_pos + 1] &= ~(mask_ >> (kChunkWidth - offset)); - chunks[chunk_pos + 1] |= (integers[i] & mask_) >> (kChunkWidth - offset); + chunks[chunk_pos + 1] |= (values[i] & mask_) >> (kChunkWidth - offset); } } - chunks_.steal(chunks); + chunks_ = Vector(chunks); } size_t FitVector::size_in_bytes() const { @@ -50,18 +57,4 @@ void FitVector::write(std::ostream& os) const { write_value(mask_, os); } -void FitVector::read(std::istream& is) { - chunks_.read(is); - read_value(size_, is); - read_value(width_, is); - read_value(mask_, is); -} - -void FitVector::swap(FitVector& rhs) { - chunks_.swap(rhs.chunks_); - std::swap(size_, rhs.size_); - std::swap(width_, rhs.width_); - std::swap(mask_, rhs.mask_); -} - } //namespace - xcdat diff --git a/src/FitVector.hpp b/src/FitVector.hpp index 0f8369e..2370751 100644 --- a/src/FitVector.hpp +++ b/src/FitVector.hpp @@ -5,21 +5,20 @@ namespace xcdat { -/* - * Compressed integer vector. - * */ +// Compacted integer vector. class FitVector { public: static constexpr id_type kChunkWidth = sizeof(id_type) * 8; FitVector() {} - FitVector(const std::vector& integers); + FitVector(std::istream &is); + FitVector(const std::vector& values); ~FitVector() {} id_type operator[](size_t i) const { - id_type chunk_pos = i * width_ / kChunkWidth; - id_type offset = i * width_ % kChunkWidth; + auto chunk_pos = static_cast(i * width_ / kChunkWidth); + auto offset = static_cast(i * width_ % kChunkWidth); if (offset + width_ <= kChunkWidth) { return (chunks_[chunk_pos] >> offset) & mask_; } else { @@ -32,13 +31,13 @@ public: size_t size_in_bytes() const; void write(std::ostream &os) const; - void read(std::istream &is); - - void swap(FitVector& rhs); FitVector(const FitVector&) = delete; FitVector& operator=(const FitVector&) = delete; + FitVector(FitVector&&) = default; + FitVector& operator=(FitVector&&) = default; + private: Vector chunks_; size_t size_ = 0; diff --git a/src/Trie.hpp b/src/Trie.hpp index c662f0a..a295502 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -3,250 +3,242 @@ #include "DacBc.hpp" #include "FastDacBc.hpp" -#include "TrieBuilder.hpp" namespace xcdat { -constexpr auto kNotFound = static_cast(-1); -constexpr auto kDefaultLimit = static_cast(-1); +constexpr auto kNotFound = kIdMax; -/* - * Compressed string dictionary using an improved double-array trie. There are two versions for - * representing BASE/CHECK arrays. - * @param Fast: the version of DACs representing BASE/CHECK arrays. - * */ +// Compressed string dictionary using an improved double-array trie. There are two versions of DACs +// representing BASE/CHECK arrays, selected with the Fast parameter. template class Trie { public: using Type = Trie; - using BcType = Conditional; + using BcType = typename std::conditional::type; - /* - * Generic constructor. - * */ + // Generic constructor. Trie() {} - /* - * Builds the dictionary from given string keys. The keys must be sorted in lexicographical order - * without duplication. Any error in construction is reported by TrieBuilder::Exception. - * @param keys: the pairs of key pointers and lengths - * @param binary_mode: whether the keys include the ASCII zero code or not - * */ - Trie(const std::vector>& keys, bool binary_mode = false) { - TrieBuilder builder(keys, BcType::kWidthL1, false); - - BcType(builder.bc_, builder.leaf_flags_).swap(bc_); - BitVector(builder.term_flags_, true, true).swap(terminal_flags_); - tail_.steal(builder.tail_); - BitVector(builder.boundary_flags_, false, false).swap(boundary_flags_); - alphabet_.steal(builder.alphabet_); - table_ = builder.table_; - - num_keys_ = keys.size(); - max_length_ = builder.max_length_; - binary_mode_ = builder.binary_mode_; + // Reads the dictionary from an istream. + Trie(std::istream& is) { + bc_ = BcType(is); + terminal_flags_ = BitVector(is); + tail_ = Vector(is); + boundary_flags_ = BitVector(is); + alphabet_ = Vector(is); + is.read(reinterpret_cast(table_), 512); + num_keys_ = read_value(is); + max_length_ = read_value(is); + binary_mode_ = read_value(is); } - /* - * Generic destructor. - * */ + // Generic destructor. ~Trie() {} - /* - * Lookups the ID of a given key. - * @param key: key pointer. - * @param length: key length. - * @returns the ID if the query is registered; otherwise returns kNotFound. - * */ + // Lookups the ID of a given key. If the key is not registered, returns kNotFound. id_type lookup(const uint8_t* key, size_t length) const { + size_t pos = 0; id_type node_id = 0; - size_t i = 0; while (!bc_.is_leaf(node_id)) { - if (i == length) { - return terminal_flags_[node_id] ? to_string_id_(node_id) : kNotFound; + if (pos == length) { + return terminal_flags_[node_id] ? to_key_id_(node_id) : kNotFound; } - const auto child_id = bc_.base(node_id) ^table_[key[i++]]; + + const auto child_id = bc_.base(node_id) ^table_[key[pos++]]; if (bc_.check(child_id) != node_id) { return kNotFound; } + node_id = child_id; } - if (match_(key + i, length - i, bc_.link(node_id))) { - return to_string_id_(node_id); + size_t tail_pos = bc_.link(node_id); + if (!match_(key, length, pos, tail_pos)) { + return kNotFound; } - return kNotFound; + + return to_key_id_(node_id); } - /* - * Decodes the key associated with a given ID. - * @param id: ID. - * @param[out] ret: the decoded key. - * @returns whether the given ID is within the range or not. - * */ - bool access(id_type id, std::vector& ret) const { + // Decodes the key associated with a given ID. The decoded key is appended to 'ret' and its + // length is returned. + size_t access(id_type id, std::vector& ret) const { if (num_keys_ <= id) { - return false; + return 0; } - ret.reserve(ret.size() + max_length_); + auto orig_size = ret.size(); + ret.reserve(orig_size + max_length_); auto node_id = to_node_id_(id); - auto link = bc_.is_leaf(node_id) ? bc_.link(node_id) : kNotFound; + auto tail_pos = bc_.is_leaf(node_id) ? bc_.link(node_id) : kNotFound; while (node_id) { const auto parent_id = bc_.check(node_id); - ret.emplace_back(edge_(parent_id, node_id)); + ret.push_back(edge_(parent_id, node_id)); node_id = parent_id; } - std::reverse(std::begin(ret), std::end(ret)); + std::reverse(std::begin(ret) + orig_size, std::end(ret)); - if (link != 0 && link != kNotFound) { + if (tail_pos != 0 && tail_pos != kNotFound) { if (binary_mode_) { do { - ret.push_back(tail_[link]); - } while (!boundary_flags_[link++]); + ret.push_back(tail_[tail_pos]); + } while (!boundary_flags_[tail_pos++]); } else { do { - ret.push_back(tail_[link++]); - } while (tail_[link]); + ret.push_back(tail_[tail_pos++]); + } while (tail_[tail_pos]); } } - return true; + return ret.size() - orig_size; } - /* - * Enumerates the IDs of keys included as prefixes of a given key. - * @param key: key pointer. - * @param length: key length. - * @param[out] ids: IDs of matched keys. - * @param limit: the maximum number of matched keys (optional). - * @returns the number of matched keys. - * */ + // Returns the IDs of keys included as prefixes of a given key. The IDs are appended to 'ids' and + // the number is returned. By using 'limit', you can restrict the maximum number of returned IDs. size_t common_prefix_lookup(const uint8_t* key, size_t length, std::vector& ids, - size_t limit = kDefaultLimit) const { + size_t limit = std::numeric_limits::max()) const { if (limit == 0) { return 0; } + size_t pos = 0, count = 0; id_type node_id = 0; - size_t i = 0, num_ids = 0; while (!bc_.is_leaf(node_id)) { if (terminal_flags_[node_id]) { - ids.push_back(to_string_id_(node_id)); - ++num_ids; - if (num_ids == limit) { - return num_ids; + ids.push_back(to_key_id_(node_id)); + if (limit <= ++count) { + return count; } } - if (i == length) { - return num_ids; + + if (pos == length) { + return count; } - const auto child_id = bc_.base(node_id) ^table_[key[i++]]; - + const auto child_id = bc_.base(node_id) ^table_[key[pos++]]; if (bc_.check(child_id) != node_id) { - return num_ids; + return count; } node_id = child_id; } - if (match_(key + i, length - i, bc_.link(node_id))) { - ids.push_back(to_string_id_(node_id)); - ++num_ids; + size_t tail_pos = bc_.link(node_id); + if (match_(key, length, pos, tail_pos)) { + ids.push_back(to_key_id_(node_id)); + ++count; } - return num_ids; + return count; } - /* - * Enumerates the IDs of keys starting with a given key. - * @param key: key pointer. - * @param length: key length. - * @param[out] ids: IDs of matched keys. - * @param limit: the maximum number of matched keys (optional). - * @returns the number of matched keys. - * */ + // Returns the IDs of keys starting with a given key. The IDs are appended to 'ids' and the + // number is returned. By using 'limit', you can restrict the maximum number of returned IDs. size_t predictive_lookup(const uint8_t* key, size_t length, std::vector& ids, - size_t limit = kDefaultLimit) const { + size_t limit = std::numeric_limits::max()) const { if (limit == 0) { return 0; } + size_t pos = 0; id_type node_id = 0; - size_t i = 0; - for (; i < length; ++i) { + for (; pos < length; ++pos) { if (bc_.is_leaf(node_id)) { - if (prefix_match_(key + i, length - i, bc_.link(node_id))) { - ids.push_back(to_string_id_(node_id)); - return 1; + size_t tail_pos = bc_.link(node_id); + if (!prefix_match_(key, length, pos, tail_pos)) { + return 0; } - return 0; + + ids.push_back(to_key_id_(node_id)); + return 1; } - const auto child_id = bc_.base(node_id) ^table_[key[i]]; + const auto child_id = bc_.base(node_id) ^table_[key[pos]]; if (bc_.check(child_id) != node_id) { return 0; } + node_id = child_id; } - size_t num_ids = 0; - enumerate_ids_(node_id, ids, num_ids, limit); - return num_ids; + size_t count = 0; + + std::vector> stack; + stack.push_back({node_id, pos}); + + while (!stack.empty()) { + node_id = stack.back().first; + pos = stack.back().second; + stack.pop_back(); + + if (bc_.is_leaf(node_id)) { + ids.push_back(to_key_id_(node_id)); + if (limit <= ++count) { + break; + } + } else { + if (terminal_flags_[node_id]) { + ids.push_back(to_key_id_(node_id)); + if (limit <= ++count) { + break; + } + } + + const auto base = bc_.base(node_id); + for (const auto label : alphabet_) { + const auto child_id = base ^table_[label]; + if (bc_.check(child_id) == node_id) { + stack.push_back({child_id, pos + 1}); + } + } + } + } + + return count; } - /* - * Gets the number of keys in the dictionary. - * @returns the number of keys in the dictionary. - * */ + // Gets the number of registered keys in the dictionary size_t num_keys() const { return num_keys_; } - /* - * Gets the size of alphabet drawing keys in the dictionary. - * @returns the alphabet size. - * */ + // Gets the maximum length of registered keys + size_t max_length() const { + return max_length_; + } + + // Gets the binary mode + bool is_binary_mode() const { + return binary_mode_; + } + + // Gets the size of alphabet drawing keys in the dictionary. size_t alphabet_size() const { return alphabet_.size(); } - /* - * Gets the number of nodes assigned by arranging nodes. - * The result is the same as num_used_nodes() + num_free_nodes(). - * @returns the number of the nodes. - * */ + // Gets the number of nodes including free nodes. size_t num_nodes() const { return bc_.num_nodes(); } - /* - * Gets the number of nodes in the original trie. - * @returns the number of the nodes. - * */ + // Gets the number of nodes in the original trie. size_t num_used_nodes() const { return bc_.num_used_nodes(); } - /* - * Gets the number of nodes corresponding to empty elements. - * @returns the number of the nodes. - * */ + // Gets the number of free nodes corresponding to empty elements. size_t num_free_nodes() const { return bc_.num_free_nodes(); } - /* - * Computes the size of the structure in bytes. - * @returns the dictionary size in bytes. - * */ + // Computes the output dictionary size in bytes. size_t size_in_bytes() const { size_t ret = 0; ret += bc_.size_in_bytes(); @@ -257,21 +249,11 @@ public: ret += sizeof(table_); ret += sizeof(num_keys_); ret += sizeof(max_length_); + ret += sizeof(binary_mode_); return ret; } - /* - * Gets the binary mode. - * @returns the binary mode. - * */ - bool is_binary_mode() const { - return binary_mode_; - } - - /* - * Reports the dictionary statistics into an ostream. - * @param os: the ostream. - * */ + // Reports the dictionary statistics into an ostream. void show_stat(std::ostream& os) const { const auto total_size = size_in_bytes(); os << "basic statistics of xcdat::Trie" << std::endl; @@ -289,70 +271,39 @@ public: bc_.show_stat(os); } - /* - * Writes the dictionary into an ostream. - * @param os: the ostream. - * */ + // Writes the dictionary into an ostream. void write(std::ostream& os) const { bc_.write(os); terminal_flags_.write(os); tail_.write(os); boundary_flags_.write(os); alphabet_.write(os); - write_value(table_, os); + os.write(reinterpret_cast(table_), 512); write_value(num_keys_, os); write_value(max_length_, os); + write_value(binary_mode_, os); } - /* - * Reads the dictionary from an istream. - * @param is: the istream. - * */ - void read(std::istream& is) { - bc_.read(is); - terminal_flags_.read(is); - tail_.read(is); - boundary_flags_.read(is); - alphabet_.read(is); - read_value(table_, is); - read_value(num_keys_, is); - read_value(max_length_, is); - } - - /* - * Swaps the dictionary. - * @param rhs: the dictionary to be swapped. - * */ - void swap(Type& rhs) { - bc_.swap(rhs.bc_); - terminal_flags_.swap(rhs.terminal_flags_); - tail_.swap(rhs.tail_); - boundary_flags_.swap(rhs.boundary_flags_); - alphabet_.swap(rhs.alphabet_); - table_.swap(rhs.table_); - std::swap(num_keys_, rhs.num_keys_); - std::swap(max_length_, rhs.max_length_); - } - - /* - * Disallows copy and assignment. - * */ + // Disallows copy and assignment. Trie(const Trie&) = delete; Trie& operator=(const Trie&) = delete; + Trie(Trie&&) = default; + Trie& operator=(Trie&&) = default; + private: BcType bc_; BitVector terminal_flags_; Vector tail_; - BitVector boundary_flags_; // if binary_mode_ + BitVector boundary_flags_; // used if binary_mode_ == true Vector alphabet_; - std::array table_; // table[table[c] + 256] = c + uint8_t table_[512]; // table[table[c] + 256] = c size_t num_keys_ = 0; size_t max_length_ = 0; bool binary_mode_ = false; - id_type to_string_id_(id_type node_id) const { + id_type to_key_id_(id_type node_id) const { return terminal_flags_.rank(node_id); }; @@ -364,78 +315,68 @@ private: return table_[static_cast(bc_.base(node_id) ^ child_id) + 256]; } - bool match_(const uint8_t* key, size_t length, id_type link) const { - if (link == 0) { - return length == 0; + bool match_(const uint8_t* key, size_t length, size_t pos, size_t tail_pos) const { + assert(pos <= length); + + if (pos == length) { + return tail_pos == 0; } if (binary_mode_) { - for (size_t i = 0; i < length;) { - if (tail_[link] != key[i++]) { + do { + if (key[pos] != tail_[tail_pos]) { return false; } - if (boundary_flags_[link++]) { - return i == length; + ++pos; + if (boundary_flags_[tail_pos]) { + return pos == length; } - } + ++tail_pos; + } while (pos < length); return false; } else { - auto tail = tail_.data() + link; - for (size_t i = 0; i < length; ++i) { - if (tail[i] == '\0' || key[i] != tail[i]) { + do { + if (!tail_[tail_pos] || key[pos] != tail_[tail_pos]) { return false; } - } - return tail[length] == '\0'; + ++pos; + ++tail_pos; + } while (pos < length); + return !tail_[tail_pos]; } } - bool prefix_match_(const uint8_t* key, size_t length, id_type link) const { - if (link == 0) { - return length == 0; + bool prefix_match_(const uint8_t* key, size_t length, size_t pos, size_t tail_pos) const { + assert(pos < length); + + if (tail_pos == 0) { + return false; } if (binary_mode_) { - for (size_t i = 0; i < length;) { - if (tail_[link] != key[i++]) { + do { + if (key[pos] != tail_[tail_pos]) { return false; } - if (boundary_flags_[link++]) { - return i == length; + ++pos; + if (boundary_flags_[tail_pos]) { + return pos == length; } - } + ++tail_pos; + } while (pos < length); } else { - auto tail = tail_.data() + link; - for (size_t i = 0; i < length; ++i) { - if (tail[i] == '\0' || key[i] != tail[i]) { + do { + if (key[pos] != tail_[tail_pos] || !tail_[tail_pos]) { return false; } - } + ++pos; + ++tail_pos; + } while (pos < length); } - return true; } - void enumerate_ids_(id_type node_id, std::vector& ids, - size_t& num_ids, size_t limit) const { - if (terminal_flags_[node_id]) { - ids.push_back(to_string_id_(node_id)); - ++num_ids; - if (bc_.is_leaf(node_id)) { - return; - } - } - const auto base = bc_.base(node_id); - for (const auto label : alphabet_) { - if (num_ids == limit) { - break; - } - const auto child_id = base ^table_[label]; - if (bc_.check(child_id) == node_id) { - enumerate_ids_(child_id, ids, num_ids, limit); - } - } - } + friend class TrieBuilder; }; } //namespace - xcdat diff --git a/src/TrieBuilder.cpp b/src/TrieBuilder.cpp index d09cb93..22a4097 100644 --- a/src/TrieBuilder.cpp +++ b/src/TrieBuilder.cpp @@ -3,13 +3,12 @@ namespace xcdat { -TrieBuilder::TrieBuilder(const std::vector>& keys, - id_type width_L1, bool binary_mode) +TrieBuilder::TrieBuilder(const std::vector& keys, id_type width_L1, bool binary_mode) : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), binary_mode_(binary_mode) { if (keys_.empty()) { throw TrieBuilder::Exception("The input data is empty."); } - if (kIdUpper < keys_.size()) { + if (kIdMax < keys_.size()) { throw TrieBuilder::Exception("Key ID range error."); } @@ -18,6 +17,7 @@ TrieBuilder::TrieBuilder(const std::vector>& k while (init_capacity < keys_.size()) { init_capacity <<= 1; } + bc_.reserve(init_capacity); leaf_flags_.reserve(init_capacity); term_flags_.reserve(init_capacity); @@ -40,7 +40,7 @@ TrieBuilder::TrieBuilder(const std::vector>& k bc_[0].check = 255; for (id_type i = 0; i < 256; i += block_size_) { - heads_.emplace_back(i); + heads_.push_back(i); } use_(0); @@ -55,24 +55,18 @@ TrieBuilder::TrieBuilder(const std::vector>& k void TrieBuilder::build_table_() { using tb_type = std::pair; - std::array table_builder; + tb_type table_builder[256]; for (uint32_t i = 0; i < 256; ++i) { table_builder[i] = {static_cast(i), 0}; } - auto char_count = [&](const std::pair& key) { - for (size_t i = 0; i < key.second; ++i) { - ++table_builder[key.first[i]].second; + max_length_ = 0; + for (size_t i = 0; i < keys_.size(); ++i) { + for (size_t j = 0; j < keys_[i].length; ++j) { + ++table_builder[keys_[i].ptr[j]].second; } - }; - - char_count(keys_[0]); - max_length_ = keys_[0].second; - - for (size_t i = 1; i < keys_.size(); ++i) { - char_count(keys_[i]); - max_length_ = std::max(max_length_, keys_[i].second); + max_length_ = std::max(max_length_, keys_[i].length); } if (table_builder[0].second) { // including '\0' @@ -87,7 +81,9 @@ void TrieBuilder::build_table_() { alphabet_.shrink_to_fit(); std::sort(std::begin(table_builder), std::end(table_builder), - [](const tb_type& lhs, const tb_type& rhs) { return lhs.second > rhs.second; }); + [](const tb_type& lhs, const tb_type& rhs) { + return lhs.second > rhs.second; + }); for (uint32_t i = 0; i < 256; ++i) { table_[table_builder[i].first] = static_cast(i); @@ -99,7 +95,7 @@ void TrieBuilder::build_table_() { } void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node_id) { - if (keys_[begin].second == depth) { + if (keys_[begin].length == depth) { term_flags_.set_bit(node_id, true); if (++begin == end) { // without link? bc_[node_id].base = 0; // with an empty suffix @@ -110,15 +106,15 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node term_flags_.set_bit(node_id, true); leaf_flags_.set_bit(node_id, true); auto& key = keys_[begin]; - suffixes_.push_back({{key.first + depth, key.second - depth}, node_id}); + suffixes_.push_back({{key.ptr + depth, key.length - depth}, node_id}); return; } { // fetching edges edges_.clear(); - auto label = keys_[begin].first[depth]; + auto label = keys_[begin].ptr[depth]; for (auto str_id = begin + 1; str_id < end; ++str_id) { - const auto _label = keys_[str_id].first[depth]; + const auto _label = keys_[str_id].ptr[depth]; if (label != _label) { if (_label < label) { throw TrieBuilder::Exception("The input data is not in lexicographical order."); @@ -145,9 +141,9 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node // following the children auto _begin = begin; - auto label = keys_[begin].first[depth]; + auto label = keys_[begin].ptr[depth]; for (auto _end = begin + 1; _end < end; ++_end) { - const auto _label = keys_[_end].first[depth]; + const auto _label = keys_[_end].ptr[depth]; if (label != _label) { build_bc_(_begin, _end, depth + 1, base ^ table_[label]); label = _label; @@ -159,10 +155,11 @@ void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, id_type node // The algorithm is inspired by marisa-trie void TrieBuilder::build_tail_() { - auto cmp = [](const Suffix& lhs, const Suffix& rhs) { - return std::lexicographical_compare(lhs.rbegin(), lhs.rend(), rhs.rbegin(), rhs.rend()); - }; - std::sort(std::begin(suffixes_), std::end(suffixes_), cmp); + std::sort(std::begin(suffixes_), std::end(suffixes_), + [](const Suffix& lhs, const Suffix& rhs) { + return std::lexicographical_compare(lhs.rbegin(), lhs.rend(), + rhs.rbegin(), rhs.rend()); + }); // For empty suffixes tail_.emplace_back('\0'); @@ -190,7 +187,7 @@ void TrieBuilder::build_tail_() { } else { // append bc_[cur.node_id].base = static_cast(tail_.size()); for (size_t j = 0; j < cur.length(); ++j) { - tail_.push_back(cur.string.first[j]); + tail_.push_back(cur.str.ptr[j]); } if (binary_mode_) { for (size_t j = 1; j < cur.length(); ++j) { @@ -200,18 +197,16 @@ void TrieBuilder::build_tail_() { } else { tail_.emplace_back('\0'); } - if (kIdUpper < tail_.size()) { + if (kIdMax < tail_.size()) { throw TrieBuilder::Exception("TAIL address range error."); } } prev = &cur; } - - tail_.shrink_to_fit(); } void TrieBuilder::expand_() { - if (kIdUpper < bc_.size() + 256) { + if (kIdMax < bc_.size() + 256) { throw TrieBuilder::Exception("Node ID range error."); } @@ -234,7 +229,7 @@ void TrieBuilder::expand_() { } for (auto i = old_size; i < new_size; i += block_size_) { - heads_.emplace_back(i); + heads_.push_back(i); } const auto block_id = old_size / 256; diff --git a/src/TrieBuilder.hpp b/src/TrieBuilder.hpp index 6c471c0..5860a92 100644 --- a/src/TrieBuilder.hpp +++ b/src/TrieBuilder.hpp @@ -1,36 +1,50 @@ #ifndef XCDAT_TRIE_BUILDER_HPP_ #define XCDAT_TRIE_BUILDER_HPP_ -#include "BitVectorBuilder.hpp" +#include "Trie.hpp" namespace xcdat { -// prototype declaration for friend -template class Trie; - -/* - * Double-array trie builder. - * */ +// Double-array trie builder. class TrieBuilder { public: - friend class Trie; - friend class Trie; - // for avoiding undefined traversal static constexpr id_type kTabooId = 1; // inspired by darts-clone static constexpr id_type kFreeBlocks = 16; - TrieBuilder(const std::vector>& keys, - id_type width_L1, bool binary_mode); - ~TrieBuilder() {} + struct Key { + const uint8_t* ptr; + size_t length; + }; - /* - * Exception class for xcdat::TrieBuilder - * */ + // Builds the dictionary from given string keys. The keys must be sorted in lexicographical order + // without duplication. Any error in construction is reported by TrieBuilder::Exception. If the + // keys include the ASCII zero code, pass binary_mode = true. + template + static Trie build(const std::vector& keys, bool binary_mode = false) { + TrieBuilder builder(keys, Trie::BcType::kWidthL1, binary_mode); + + Trie trie; + + trie.bc_ = typename Trie::BcType(builder.bc_, builder.leaf_flags_); + trie.terminal_flags_ = BitVector(builder.term_flags_, true, true); + trie.tail_ = Vector(builder.tail_); + trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false); + trie.alphabet_ = builder.alphabet_; + std::swap(trie.table_, builder.table_); + + trie.num_keys_ = keys.size(); + trie.max_length_ = builder.max_length_; + trie.binary_mode_ = builder.binary_mode_; + + return trie; + } + + // Exception class for xcdat::TrieBuilder class Exception : public std::exception { public: - explicit Exception(const std::string& message) : message_(message) {} + explicit Exception(std::string message) : message_(message) {} virtual ~Exception() throw() {} // overrides what() of std::exception. @@ -47,25 +61,25 @@ public: private: struct Suffix { - std::pair string; + Key str; id_type node_id; size_t length() const { - return string.second; + return str.length; } uint8_t operator[](size_t i) const { - return string.first[length() - i - 1]; + return str.ptr[length() - i - 1]; } std::reverse_iterator rbegin() const { - return std::reverse_iterator(string.first + string.second); + return std::reverse_iterator(str.ptr + str.length); } std::reverse_iterator rend() const { - return std::reverse_iterator(string.first); + return std::reverse_iterator(str.ptr); } }; - const std::vector>& keys_; + const std::vector& keys_; const id_type block_size_; const id_type width_L1_; @@ -74,12 +88,10 @@ private: std::vector bc_; BitVectorBuilder leaf_flags_; BitVectorBuilder term_flags_; - std::vector tail_; BitVectorBuilder boundary_flags_; - std::vector alphabet_; - std::array table_; + uint8_t table_[512]; std::vector used_flags_; std::vector edges_; @@ -88,6 +100,9 @@ private: size_t max_length_ = 0; + TrieBuilder(const std::vector& keys, id_type width_L1, bool binary_mode); + ~TrieBuilder() {} + void build_table_(); void build_bc_(size_t begin, size_t end, size_t depth, id_type node_id); void build_tail_(); diff --git a/src/Vector.hpp b/src/Vector.hpp index a11c369..835ba86 100644 --- a/src/Vector.hpp +++ b/src/Vector.hpp @@ -5,33 +5,37 @@ #ifndef XCDAT_VECTOR_HPP #define XCDAT_VECTOR_HPP -#include "xcdatBasics.hpp" +#include "xcdat_basics.hpp" namespace xcdat { -/* - * Simple vector - * */ +// Simple vector template class Vector { public: Vector() { - static_assert(!Is_same(), "Type bool is not supported."); - static_assert(Is_pod(), "T is not POD."); + static_assert(!std::is_same::value, "Type bool is not supported."); + static_assert(std::is_pod::value, "T is not POD."); } - ~Vector() {} + Vector(std::istream& is) { + size_ = read_value(is); + vec_.resize(size_); + is.read(reinterpret_cast(&vec_[0]), sizeof(T) * size_); + data_ = vec_.data(); + } - void steal(std::vector& vec) { - Vector().swap(*this); + Vector(std::vector& vec) { if (vec.size() != vec.capacity()) { vec.shrink_to_fit(); } - buf_.swap(vec); - data_ = buf_.data(); - size_ = buf_.size(); + vec_ = std::move(vec); + data_ = vec_.data(); + size_ = vec_.size(); } + ~Vector() {} + const T& operator[](size_t i) const { return data_[i]; } @@ -63,27 +67,16 @@ public: os.write(reinterpret_cast(data_), sizeof(T) * size_); } - void read(std::istream& is) { - Vector().swap(*this); - read_value(size_, is); - buf_.resize(size_); - is.read(reinterpret_cast(&buf_[0]), sizeof(T) * size_); - data_ = buf_.data(); - } - - void swap(Vector& rhs) { - std::swap(data_, rhs.data_); - std::swap(size_, rhs.size_); - buf_.swap(rhs.buf_); - } - Vector(const Vector&) = delete; Vector& operator=(const Vector&) = delete; + Vector(Vector&&) = default; + Vector& operator=(Vector&&) = default; + private: const T* data_ = nullptr; size_t size_ = 0; - std::vector buf_; + std::vector vec_; }; } diff --git a/src/testTrie.cpp b/src/testTrie.cpp index 32aeffd..9eea59c 100644 --- a/src/testTrie.cpp +++ b/src/testTrie.cpp @@ -6,7 +6,7 @@ #include #include -#include "Trie.hpp" +#include "TrieBuilder.hpp" using namespace xcdat; @@ -15,6 +15,8 @@ namespace { constexpr size_t kNumStrings = 1U << 10; constexpr size_t kMaxLength = 20; +using Key = TrieBuilder::Key; + void to_set(std::vector& keys) { std::sort(std::begin(keys), std::end(keys)); keys.erase(std::unique(std::begin(keys), std::end(keys)), std::end(keys)); @@ -32,19 +34,20 @@ std::string make_key() { return key; } -void make_keys(std::vector& keys) { - keys.clear(); +std::vector make_keys() { + std::vector keys; keys.reserve(kNumStrings); for (size_t i = 0; i < kNumStrings; ++i) { - keys.emplace_back(make_key()); + keys.push_back(make_key()); } to_set(keys); + return keys; } -void make_other_keys(const std::vector& keys, std::vector& others) { - others.clear(); +std::vector make_other_keys(const std::vector& keys) { + std::vector others; for (size_t i = 0; i < kNumStrings; ++i) { auto string = make_key(); @@ -54,47 +57,47 @@ void make_other_keys(const std::vector& keys, std::vector -void test_build(Trie& trie, const std::vector>& keys, - bool binary_mode) { +template +void test_build(Trie& trie, const std::vector& keys, bool binary_mode) { std::cerr << "Construction -> build()" << std::endl; - Trie(keys, binary_mode).swap(trie); + trie = TrieBuilder::build(keys, binary_mode); assert(trie.num_keys() == keys.size()); } -template -void test_basic_operations(const Trie& trie, - const std::vector>& keys, - const std::vector>& others) { +template +void test_basic_operations(const Trie& trie, const std::vector& keys, + const std::vector& others) { std::cerr << "Basic operations -> lookup() and access()" << std::endl; for (auto& key : keys) { - const auto id = trie.lookup(key.first, key.second); + const auto id = trie.lookup(key.ptr, key.length); assert(id != kNotFound); + std::vector ret; - assert(trie.access(id, ret)); - assert(ret.size() == key.second); - assert(std::memcmp(ret.data(), key.first, key.second) == 0); + trie.access(id, ret); + + assert(ret.size() == key.length); + assert(std::memcmp(ret.data(), key.ptr, key.length) == 0); } for (auto& other : others) { - const auto id = trie.lookup(other.first, other.second); + const auto id = trie.lookup(other.ptr, other.length); assert(id == kNotFound); } } -template -void test_prefix_operations(const Trie& trie, - const std::vector>& keys, - const std::vector>& others) { +template +void test_prefix_operations(const Trie& trie, const std::vector& keys, + const std::vector& others) { std::cerr << "Prefix operations -> common_prefix_lookup()" << std::endl; for (auto& key : keys) { std::vector ids; - auto num_ids = trie.common_prefix_lookup(key.first, key.second, ids); + auto num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids); assert(1 <= num_ids); assert(num_ids <= kMaxLength); @@ -102,65 +105,88 @@ void test_prefix_operations(const Trie& trie, for (auto id : ids) { std::vector ret; - assert(trie.access(id, ret)); - assert(ret.size() <= key.second); + trie.access(id, ret); + assert(ret.size() <= key.length); } + + auto limit = num_ids / 2; + auto new_num_ids = trie.common_prefix_lookup(key.ptr, key.length, ids, limit); + + assert(new_num_ids == limit); + assert(num_ids + new_num_ids == ids.size()); } for (auto& other : others) { std::vector ids; - auto num_ids = trie.common_prefix_lookup(other.first, other.second, ids); + auto num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids); assert(num_ids <= kMaxLength); assert(num_ids == ids.size()); for (auto id : ids) { std::vector ret; - assert(trie.access(id, ret)); - assert(ret.size() < other.second); + trie.access(id, ret); + assert(ret.size() < other.length); } + + auto limit = num_ids / 2; + auto new_num_ids = trie.common_prefix_lookup(other.ptr, other.length, ids, limit); + + assert(new_num_ids == limit); + assert(num_ids + new_num_ids == ids.size()); } } -template -void test_predictive_operations(const Trie& trie, - const std::vector>& keys, - const std::vector>& others) { +template +void test_predictive_operations(const Trie& trie, const std::vector& keys, + const std::vector& others) { std::cerr << "Predictive operations -> predictive_lookup()" << std::endl; for (auto& key : keys) { std::vector ids; - auto num_ids = trie.predictive_lookup(key.first, key.second, ids); + auto num_ids = trie.predictive_lookup(key.ptr, key.length, ids); assert(1 <= num_ids); assert(num_ids == ids.size()); for (auto id : ids) { std::vector ret; - assert(trie.access(id, ret)); - assert(key.second <= ret.size()); + trie.access(id, ret); + assert(key.length <= ret.size()); } + + auto limit = num_ids / 2; + auto new_num_ids = trie.predictive_lookup(key.ptr, key.length, ids, limit); + + assert(new_num_ids == limit); + assert(num_ids + new_num_ids == ids.size()); } for (auto& other : others) { std::vector ids; - auto num_ids = trie.predictive_lookup(other.first, other.second, ids); + auto num_ids = trie.predictive_lookup(other.ptr, other.length, ids); assert(num_ids == ids.size()); for (auto id : ids) { std::vector ret; - assert(trie.access(id, ret)); - assert(other.second < ret.size()); + trie.access(id, ret); + assert(other.length < ret.size()); } + + auto limit = num_ids / 2; + auto new_num_ids = trie.predictive_lookup(other.ptr, other.length, ids, limit); + + assert(new_num_ids == limit); + assert(num_ids + new_num_ids == ids.size()); } } -template +template void test_io(const Trie& trie) { std::cerr << "File I/O -> write() and read()" << std::endl; - const char* file_name = "test.trie"; + const char* file_name = "index"; { std::ofstream ofs{file_name}; trie.write(ofs); @@ -174,10 +200,12 @@ void test_io(const Trie& trie) { Trie _trie; { std::ifstream ifs{file_name}; - _trie.read(ifs); + _trie = Trie(ifs); } assert(trie.num_keys() == _trie.num_keys()); + assert(trie.max_length() == _trie.max_length()); + assert(trie.is_binary_mode() == _trie.is_binary_mode()); assert(trie.alphabet_size() == _trie.alphabet_size()); assert(trie.num_nodes() == _trie.num_nodes()); assert(trie.num_used_nodes() == _trie.num_used_nodes()); @@ -185,14 +213,13 @@ void test_io(const Trie& trie) { assert(trie.size_in_bytes() == _trie.size_in_bytes()); } -template -void test_trie(const std::vector>& strings, - const std::vector>& others) { +template +void test_trie(const std::vector& strings, const std::vector& others) { for (int i = 0; i < 2; ++i) { std::cerr << "** " << (i % 2 ? "Binary" : "Text") << " Mode **" << std::endl; std::cerr << "Testing xcdat::Trie<" << (Fast ? "true" : "false") << ">" << std::endl; Trie trie; - test_build(trie, strings, i % 2 == 0); + test_build(trie, strings, i % 2 != 0); test_basic_operations(trie, strings, others); test_prefix_operations(trie, strings, others); test_predictive_operations(trie, strings, others); @@ -204,19 +231,16 @@ void test_trie(const std::vector>& strings, } // namespace int main() { - std::vector keys_buffer; - make_keys(keys_buffer); + auto keys_buffer = make_keys(); + auto others_buffer = make_other_keys(keys_buffer); - std::vector others_buffer; - make_other_keys(keys_buffer, others_buffer); - - std::vector> keys(keys_buffer.size()); + std::vector keys(keys_buffer.size()); for (size_t i = 0; i < keys.size(); ++i) { keys[i] = {reinterpret_cast(keys_buffer[i].c_str()), keys_buffer[i].length()}; } - std::vector> others(others_buffer.size()); + std::vector others(others_buffer.size()); for (size_t i = 0; i < others.size(); ++i) { others[i] = {reinterpret_cast(others_buffer[i].c_str()), others_buffer[i].length()}; diff --git a/src/testVector.cpp b/src/testVector.cpp index 22fb57b..9750ea1 100644 --- a/src/testVector.cpp +++ b/src/testVector.cpp @@ -28,7 +28,7 @@ void test_bit_vector() { for (size_t i = 0; i < kSize; ++i) { builder.push_back(orig_bit_vector[i]); } - BitVector(builder, true, true).swap(bit_vector); + bit_vector = BitVector(builder, true, true); } assert(bit_vector.size() == kSize); diff --git a/src/xcdat.cpp b/src/xcdat.cpp index 2a74d32..48d2d21 100644 --- a/src/xcdat.cpp +++ b/src/xcdat.cpp @@ -2,7 +2,7 @@ #include #include -#include "Trie.hpp" +#include "TrieBuilder.hpp" using namespace xcdat; @@ -10,12 +10,14 @@ namespace { constexpr uint32_t kRuns = 10; -enum class Times { - SEC, MILLI, MICRO -}; +using Key = TrieBuilder::Key; class StopWatch { public: + enum Times { + SEC, MILLI, MICRO + }; + StopWatch() : tp_(std::chrono::high_resolution_clock::now()) {} ~StopWatch() {} @@ -57,8 +59,7 @@ size_t read_keys(const char* file_name, std::vector& keys) { return size; } -void extract_pairs(const std::vector& keys, - std::vector>& pairs) { +void extract_pairs(const std::vector& keys, std::vector& pairs) { pairs.clear(); pairs.resize(keys.size()); for (size_t i = 0; i < keys.size(); ++i) { @@ -70,7 +71,7 @@ void show_usage(std::ostream& os) { os << "xcdat build " << std::endl; os << "\t\t'1' for DACs; '2' for FDACs." << std::endl; os << "\t \tinput file of a set of keys." << std::endl; - os << "\t\toutput file for storing the dictionary." << std::endl; + os << "\t\toutput file of the dictionary." << std::endl; os << "xcdat query " << std::endl; os << "\t \t'1' for DACs; '2' for FDACs." << std::endl; os << "\t \tinput file of the dictionary." << std::endl; @@ -88,22 +89,22 @@ int build(std::vector& args) { return 1; } - std::vector keys; - auto raw_size = read_keys(args[2].c_str(), keys); + std::vector strs; + auto raw_size = read_keys(args[2].c_str(), strs); if (raw_size == 0) { std::cerr << "open error : " << args[2] << std::endl; return 1; } - std::vector> pairs; - extract_pairs(keys, pairs); + std::vector keys; + extract_pairs(strs, keys); Trie trie; try { StopWatch sw; - Trie(pairs).swap(trie); - std::cout << "constr. time: " << sw(Times::SEC) << " sec" << std::endl; + trie = TrieBuilder::build(keys); + std::cout << "constr. time: " << sw(StopWatch::SEC) << " sec" << std::endl; } catch (const xcdat::TrieBuilder::Exception& ex) { std::cerr << ex.what() << std::endl; return 1; @@ -138,7 +139,7 @@ int query(std::vector& args) { std::cerr << "open error : " << args[2] << std::endl; return 1; } - trie.read(ifs); + trie = Trie(ifs); } size_t limit = 10; @@ -172,20 +173,26 @@ int query(std::vector& args) { ids.clear(); trie.common_prefix_lookup(key, length, ids); std::cout << ids.size() << " found" << std::endl; + for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { buf.clear(); trie.access(ids[i], buf); - std::cout << ids[i] << '\t' << buf.data() << std::endl; + std::cout << ids[i] << '\t'; + std::cout.write(reinterpret_cast(buf.data()), buf.size()); + std::cout << std::endl; } std::cout << "predictive_lookup()" << std::endl; ids.clear(); trie.predictive_lookup(key, length, ids); std::cout << ids.size() << " found" << std::endl; + for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { buf.clear(); trie.access(ids[i], buf); - std::cout << ids[i] << '\t' << buf.data() << std::endl; + std::cout << ids[i] << '\t'; + std::cout.write(reinterpret_cast(buf.data()), buf.size()); + std::cout << std::endl; } } @@ -206,21 +213,21 @@ int bench(std::vector& args) { std::cerr << "open error : " << args[2] << std::endl; return 1; } - trie.read(ifs); + trie = Trie(ifs); } - std::vector keys; - if (read_keys(args[3].c_str(), keys) == 0) { + std::vector strs; + if (read_keys(args[3].c_str(), strs) == 0) { std::cerr << "open error : " << args[3] << std::endl; return 1; } - std::vector> pairs; - extract_pairs(keys, pairs); + std::vector keys; + extract_pairs(strs, keys); - std::vector ids(pairs.size()); - for (size_t i = 0; i < pairs.size(); ++i) { - ids[i] = trie.lookup(pairs[i].first, pairs[i].second); + std::vector ids(keys.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ids[i] = trie.lookup(keys[i].ptr, keys[i].length); } { @@ -228,15 +235,15 @@ int bench(std::vector& args) { StopWatch sw; for (uint32_t r = 0; r < kRuns; ++r) { - for (size_t i = 0; i < pairs.size(); ++i) { - if (trie.lookup(pairs[i].first, pairs[i].second) == kNotFound) { - std::cerr << "Failed to lookup " << keys[i] << std::endl; + for (size_t i = 0; i < keys.size(); ++i) { + if (trie.lookup(keys[i].ptr, keys[i].length) == kNotFound) { + std::cerr << "Failed to lookup " << strs[i] << std::endl; return 1; } } } - std::cout << sw(Times::MICRO) / kRuns / pairs.size() << " us per str" << std::endl; + std::cout << sw(StopWatch::MICRO) / kRuns / keys.size() << " us per str" << std::endl; } { @@ -253,7 +260,7 @@ int bench(std::vector& args) { } } - std::cout << sw(Times::MICRO) / kRuns / ids.size() << " us per ID" << std::endl; + std::cout << sw(StopWatch::MICRO) / kRuns / ids.size() << " us per ID" << std::endl; } return 0; diff --git a/src/xcdatBasics.hpp b/src/xcdat_basics.hpp similarity index 65% rename from src/xcdatBasics.hpp rename to src/xcdat_basics.hpp index 9032168..89405a6 100644 --- a/src/xcdatBasics.hpp +++ b/src/xcdat_basics.hpp @@ -11,24 +11,17 @@ #include #include +#include "xcdat_config.hpp" + namespace xcdat { -#ifdef XCDAT64 +#ifdef XCDAT_X64 using id_type = uint64_t; #else using id_type = uint32_t; #endif -constexpr id_type kIdUpper = std::numeric_limits::max(); - -template -using Conditional = typename std::conditional::type; - -template -inline constexpr bool Is_same() { return std::is_same::value; } - -template -inline constexpr bool Is_pod() { return std::is_pod::value; } +constexpr id_type kIdMax = std::numeric_limits::max(); struct BcPair { id_type base; @@ -44,7 +37,7 @@ inline void show_size(const char* str, size_t size, std::ostream& os) { } inline void show_size_ratio(const char* str, size_t size, size_t denom, std::ostream& os) { - os << str << "\t" << size << "\t" << (double) size / denom << std::endl; + os << str << "\t" << size << "\t" << static_cast(size) / denom << std::endl; } template @@ -53,8 +46,10 @@ inline void write_value(const T val, std::ostream& os) { } template -inline void read_value(T& val, std::istream& is) { +inline T read_value(std::istream& is) { + T val; is.read(reinterpret_cast(&val), sizeof(val)); + return val; } } //namespace - xcdat diff --git a/xcdat_config.hpp.in b/xcdat_config.hpp.in new file mode 100644 index 0000000..f084264 --- /dev/null +++ b/xcdat_config.hpp.in @@ -0,0 +1,7 @@ +#ifndef XCDAT_CONFIG_HPP +#define XCDAT_CONFIG_HPP + +#cmakedefine XCDAT_X64 +#cmakedefine XCDAT_USE_POPCNT + +#endif // XCDAT_CONFIG_HPP \ No newline at end of file