diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..a086fb2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,20 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(lldb) test_trie", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/test/test_trie", + "args": [], + "stopAtEntry": false, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "lldb" + }, + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..34447d4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,77 @@ +{ + "C_Cpp.errorSquiggles": "Disabled", + "files.associations": { + "__bit_reference": "cpp", + "__config": "cpp", + "__debug": "cpp", + "__errc": "cpp", + "__functional_base": "cpp", + "__hash_table": "cpp", + "__locale": "cpp", + "__mutex_base": "cpp", + "__node_handle": "cpp", + "__nullptr": "cpp", + "__split_buffer": "cpp", + "__string": "cpp", + "__threading_support": "cpp", + "__tree": "cpp", + "__tuple": "cpp", + "algorithm": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "cmath": "cpp", + "complex": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "exception": "cpp", + "fstream": "cpp", + "functional": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "iterator": "cpp", + "limits": "cpp", + "locale": "cpp", + "map": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + 
"numeric": "cpp", + "optional": "cpp", + "ostream": "cpp", + "random": "cpp", + "ratio": "cpp", + "set": "cpp", + "sstream": "cpp", + "stack": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "utility": "cpp", + "vector": "cpp", + "__functional_03": "cpp", + "filesystem": "cpp" + } +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..446a3c5 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,27 @@ +{ + "tasks": [ + { + "type": "cppbuild", + "label": "C/C++: clang++ build active file", + "command": "/usr/bin/clang++", + "args": [ + "-g", + "${file}", + "-o", + "${fileDirname}/${fileBasenameNoExtension}" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "detail": "Task generated by Debugger." + } + ], + "version": "2.0.0" +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 0400e02..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,61 +0,0 @@ -cmake_minimum_required(VERSION 3.1) -project(XCDAT) - -set(CMAKE_CXX_STANDARD 17) - -if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif () - -configure_file( - ${XCDAT_SOURCE_DIR}/xcdat_config.hpp.in - ${XCDAT_SOURCE_DIR}/include/xcdat/xcdat_config.hpp -) - -message(STATUS "XCDAT_SOURCE_DIR is ${XCDAT_SOURCE_DIR}") - -option(XCDAT_X64 - "Use 64-bit integers for node representation." - OFF) - -option(XCDAT_USE_POPCNT - "Use popcount intrinsic. Available on x86-64 since SSE4.2." 
- OFF) - -if (XCDAT_USE_POPCNT) - if (UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - endif () -endif () - -message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}") -message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}") -message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}") -message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}") -message(STATUS "XCDAT_X64 is ${XCDAT_X64}") -message(STATUS "XCDAT_USE_POPCNT is ${XCDAT_USE_POPCNT}") - -file(GLOB HEADER_FILES include/xcdat/*.hpp) -file(GLOB SOURCE_FILES src/*.cpp) - -include_directories(include) -add_library(xcdat STATIC ${HEADER_FILES} ${SOURCE_FILES}) - -add_subdirectory(tool) -add_subdirectory(sample) - -enable_testing() -add_subdirectory(test) - -install(FILES include/xcdat.hpp DESTINATION include) -install(FILES ${HEADER_FILES} DESTINATION include/xcdat) - -install(TARGETS xcdat - EXPORT xcdat-targets - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin) - -install(EXPORT xcdat-targets - FILE xcdat-config.cmake - DESTINATION lib/cmake/xcdat) diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d750e37..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 Shunsuke Kanda - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 4e72748..0000000 --- a/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Xcdat: xor-compressed double-array trie - -Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie. - -The double array is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article. - -> Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [[doi](https://doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://drive.google.com/file/d/1_BknOv1misIK-iUk4u9c9yZi3qmWNruf/view?usp=sharing)] - -Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage. - -## Documentation - -The document of Xcdat is [here](https://kampersanda.github.io/xcdat/). - -## Build Instructions - -You can download and compile Xcdat as the following commands. - -``` -$ git clone https://github.com/kampersanda/xcdat.git -$ cd xcdat -$ mkdir build -$ cd build -$ cmake .. 
-$ make -$ make install -``` diff --git a/docs/document.md b/docs/document.md deleted file mode 100644 index 621a850..0000000 --- a/docs/document.md +++ /dev/null @@ -1,308 +0,0 @@ -% Xcdat: XOR-compressed double-array trie -% Shunsuke Kanda -% 2017 - -## What is Xcdat? - -Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie. - -The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article. - -> Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [[doi](https://doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://sites.google.com/site/shnskknd/KAIS2016.pdf)] - -Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage. - -Xcdat is available at [GitHub repsitory](https://github.com/kampersanda/xcdat). - -## Features - -- **Compressed Data Structure**: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (but, depending on datasets). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space. -- **Two Compression Approaches**: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). 
For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions. -- **64-bit Version**: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining `XCDAT_X64`; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal. -- **NULL Character**: The dictionary can be constructed from keys including the NULL character by setting the second parameter of `xcdat::TrieBuilder::build()` to `true`. -- **Dictionary Encoding**: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string precessing and indexing, such as described in (Martínez-Prieto et al., 2016). -- **Fast Operations**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression. -- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on. - -## Build Instructions - -You can download and compile Xcdat as the following commands. - -``` -$ git clone https://github.com/kampersanda/xcdat.git -$ cd xcdat -$ mkdir build -$ cd build -$ cmake .. 
-$ make -$ make install -``` - -If you want to use a 64-bit setting, please add `-DXCDAT_X64=ON` to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding `-DXCDAT_USE_POPCNT=ON` for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS. - - -## Command Line Tools - -`xcdat` is a general-purpose command line tool to provide three modes as follows. - -``` -$ xcdat -xcdat build - 1: DACs, 2: FDACs - Input file name of a set of keys (must be sorted) - Output file name of the dictionary (optional) - If omitted, .dacs or .fdacs is output -xcdat query - 1: DACs, 2: FDACs - Input file name of the dictionary - Limit of #results (optional, default=10) -xcdat bench - 1: DACs, 2: FDACs - Input file name of the dictionary - Input file name of keys for benchmark -``` - -### Example 1: Construction - -Command `xcdat build [params...]` builds Xcdat dictionaries from a given dataset and saves it to a file, as follows. - -``` -$ xcdat build 1 jawiki-all-titles -constr. time: 1.58574 sec -cmpr. ratio: 0.524287 over the raw size - -basic statistics of xcdat::Trie - num keys: 1738995 - alphabet size: 189 - num nodes: 4042496 - num used nodes: 4034357 - num free nodes: 8139 - size in bytes: 20546967 -member size statistics of xcdat::Trie - bc: 13879098 0.675482 - terminal_flags: 708448 0.0344794 - tail: 5958655 0.290002 - boundary_flags: 40 1.94676e-06 -basic statistics of xcdat::DacBc - num links: 1499605 - bytes per node: 3.4333 -member size statistics of xcdat::DacBc - values_L0: 8085000 0.582531 - values_L1: 746760 0.0538046 - values_L2: 22581 0.00162698 - flags_L0: 1389660 0.100126 - flags_L1: 128400 0.00925132 - leaves: 694856 0.0500649 - links: 2811784 0.202591 - -output -> jawiki-all-titles.dac -``` - -### Example 2: Query Processing - -Command `xcdat query [params...]` loads a dictionary file and tests lookup operations, as follows. 
- -``` -$ xcdat query 1 jawiki-all-titles.dac -> NEW_GAME! -Lookup -125989 NEW_GAME! -Common Prefix Lookup -28 N -124185 NE -125428 NEW -125988 NEW_GAME -125989 NEW_GAME! -5 found -Predictive Lookup -125989 NEW_GAME! -126003 NEW_GAME!! -126059 NEW_GAME!_-THE_CHALLENGE_STAGE!- -3 found -``` - -### Example 3: Benchmark Test - -Command `xcdat bench [params...]` tests time performances of a given dictionary, as follows. - -``` -$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd -Warm up -Lookup benchmark on 10 runs -1.5065 us per str -Access benchmark on 10 runs -1.81289 us per ID -``` - -## Sample Usage - -The following code shows an easy routine sample. - -```cpp -#include -#include - -int main() { - std::vector keys_buf = { - "Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin", - "Hazuki", "Umiko", "Nene", "Nenecchi" - }; - - // Convert to the input format - std::vector keys(keys_buf.size()); - for (size_t i = 0; i < keys.size(); ++i) { - keys[i] = std::string_view{keys_buf[i]}; - } - - // Input data must be sorted. - std::sort(std::begin(keys), std::end(keys)); - - // Dictionary class - using Trie = xcdat::Trie; - - try { - // Builds a dictionary from the keys - Trie trie = xcdat::TrieBuilder::build(keys); // move - - // Writes the dictionary to a file. - std::ofstream ofs{"sample.bin"}; - trie.write(ofs); - } catch (const xcdat::TrieBuilder::Exception& ex) { - // Abort if something went wrong... - std::cerr << ex.what() << std::endl; - return 1; - } - - // Creates an empty dictionary - Trie trie; - { - // Reads the dictionary to the file. - std::ifstream ifs{"sample.bin"}; - trie = Trie{ifs}; // move - } - - std::cout << "Performing basic operations..." << std::endl; - { - // lookup() obtains the unique ID for a given key - xcdat::id_type key_id = trie.lookup("Rin"); - // access() decodes the key from a given ID - std::cout << key_id << " : " << trie.access(key_id) << std::endl; - - // Given an unregistered key, lookup() returns NOT_FOUND. 
- if (trie.lookup("Hotaru") == Trie::NOT_FOUND) { - std::cout << "? : " << "Hotaru" << std::endl; - } - } - - std::cout << "Performing a common prefix operation..." << std::endl; - { - // Common prefix operation is implemented using PrefixIterator, created by - // make_prefix_iterator(). - Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi"); - - // next() continues to obtain the next key until false is returned. - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - std::cout << "Performing a predictive operation..." << std::endl; - { - // Predictive operation is implemented using PredictiveIterator, created by - // make_predictive_iterator(). - Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha"); - - // next() continues to obtain the next key until false is returned in - // lexicographical order. - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - std::cout << "Enumerating all registered keys..." << std::endl; - { - // PredictiveIterator for an empty string provides enumeration of all - // registered keys in lexicographical order. - Trie::PredictiveIterator it = trie.make_predictive_iterator(""); - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - return 0; -} -``` - -The standard output is as follows. - -``` -Performing basic operations... -7 : Rin -? : Hotaru -Performing common prefix operations... -4 : Nene -6 : Nenecchi -Performing predictive operations... -3 : Hajime -5 : Hazuki -Enumerating all registered keys... -0 : Aoba -3 : Hajime -5 : Hazuki -1 : Hihumi -2 : Kou -4 : Nene -6 : Nenecchi -7 : Rin -8 : Umiko -9 : Yun -``` - -As shown in the output, `xcdat::Trie` assigns unique integer IDs to each registered key. The ID order is random, depending on node arrangement. - -## API - -You can build a dictionary using static member function `xcdat::TrieBuilder::build()`. 
-This function receives a set of keywords and returns the resulting class object of `xcdat::Trie`. -For the usage, refer to the header comments of [`xcdat::TrieBuilder.hpp`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/TrieBuilder.hpp). -Also for the usage of `xcdat::Trie`, refer to the header comments of [`xcdat::Trie`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/Trie.hpp). - -The detailed descriptions of AIP are under construction... - -## Benchmark - -Work in progress... - -## To Do - -- Show benchmarks -- Create AIP descriptions - -## Licensing - -This library is free software provided under the MIT License. - -## Citation - -If you use the library in academic settings, please cite the following paper. - -```bibtex -@article{kanda2017compressed, - title={Compressed double-array tries for string dictionaries supporting fast lookup}, - author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao}, - journal={Knowledge and Information Systems}, - volume={51}, - number={3}, - pages={1023--1042}, - year={2017}, - publisher={Springer} -} -``` - -## References - -- J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989. -- N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013. -- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. -- M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016 -- S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007. 
\ No newline at end of file diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index ede03ef..0000000 --- a/docs/index.html +++ /dev/null @@ -1,348 +0,0 @@ - - - - - - - - - Xcdat: XOR-compressed double-array trie - - - - - - -
-

Xcdat: XOR-compressed double-array trie

-

Created by Shunsuke Kanda

-
-

Contents

- -

What is Xcdat?

-

Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie.

-

The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article.

-
-

Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [doi] [pdf]

-
-

Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.

-

Xcdat is available at the GitHub repository.

-

Features

-
    -
  • Compressed Data Structure: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (but, depending on datasets). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space.
  • -
  • Two Compression Approaches: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions.
  • -
  • 64-bit Version: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining XCDAT_X64; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal.
  • -
  • NULL Character: The dictionary can be constructed from keys including the NULL character by setting the second parameter of xcdat::TrieBuilder::build() to true.
  • -
  • Dictionary Encoding: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing, such as described in (Martínez-Prieto et al., 2016).
  • -
  • Fast Operations: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
  • -
  • Prefix-based Lookup Operations: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
  • -
-

Build Instructions

-

You can download and compile Xcdat with the following commands.

-
$ git clone https://github.com/kampersanda/xcdat.git
-$ cd xcdat
-$ mkdir build
-$ cd build
-$ cmake ..
-$ make
-$ make install
-

If you want to use a 64-bit setting, please add -DXCDAT_X64=ON to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding -DXCDAT_USE_POPCNT=ON for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS.

-

Command Line Tools

-

xcdat is a general-purpose command line tool to provide three modes as follows.

-
$ xcdat 
-xcdat build <type> <key> <dict>
-    <type>  1: DACs, 2: FDACs
-    <key>   Input file name of a set of keys (must be sorted)
-    <dict>  Output file name of the dictionary (optional)
-            If omitted, <key>.dacs or <key>.fdacs is output
-xcdat query <type> <dict> <limit>
-    <type>  1: DACs, 2: FDACs
-    <dict>  Input file name of the dictionary
-    <limit> Limit of #results (optional, default=10)
-xcdat bench <type> <dict> <key>
-    <type>  1: DACs, 2: FDACs
-    <dict>  Input file name of the dictionary
-    <key>   Input file name of keys for benchmark
-

Example 1: Construction

-

Command xcdat build [params...] builds Xcdat dictionaries from a given dataset and saves it to a file, as follows.

-
$ xcdat build 1 jawiki-all-titles
-constr. time:   1.58574 sec
-cmpr. ratio:    0.524287 over the raw size
-
-basic statistics of xcdat::Trie<false>
-    num keys:       1738995
-    alphabet size:  189
-    num nodes:      4042496
-    num used nodes: 4034357
-    num free nodes: 8139
-    size in bytes:  20546967
-member size statistics of xcdat::Trie<false>
-    bc:             13879098    0.675482
-    terminal_flags: 708448  0.0344794
-    tail:           5958655 0.290002
-    boundary_flags: 40  1.94676e-06
-basic statistics of xcdat::DacBc
-    num links:      1499605
-    bytes per node: 3.4333
-member size statistics of xcdat::DacBc
-    values_L0:  8085000 0.582531
-    values_L1:  746760  0.0538046
-    values_L2:  22581   0.00162698
-    flags_L0:   1389660 0.100126
-    flags_L1:   128400  0.00925132
-    leaves:     694856  0.0500649
-    links:      2811784 0.202591
-
-output -> jawiki-all-titles.dac
-

Example 2: Query Processing

-

Command xcdat query [params...] loads a dictionary file and tests lookup operations, as follows.

-
$ xcdat query 1 jawiki-all-titles.dac
-> NEW_GAME!
-Lookup
-125989  NEW_GAME!
-Common Prefix Lookup
-28  N
-124185  NE
-125428  NEW
-125988  NEW_GAME
-125989  NEW_GAME!
-5 found
-Predictive Lookup
-125989  NEW_GAME!
-126003  NEW_GAME!!
-126059  NEW_GAME!_-THE_CHALLENGE_STAGE!-
-3 found
-

Example 3: Benchmark Test

-

Command xcdat bench [params...] tests time performances of a given dictionary, as follows.

-
$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd
-Warm up
-Lookup benchmark on 10 runs
-1.5065 us per str
-Access benchmark on 10 runs
-1.81289 us per ID
-

Sample Usage

-

The following code shows an easy routine sample.

-
#include <iostream>
-
#include <xcdat.hpp>
-
-
int main() {
-
std::vector<std::string> keys_buf = {
-
"Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin",
-
"Hazuki", "Umiko", "Nene", "Nenecchi"
-
};
-
-
// Convert to the input format
-
std::vector<std::string_view> keys(keys_buf.size());
-
for (size_t i = 0; i < keys.size(); ++i) {
-
keys[i] = std::string_view{keys_buf[i]};
-
}
-
-
// Input data must be sorted.
-
std::sort(std::begin(keys), std::end(keys));
-
-
// Dictionary class
-
using Trie = xcdat::Trie<true>;
-
-
try {
-
// Builds a dictionary from the keys
-
Trie trie = xcdat::TrieBuilder::build<true>(keys); // move
-
-
// Writes the dictionary to a file.
-
std::ofstream ofs{"sample.bin"};
-
trie.write(ofs);
-
} catch (const xcdat::TrieBuilder::Exception& ex) {
-
// Abort if something went wrong...
-
std::cerr << ex.what() << std::endl;
-
return 1;
-
}
-
-
// Creates an empty dictionary
-
Trie trie;
-
{
-
// Reads the dictionary from the file.
-
std::ifstream ifs{"sample.bin"};
-
trie = Trie{ifs}; // move
-
}
-
-
std::cout << "Performing basic operations..." << std::endl;
-
{
-
// lookup() obtains the unique ID for a given key
-
xcdat::id_type key_id = trie.lookup("Rin");
-
// access() decodes the key from a given ID
-
std::cout << key_id << " : " << trie.access(key_id) << std::endl;
-
-
// Given an unregistered key, lookup() returns NOT_FOUND.
-
if (trie.lookup("Hotaru") == Trie::NOT_FOUND) {
-
std::cout << "? : " << "Hotaru" << std::endl;
-
}
-
}
-
-
std::cout << "Performing a common prefix operation..." << std::endl;
-
{
-
// Common prefix operation is implemented using PrefixIterator, created by
-
// make_prefix_iterator().
-
Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi");
-
-
// next() continues to obtain the next key until false is returned.
-
while (it.next()) {
-
std::cout << it.id() << " : " << it.key() << std::endl;
-
}
-
}
-
-
std::cout << "Performing a predictive operation..." << std::endl;
-
{
-
// Predictive operation is implemented using PredictiveIterator, created by
-
// make_predictive_iterator().
-
Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha");
-
-
// next() continues to obtain the next key until false is returned in
-
// lexicographical order.
-
while (it.next()) {
-
std::cout << it.id() << " : " << it.key() << std::endl;
-
}
-
}
-
-
std::cout << "Enumerating all registered keys..." << std::endl;
-
{
-
// PredictiveIterator for an empty string provides enumeration of all
-
// registered keys in lexicographical order.
-
Trie::PredictiveIterator it = trie.make_predictive_iterator("");
-
while (it.next()) {
-
std::cout << it.id() << " : " << it.key() << std::endl;
-
}
-
}
-
-
return 0;
-
}
-

The standard output is as follows.

-
Performing basic operations...
-7 : Rin
-? : Hotaru
-Performing a common prefix operation...
-4 : Nene
-6 : Nenecchi
-Performing a predictive operation...
-3 : Hajime
-5 : Hazuki
-Enumerating all registered keys...
-0 : Aoba
-3 : Hajime
-5 : Hazuki
-1 : Hihumi
-2 : Kou
-4 : Nene
-6 : Nenecchi
-7 : Rin
-8 : Umiko
-9 : Yun
-

As shown in the output, xcdat::Trie assigns unique integer IDs to each registered key. The ID order is random, depending on node arrangement.

-

API

-

You can build a dictionary using static member function xcdat::TrieBuilder::build(). This function receives a set of keywords and returns the resulting class object of xcdat::Trie. For the usage, refer to the header comments of xcdat::TrieBuilder.hpp. Also for the usage of xcdat::Trie, refer to the header comments of xcdat::Trie.

-

The detailed descriptions of API are under construction…

-

Benchmark

-

Work in progress…

-

To Do

-
    -
  • Show benchmarks
  • -
  • Create API descriptions
  • -
-

Licensing

-

This library is free software provided under the MIT License.

-

Citation

-

If you use the library in academic settings, please cite the following paper.

-
@article{kanda2017compressed,
-
title={Compressed double-array tries for string dictionaries supporting fast lookup},
-
author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},
-
journal={Knowledge and Information Systems},
-
volume={51},
-
number={3},
-
pages={1023--1042},
-
year={2017},
-
publisher={Springer}
-
}
-

References

-
    -
  • J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.
  • -
  • N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.
  • -
  • S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017.
  • -
  • M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016
  • -
  • S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.
  • -
-
-

Copyright © 2017 Shunsuke Kanda, All Rights Reserved.

-
- - diff --git a/docs/pandoc.sh b/docs/pandoc.sh deleted file mode 100755 index 601b1fe..0000000 --- a/docs/pandoc.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -pandoc --template=template.html -o index.html document.md -c style.css --toc --toc-depth=2 \ No newline at end of file diff --git a/docs/style.css b/docs/style.css deleted file mode 100644 index 1ec9fe7..0000000 --- a/docs/style.css +++ /dev/null @@ -1,163 +0,0 @@ -@import url('https://fonts.googleapis.com/css?family=Comfortaa'); -@import url('https://fonts.googleapis.com/css?family=Source+Code+Pro'); -@import url('https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css'); - -body { - background: #fff; - color: #545454; - font-family: 'Comfortaa'; - font-size: 16px; - line-height: 1.5; - margin: 0 auto; - max-width: 800px; - padding: 2em 2em 2em; -} - -h1, -h2, -h3, -h4, -h5, -h6 { - color: #494949; - font-weight: 600; - line-height: 1.3; -} - -h1 { - line-height: 1.7; - text-align: center; -} - -h2 { - margin-top: 1.3em; - padding: 0.25em 0.5em; - color: #494949; - background: transparent; - border-left: solid 5px #7db4e6; -} - -h3 { - margin-top: 1.3em; - padding: 0.25em 0.0em; -} - -h4 { - margin-top: 1.3em; - padding: 0.25em 0.0em; -} - -a { - color: #0083e8; - text-decoration: none; -} - -a:hover { - text-decoration: underline; -} - -b, -strong { - font-weight: 600; - background: linear-gradient(transparent 75%, #a7d6ff 70%); -} - -img { - animation: colorize 2s cubic-bezier(0, 0, .78, .36) 1; - background: transparent; - border: 10px solid rgba(0, 0, 0, 0.12); - border-radius: 4px; - display: block; - margin: 1.3em auto; - max-width: 95%; -} - -blockquote { - position: relative; - padding: 10px 15px 10px 60px; - box-sizing: border-box; - background: #f5f5f5; - color: #777777; - border-left: 4px solid #9dd4ff; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.14); -} - -blockquote:before { - display: inline-block; - position: absolute; - top: 15px; - left: 15px; - vertical-align: middle; - 
content: "\f10d"; - font-family: FontAwesome; - color: #9dd4ff; - font-size: 30px; - line-height: 1; -} - -blockquote p { - padding: 0; - margin: 7px 0; -} - -blockquote cite { - display: block; - text-align: right; - color: #888888; - font-size: 0.9em; -} - -ul { - padding: 0 0.5em; - position: relative; -} - -ul li { - line-height: 1.5; - padding: 0.2em 0 0.5em 1.5em; - border-bottom: 2px solid white; - list-style-type: none!important; -} - -ul li:before { - font-family: FontAwesome; - content: "\f00c"; - position: absolute; - left: 0.5em; - color: #9dd4ff; -} - -ul li:last-of-type { - border-bottom: none; -} - -pre, -code { - background: #f5f5f5; - font-family: 'Source Code Pro', monospace; -} - -p code { - padding: 0.1em 0.5em; -} - -pre { - font-size: 0.95rem; - padding: 1em; - overflow: auto; - white-space: pre; -} - -pre.sourceCode { - font-size: 0.95rem; - padding: 1em; - overflow: auto; - white-space: pre; -} - -footer { - font-size: 14px; - color: #8f9296; - text-align: center; - margin-top: 40px; -} diff --git a/docs/template.html b/docs/template.html deleted file mode 100644 index b6fc7cf..0000000 --- a/docs/template.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - -$for(author-meta)$ - -$endfor$ -$if(date-meta)$ - -$endif$ -$if(keywords)$ - -$endif$ - $if(title-prefix)$$title-prefix$ – $endif$$pagetitle$ - -$if(highlighting-css)$ - -$endif$ -$for(css)$ - -$endfor$ -$if(math)$ - $math$ -$endif$ - -$for(header-includes)$ - $header-includes$ -$endfor$ - - -$for(include-before)$ -$include-before$ -$endfor$ -$if(title)$ -
-

$title$

-

Created by $author$

-
-$endif$ -$if(toc)$ -

Contents

- -$endif$ -$body$ -$for(include-after)$ -$include-after$ -$endfor$ -
-

Copyright © $date$ $author$, All Rights Reserved.

-
- - diff --git a/include/xcdat.hpp b/include/xcdat.hpp deleted file mode 100644 index 85ccabd..0000000 --- a/include/xcdat.hpp +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef XCDAT_XCDAT_HPP_ -#define XCDAT_XCDAT_HPP_ - -#include "xcdat/TrieBuilder.hpp" - -#endif //XCDAT_XCDAT_HPP_ diff --git a/include/xcdat/BitVector.hpp b/include/xcdat/BitVector.hpp deleted file mode 100644 index 52ca4ec..0000000 --- a/include/xcdat/BitVector.hpp +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef XCDAT_BIT_VECTOR_HPP_ -#define XCDAT_BIT_VECTOR_HPP_ - -#include "BitVectorBuilder.hpp" -#include "Vector.hpp" - -namespace xcdat { - -// Bit vector supporting Rank/Select operations. -class BitVector { -public: - BitVector() = default; - explicit BitVector(std::istream &is); - BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag); - - ~BitVector() = default; - - bool operator[](size_t i) const { - return (bits_[i / 32] & (1U << (i % 32))) != 0; - } - - // the number of 1s in B[0,i). - id_type rank(id_type i) const; - // the position of the i+1 th occurrence. 
- id_type select(id_type i) const; - - size_t num_1s() const { - return num_1s_; - } - size_t num_0s() const { - return size_ - num_1s_; - } - - // the number of bits - size_t size() const { - return size_; - } - - size_t size_in_bytes() const; - - void write(std::ostream &os) const; - - void swap(BitVector& rhs) { - std::swap(*this, rhs); - } - - BitVector(const BitVector&) = delete; - BitVector& operator=(const BitVector&) = delete; - - BitVector(BitVector&&) noexcept = default; - BitVector& operator=(BitVector&&) noexcept = default; - -private: - static constexpr id_type BITS_IN_R1 {256}; - static constexpr id_type BITS_IN_R2 {32}; - static constexpr id_type R1_PER_R2 {BITS_IN_R1 / BITS_IN_R2}; // 8 - static constexpr id_type ONES_PER_TIP {512}; - - struct RankTip { - id_type L1; - uint8_t L2[R1_PER_R2]; - }; - - Vector bits_ {}; - Vector rank_tips_ {}; - Vector select_tips_ {}; - size_t size_ {}; - size_t num_1s_ {}; -}; - -} //namespace - xcdat - -#endif //XCDAT_BIT_VECTOR_HPP_ diff --git a/include/xcdat/BitVectorBuilder.hpp b/include/xcdat/BitVectorBuilder.hpp deleted file mode 100644 index 21d0296..0000000 --- a/include/xcdat/BitVectorBuilder.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef XCDAT_BIT_VECTOR_BUILDER_HPP_ -#define XCDAT_BIT_VECTOR_BUILDER_HPP_ - -#include "xcdat_basics.hpp" - -namespace xcdat { - -// Bit pool for building BitVector. 
-class BitVectorBuilder { -public: - friend class BitVector; - - BitVectorBuilder() = default; - ~BitVectorBuilder() = default; - - explicit BitVectorBuilder(size_t size) { - resize(size); - } - - void push_back(bool bit) { - if (size_ % 32 == 0) { - bits_.push_back(0); - } - if (bit) { - set_bit(size_, true); - } - ++size_; - } - - void set_bit(size_t i, bool bit) { - if (bit) { - bits_[i / 32] |= (1U << (i % 32)); - ++num_1s_; - } else { - bits_[i / 32] &= (~(1U << (i % 32))); - --num_1s_; - } - } - - void resize(size_t size) { - bits_.resize(size / 32 + 1, 0); - size_ = size; - } - - void reserve(size_t capacity) { - bits_.reserve(capacity / 32 + 1); - } - - size_t size() const { - return size_; - } - - BitVectorBuilder(const BitVectorBuilder&) = delete; - BitVectorBuilder& operator=(const BitVectorBuilder&) = delete; - -private: - std::vector bits_ {}; - size_t size_ {}; - size_t num_1s_ {}; -}; - -} //namespace - xcdat - -#endif //XCDAT_BIT_VECTOR_BUILDER_HPP_ diff --git a/include/xcdat/DacBc.hpp b/include/xcdat/DacBc.hpp deleted file mode 100644 index 541c79a..0000000 --- a/include/xcdat/DacBc.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef XCDAT_DAC_BC_HPP_ -#define XCDAT_DAC_BC_HPP_ - -#include "BitVector.hpp" -#include "FitVector.hpp" - -namespace xcdat { - -// BASE/CHECK representation using byte-oriented DACs. 
-class DacBc { -public: - static constexpr id_type WIDTH_L1 {8}; - - DacBc() = default; - ~DacBc() = default; - - explicit DacBc(std::istream &is); - explicit DacBc(const std::vector& bc, BitVectorBuilder& leaf_flags); - - id_type base(id_type i) const { - return access_(i * 2) ^ i; - } - id_type link(id_type i) const { - return values_[0][i * 2] | (links_[leaf_flags_.rank(i)] << 8); - } - id_type check(id_type i) const { - return access_(i * 2 + 1) ^ i; - } - - bool is_leaf(id_type i) const { - return leaf_flags_[i]; - } - bool is_used(id_type i) const { - return check(i) != i; - } - - size_t num_nodes() const { - return values_[0].size() / 2; - } - size_t num_used_nodes() const { - return num_nodes() - num_free_nodes_; - } - size_t num_free_nodes() const { - return num_free_nodes_; - } - - size_t size_in_bytes() const; - void show_stat(std::ostream &os) const; - - void write(std::ostream &os) const; - - void swap(DacBc& rhs) { - std::swap(*this, rhs); - } - - DacBc(const DacBc&) = delete; - DacBc& operator=(const DacBc&) = delete; - - DacBc(DacBc&&) noexcept = default; - DacBc& operator=(DacBc&&) noexcept = default; - -private: - Vector values_[sizeof(id_type)] {}; - BitVector flags_[sizeof(id_type) - 1] {}; - BitVector leaf_flags_ {}; - FitVector links_ {}; - uint8_t max_level_ {}; - size_t num_free_nodes_ {}; - - id_type access_(id_type i) const; -}; - -} //namespace - xcdat - -#endif //XCDAT_DAC_BC_HPP_ diff --git a/include/xcdat/FastDacBc.hpp b/include/xcdat/FastDacBc.hpp deleted file mode 100644 index 247de4f..0000000 --- a/include/xcdat/FastDacBc.hpp +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef XCDAT_FAST_DAC_BC_HPP_ -#define XCDAT_FAST_DAC_BC_HPP_ - -#include - -#include "BitVector.hpp" -#include "FitVector.hpp" -#include "Vector.hpp" - -namespace xcdat { - -// BASE/CHECK representation using pointer-based byte-oriented DACs. 
-class FastDacBc { -public: - static constexpr id_type WIDTH_L1 = 7; -#ifdef XCDAT_X64 - static constexpr uint8_t LAYERS = 4; -#else - static constexpr uint8_t LAYERS = 3; -#endif - - static constexpr id_type BLOCK_SIZE_L1 = 1U << 7; - static constexpr id_type BLOCK_SIZE_L2 = 1U << 15; -#ifdef XCDAT_X64 - static constexpr id_type BLOCK_SIZE_L3 = 1U << 31; -#endif - - FastDacBc() = default; - explicit FastDacBc(std::istream& is); - explicit FastDacBc(const std::vector& bc, - BitVectorBuilder& leaf_flags); - - ~FastDacBc() = default; - - id_type base(id_type i) const { - return access_(i * 2) ^ i; - } - id_type link(id_type i) const { - return values_L1_[i * 2] | (links_[leaf_flags_.rank(i)] << 8); - } - id_type check(id_type i) const { - return access_(i * 2 + 1) ^ i; - } - - bool is_leaf(id_type i) const { - return leaf_flags_[i]; - } - bool is_used(id_type i) const { - return check(i) != i; - } - - size_t num_nodes() const { - return values_L1_.size() / 2; - } - size_t num_used_nodes() const { - return num_nodes() - num_free_nodes_; - } - size_t num_free_nodes() const { - return num_free_nodes_; - } - - size_t size_in_bytes() const; - void show_stat(std::ostream& os) const; - void write(std::ostream& os) const; - - void swap(FastDacBc& rhs) { - std::swap(*this, rhs); - } - - FastDacBc(const FastDacBc&) = delete; - FastDacBc& operator=(const FastDacBc&) = delete; - - FastDacBc(FastDacBc&&) noexcept = default; - FastDacBc& operator=(FastDacBc&&) noexcept = default; - -private: - Vector values_L1_{}; - Vector values_L2_{}; - Vector values_L3_{}; -#ifdef XCDAT_X64 - Vector values_L4_ {}; -#endif - Vector ranks_[LAYERS - 1]{}; - BitVector leaf_flags_{}; - FitVector links_{}; - size_t num_free_nodes_{}; - - id_type access_(id_type i) const; -}; - -} //namespace - xcdat - -#endif //XCDAT_FAST_DAC_BC_HPP_ diff --git a/include/xcdat/FitVector.hpp b/include/xcdat/FitVector.hpp deleted file mode 100644 index 8f85979..0000000 --- a/include/xcdat/FitVector.hpp +++ /dev/null @@ 
-1,56 +0,0 @@ -#ifndef XCDAT_SMALL_VECTOR_HPP_ -#define XCDAT_SMALL_VECTOR_HPP_ - -#include "Vector.hpp" - -namespace xcdat { - -// Compacted integer vector. -class FitVector { -public: - static constexpr id_type CHUNK_WIDTH = sizeof(id_type) * 8; - - FitVector() = default; - explicit FitVector(std::istream &is); - explicit FitVector(const std::vector& values); - - ~FitVector() = default; - - id_type operator[](size_t i) const { - auto chunk_pos = static_cast(i * width_ / CHUNK_WIDTH); - auto offset = static_cast(i * width_ % CHUNK_WIDTH); - if (offset + width_ <= CHUNK_WIDTH) { - return (chunks_[chunk_pos] >> offset) & mask_; - } else { - return ((chunks_[chunk_pos] >> offset) - | (chunks_[chunk_pos + 1] << (CHUNK_WIDTH - offset))) & mask_; - } - } - - size_t size() const { - return size_; - } - size_t size_in_bytes() const; - - void write(std::ostream &os) const; - - void swap(FitVector& rhs) { - std::swap(*this, rhs); - } - - FitVector(const FitVector&) = delete; - FitVector& operator=(const FitVector&) = delete; - - FitVector(FitVector&&) noexcept = default; - FitVector& operator=(FitVector&&) noexcept = default; - -private: - Vector chunks_ {}; - size_t size_ {}; - id_type width_ {}; - id_type mask_ {}; -}; - -} //namespace - xcdat - -#endif //XCDAT_SMALL_VECTOR_HPP_ diff --git a/include/xcdat/Trie.hpp b/include/xcdat/Trie.hpp deleted file mode 100644 index f2e26b8..0000000 --- a/include/xcdat/Trie.hpp +++ /dev/null @@ -1,514 +0,0 @@ -#ifndef XCDAT_TRIE_HPP_ -#define XCDAT_TRIE_HPP_ - -#include -#include - -#include "Trie.hpp" -#include "DacBc.hpp" -#include "FastDacBc.hpp" - -namespace xcdat { - -// Compressed string dictionary using an improved double-array trie. There are -// two versions of DACs to represent BASE/CHECK arrays in small space. The -// versions can be chosen using the Fast parameter. 
-template -class Trie { -public: - using trie_type = Trie; - using bc_type = typename std::conditional::type; - - static constexpr auto NOT_FOUND = ID_MAX; - - // Generic constructor. - Trie() = default; - - // Reads the dictionary from an std::istream. - explicit Trie(std::istream& is) { - bc_ = bc_type(is); - terminal_flags_ = BitVector(is); - tail_ = Vector(is); - boundary_flags_ = BitVector(is); - alphabet_ = Vector(is); - is.read(reinterpret_cast(table_), 512); - num_keys_ = read_value(is); - max_length_ = read_value(is); - bin_mode_ = read_value(is); - } - - // Generic destructor. - ~Trie() = default; - - // Lookups the ID of a given key. If the key is not registered, otherwise - // returns NOT_FOUND. - id_type lookup(std::string_view key) const { - size_t pos = 0; - id_type node_id = 0; - - while (!bc_.is_leaf(node_id)) { - if (pos == key.length()) { - return terminal_flags_[node_id] ? to_key_id_(node_id) : NOT_FOUND; - } - - const auto child_id = bc_.base(node_id) ^code_(key[pos++]); - if (bc_.check(child_id) != node_id) { - return NOT_FOUND; - } - - node_id = child_id; - } - - size_t tail_pos = bc_.link(node_id); - if (!match_suffix_(key, pos, tail_pos)) { - return NOT_FOUND; - } - - return to_key_id_(node_id); - } - - // Decodes the key associated with a given ID. - std::string access(id_type id) const { - if (num_keys_ <= id) { - return {}; - } - - std::string dec; - dec.reserve(max_length_); - - auto node_id = to_node_id_(id); - auto tail_pos = bc_.is_leaf(node_id) ? 
bc_.link(node_id) : NOT_FOUND; - - while (node_id) { - const auto parent_id = bc_.check(node_id); - dec += edge_(parent_id, node_id); - node_id = parent_id; - } - - std::reverse(std::begin(dec), std::end(dec)); - - if (tail_pos != 0 && tail_pos != NOT_FOUND) { - if (bin_mode_) { - do { - dec += tail_[tail_pos]; - } while (!boundary_flags_[tail_pos++]); - } else { - do { - dec += tail_[tail_pos++]; - } while (tail_[tail_pos]); - } - } - - return dec; - } - - // Iterator for enumerating the keys and IDs included as prefixes of a given - // key, that is, supporting so-called common prefix lookup. It is created by - // using make_prefix_iterator(). - class PrefixIterator { - public: - PrefixIterator() = default; - - // Scans the next key. If it does not exist, returns false. - bool next() { - return trie_ != nullptr && trie_->next_prefix_(this); - } - - // Gets the key. - std::string_view key() const { - return {key_.data(), pos_}; - }; - // Gets the ID. - id_type id() const { - return id_; - } - - private: - const trie_type* trie_{}; - const std::string_view key_{}; - - size_t pos_{0}; - id_type node_id_{0}; - id_type id_{}; - - bool begin_flag_{true}; - bool end_flag_{false}; - - PrefixIterator(const trie_type* trie, std::string_view key) - : trie_{trie}, key_{key} {} - - friend class Trie; - }; - - // Makes PrefixIterator from a given key. - PrefixIterator make_prefix_iterator(std::string_view key) const { - return PrefixIterator{this, key}; - } - - // Iterator class for enumerating the keys and IDs starting with prefixes of - // a given key, that is, supporting so-called predictive lookup. It is in - // lexicographical order. It is created by using make_predictive_iterator(). - class PredictiveIterator { - public: - PredictiveIterator() = default; - - // Scans the next key. If it does not exist, returns false. - bool next() { - return trie_ != nullptr && trie_->next_predictive_(this); - } - - // Gets the key. 
- std::string_view key() const { - return {buf_.data(), buf_.size()}; - }; - // Gets the ID. - id_type id() const { - return id_; - } - - private: - const trie_type* trie_{}; - const std::string_view key_{}; - - bool begin_flag_{true}; - bool end_flag_{false}; - - struct stack_t { - size_t depth; - char c; - id_type node_id; - }; - - std::vector stack_{}; - std::string buf_{}; - id_type id_{}; - - PredictiveIterator(const trie_type* trie, std::string_view key) - : trie_{trie}, key_{key} { - buf_.reserve(trie->max_length_); - } - - friend class Trie; - }; - - // Makes PredictiveIterator from a given key. - PredictiveIterator make_predictive_iterator(std::string_view key) const { - return {this, key}; - } - - // Gets the number of registered keys in the dictionary - size_t num_keys() const { - return num_keys_; - } - - // Gets whether a binary mode or not. - bool bin_mode() const { - return bin_mode_; - } - - // Gets the size of alphabet drawing keys in the dictionary. - size_t alphabet_size() const { - return alphabet_.size(); - } - - // Gets the number of nodes including free nodes. - size_t num_nodes() const { - return bc_.num_nodes(); - } - - // Gets the number of nodes in the original trie. - size_t num_used_nodes() const { - return bc_.num_used_nodes(); - } - - // Gets the number of free nodes corresponding to empty elements. - size_t num_free_nodes() const { - return bc_.num_free_nodes(); - } - - // Computes the output dictionary size in bytes. - size_t size_in_bytes() const { - size_t ret = 0; - ret += bc_.size_in_bytes(); - ret += terminal_flags_.size_in_bytes(); - ret += tail_.size_in_bytes(); - ret += boundary_flags_.size_in_bytes(); - ret += alphabet_.size_in_bytes(); - ret += sizeof(table_); - ret += sizeof(num_keys_); - ret += sizeof(max_length_); - ret += sizeof(bin_mode_); - return ret; - } - - // Reports the dictionary statistics into an ostream. 
- void show_stat(std::ostream& os) const { - const auto total_size = size_in_bytes(); - os << "basic statistics of xcdat::Trie<" - << (Fast ? "true" : "false") << ">" << std::endl; - show_size("\tnum keys: ", num_keys(), os); - show_size("\talphabet size: ", alphabet_size(), os); - show_size("\tnum nodes: ", num_nodes(), os); - show_size("\tnum used nodes:", num_used_nodes(), os); - show_size("\tnum free nodes:", num_free_nodes(), os); - show_size("\tsize in bytes: ", size_in_bytes(), os); - os << "member size statistics of xcdat::Trie<" - << (Fast ? "true" : "false") << ">" << std::endl; - show_size_ratio("\tbc: ", bc_.size_in_bytes(), total_size, os); - show_size_ratio("\tterminal_flags:", terminal_flags_.size_in_bytes(), - total_size, os); - show_size_ratio("\ttail: ", tail_.size_in_bytes(), total_size, os); - show_size_ratio("\tboundary_flags:", boundary_flags_.size_in_bytes(), - total_size, os); - bc_.show_stat(os); - } - - // Writes the dictionary into an ostream. - void write(std::ostream& os) const { - bc_.write(os); - terminal_flags_.write(os); - tail_.write(os); - boundary_flags_.write(os); - alphabet_.write(os); - os.write(reinterpret_cast(table_), 512); - write_value(num_keys_, os); - write_value(max_length_, os); - write_value(bin_mode_, os); - } - - // Swap - void swap(Trie& rhs) { - std::swap(*this, rhs); - } - - Trie(const Trie&) = delete; - Trie& operator=(const Trie&) = delete; - - Trie(Trie&&) noexcept = default; - Trie& operator=(Trie&&) noexcept = default; - -private: - bc_type bc_{}; - BitVector terminal_flags_{}; - Vector tail_{}; - BitVector boundary_flags_{}; // used if binary_mode_ == true - Vector alphabet_{}; - uint8_t table_[512]{}; // table[table[c] + 256] = c - - size_t num_keys_{}; - size_t max_length_{}; - bool bin_mode_{}; - - id_type to_key_id_(id_type node_id) const { - return terminal_flags_.rank(node_id); - }; - id_type to_node_id_(id_type string_id) const { - return terminal_flags_.select(string_id); - }; - id_type code_(char 
c) const { - return table_[static_cast(c)]; - } - char edge_(id_type node_id, id_type child_id) const { - return static_cast(table_[(bc_.base(node_id) ^ child_id) + 256]); - } - - bool match_suffix_(std::string_view key, size_t pos, size_t tail_pos) const { - assert(pos <= key.length()); - - if (pos == key.length()) { - return tail_pos == 0; - } - - if (bin_mode_) { - do { - if (key[pos] != tail_[tail_pos]) { - return false; - } - ++pos; - if (boundary_flags_[tail_pos]) { - return pos == key.length(); - } - ++tail_pos; - } while (pos < key.length()); - return false; - } else { - do { - if (!tail_[tail_pos] || key[pos] != tail_[tail_pos]) { - return false; - } - ++pos; - ++tail_pos; - } while (pos < key.length()); - return !tail_[tail_pos]; - } - } - - void extract_suffix_(size_t tail_pos, std::string& dec) const { - if (bin_mode_) { - if (tail_pos != 0) { - do { - dec += tail_[tail_pos]; - } while (!boundary_flags_[tail_pos++]); - } - } else { - while (tail_[tail_pos] != '\0') { - dec += tail_[tail_pos]; - ++tail_pos; - } - } - } - - bool next_prefix_(PrefixIterator* it) const { - if (it->end_flag_) { - return false; - } - - if (it->begin_flag_) { - it->begin_flag_ = false; - if (terminal_flags_[it->node_id_]) { - it->id_ = to_key_id_(it->node_id_); - return true; - } - } - - while (!bc_.is_leaf(it->node_id_)) { - id_type child_id = bc_.base(it->node_id_) ^code_(it->key_[it->pos_++]); - if (bc_.check(child_id) != it->node_id_) { - it->end_flag_ = true; - it->id_ = NOT_FOUND; - return false; - } - it->node_id_ = child_id; - if (!bc_.is_leaf(it->node_id_) && terminal_flags_[it->node_id_]) { - it->id_ = to_key_id_(it->node_id_); - return true; - } - } - - it->end_flag_ = true; - size_t tail_pos = bc_.link(it->node_id_); - - if (!match_suffix_(it->key_, it->pos_, tail_pos)) { - it->id_ = NOT_FOUND; - return false; - } - - it->pos_ = it->key_.length(); - it->id_ = to_key_id_(it->node_id_); - return true; - } - - bool next_predictive_(PredictiveIterator* it) const { - if 
(it->end_flag_) { - return false; - } - - if (it->begin_flag_) { - it->begin_flag_ = false; - - id_type node_id = 0; - size_t pos = 0; - - for (; pos < it->key_.length(); ++pos) { - if (bc_.is_leaf(node_id)) { - it->end_flag_ = true; - - size_t tail_pos = bc_.link(node_id); - if (tail_pos == 0) { - return false; - } - - if (bin_mode_) { - do { - if (it->key_[pos] != tail_[tail_pos]) { - return false; - } - it->buf_ += it->key_[pos++]; - if (boundary_flags_[tail_pos]) { - if (pos == it->key_.length()) { - it->id_ = to_key_id_(node_id); - return true; - } - return false; - } - ++tail_pos; - } while (pos < it->key_.length()); - } else { - do { - if (it->key_[pos] != tail_[tail_pos] || !tail_[tail_pos]) { - return false; - } - it->buf_ += it->key_[pos++]; - ++tail_pos; - } while (pos < it->key_.length()); - } - - it->id_ = to_key_id_(node_id); - extract_suffix_(tail_pos, it->buf_); - return true; - } - - id_type child_id = bc_.base(node_id) ^code_(it->key_[pos]); - - if (bc_.check(child_id) != node_id) { - it->end_flag_ = true; - return false; - } - - node_id = child_id; - it->buf_ += it->key_[pos]; - } - - if (!it->buf_.empty()) { - it->stack_.push_back({pos, it->buf_.back(), node_id}); - } else { - it->stack_.push_back({pos, '\0', node_id}); - } - } - - while (!it->stack_.empty()) { - id_type node_id = it->stack_.back().node_id; - size_t depth = it->stack_.back().depth; - uint8_t c = it->stack_.back().c; - it->stack_.pop_back(); - - if (0 < depth) { - it->buf_.resize(depth); - it->buf_.back() = c; - } - - if (bc_.is_leaf(node_id)) { - it->id_ = to_key_id_(node_id); - extract_suffix_(bc_.link(node_id), it->buf_); - return true; - } - - const id_type base = bc_.base(node_id); - - // For lex sort - for (auto rit = std::rbegin(alphabet_); - rit != std::rend(alphabet_); ++rit) { - const id_type child_id = base ^code_(*rit); - if (bc_.check(child_id) == node_id) { - it->stack_.push_back( - {depth + 1, static_cast(*rit), child_id} - ); - } - } - - if 
(terminal_flags_[node_id]) { - it->id_ = to_key_id_(node_id); - return true; - } - } - - it->end_flag_ = true; - return false; - } - - friend class TrieBuilder; -}; - -} //namespace - xcdat - -#endif //XCDAT_TRIE_HPP_ diff --git a/include/xcdat/TrieBuilder.hpp b/include/xcdat/TrieBuilder.hpp deleted file mode 100644 index 3b213b7..0000000 --- a/include/xcdat/TrieBuilder.hpp +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef XCDAT_TRIE_BUILDER_HPP_ -#define XCDAT_TRIE_BUILDER_HPP_ - -#include "Trie.hpp" - -namespace xcdat { - -// Double-array trie builder. -class TrieBuilder { -public: - // Builds the dictionary from given string keys. The keys must be sorted in - // lexicographical order without duplication. Any error in construction is - // reported by TrieBuilder::Exception. If the keys include the ASCII zero - // code, pass bin_mode = true. - template - static Trie - build(const std::vector& keys, bool bin_mode = false) { - TrieBuilder builder(keys, Trie::bc_type::WIDTH_L1, bin_mode); - - Trie trie; - - trie.bc_ = typename Trie::bc_type(builder.bc_, builder.leaf_flags_); - trie.terminal_flags_ = BitVector(builder.term_flags_, true, true); - trie.tail_ = Vector(builder.tail_); - trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false); - trie.alphabet_ = Vector(builder.alphabet_); - std::swap(trie.table_, builder.table_); - - trie.num_keys_ = keys.size(); - trie.max_length_ = builder.max_length_; - trie.bin_mode_ = builder.bin_mode_; - - return trie; - } - - // Exception class for xcdat::TrieBuilder - class Exception : public std::exception { - public: - explicit Exception(std::string message) : message_(std::move(message)) {} - ~Exception() throw() override {}; - - // overrides what() of std::exception. 
- const char* what() const throw() override { - return message_.c_str(); - } - - private: - std::string message_; - }; - - TrieBuilder(const TrieBuilder&) = delete; - TrieBuilder& operator=(const TrieBuilder&) = delete; - -private: - struct Suffix { - std::string_view str; - id_type node_id; - - size_t length() const { - return str.length(); - } - char operator[](size_t i) const { - return str[length() - i - 1]; - } - - std::reverse_iterator rbegin() const { - return std::make_reverse_iterator(str.data() + str.length()); - } - std::reverse_iterator rend() const { - return std::make_reverse_iterator(str.data()); - } - }; - - // To avoid undefined traversal - static constexpr id_type TABOO_ID = 1; - // From darts-clone setting - static constexpr id_type FREE_BLOCKS = 16; - - const std::vector& keys_; - const id_type block_size_; - const id_type width_L1_; - - bool bin_mode_{}; - - std::vector bc_{}; - BitVectorBuilder leaf_flags_{}; - BitVectorBuilder term_flags_{}; - std::vector tail_{}; - BitVectorBuilder boundary_flags_{}; - std::vector alphabet_{}; - uint8_t table_[512]{}; - - std::vector used_flags_{}; - std::vector edges_{}; - std::vector heads_{}; - std::vector suffixes_{}; - - size_t max_length_{}; - - TrieBuilder(const std::vector& keys, - id_type width_L1, bool bin_mode); - ~TrieBuilder() = default; - - void build_table_(); - void build_bc_(size_t begin, size_t end, size_t depth, id_type node_id); - void build_tail_(); - - void expand_(); - void use_(id_type node_id); - void close_block_(id_type block_id); - id_type find_base_(id_type block_id) const; - bool is_target_(id_type base) const; -}; - -} //namespace - xcdat - -#endif //XCDAT_TRIE_BUILDER_HPP_ diff --git a/include/xcdat/Vector.hpp b/include/xcdat/Vector.hpp deleted file mode 100644 index df8aded..0000000 --- a/include/xcdat/Vector.hpp +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef XCDAT_VECTOR_HPP -#define XCDAT_VECTOR_HPP - -#include "xcdat_basics.hpp" - -namespace xcdat { - -// Simple vector of a POD 
type -template -class Vector { -public: - static_assert(!std::is_same::value, "Type bool is not supported."); - static_assert(std::is_pod::value, "T is not POD."); - - Vector() = default; - - explicit Vector(std::istream& is) { - size_ = read_value(is); - vec_.resize(size_); - is.read(reinterpret_cast(&vec_[0]), sizeof(T) * size_); - data_ = vec_.data(); - } - - explicit Vector(std::vector& vec) { - if (vec.size() != vec.capacity()) { - vec.shrink_to_fit(); - } - vec_ = std::move(vec); - data_ = vec_.data(); - size_ = vec_.size(); - } - - ~Vector() = default; - - const T& operator[](size_t i) const { - return data_[i]; - } - const T* data() const { - return data_; - } - - const T* begin() const { - return data_; - } - const T* end() const { - return data_ + size_; - } - - std::reverse_iterator rbegin() const { - return std::make_reverse_iterator(end()); - } - std::reverse_iterator rend() const { - return std::make_reverse_iterator(begin()); - } - - bool is_empty() const { - return size_ == 0; - } - - size_t size() const { - return size_; - } - - size_t size_in_bytes() const { - return size_ * sizeof(T) + sizeof(size_); - } - - void write(std::ostream& os) const { - write_value(size_, os); - os.write(reinterpret_cast(data_), sizeof(T) * size_); - } - - void swap(Vector& rhs) { - std::swap(*this, rhs); - } - - Vector(const Vector&) = delete; - Vector& operator=(const Vector&) = delete; - - Vector(Vector&&) noexcept = default; - Vector& operator=(Vector&&) noexcept = default; - -private: - const T* data_ {}; - size_t size_ {}; - std::vector vec_ {}; -}; - -} - -#endif //XCDAT_VECTOR_HPP diff --git a/include/xcdat/xcdat_basics.hpp b/include/xcdat/xcdat_basics.hpp deleted file mode 100644 index eec0a99..0000000 --- a/include/xcdat/xcdat_basics.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef XCDAT_BASICS_HPP_ -#define XCDAT_BASICS_HPP_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xcdat_config.hpp" - 
-namespace xcdat { - -#ifdef XCDAT_X64 -using id_type = uint64_t; -#else -using id_type = uint32_t; -#endif - -constexpr id_type ID_MAX = std::numeric_limits::max(); - -struct BcPair { - id_type base; - id_type check; -}; - -inline void show_size(const char* str, double size, std::ostream& os) { - os << str << "\t" << size << std::endl; -} - -inline void show_size(const char* str, size_t size, std::ostream& os) { - os << str << "\t" << size << std::endl; -} - -inline void show_size_ratio(const char* str, size_t size, size_t denom, std::ostream& os) { - os << str << "\t" << size << "\t" << static_cast(size) / denom << std::endl; -} - -template -inline void write_value(const T val, std::ostream& os) { - os.write(reinterpret_cast(&val), sizeof(val)); -} - -template -inline T read_value(std::istream& is) { - T val; - is.read(reinterpret_cast(&val), sizeof(val)); - return val; -} - -} //namespace - xcdat - -#endif //XCDAT_BASICS_HPP_ diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt deleted file mode 100644 index 1f6f911..0000000 --- a/sample/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ - -add_executable(sample sample.cpp) -target_link_libraries(sample xcdat) diff --git a/sample/sample.cpp b/sample/sample.cpp deleted file mode 100644 index c62edd4..0000000 --- a/sample/sample.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include - -int main() { - std::vector keys_buf = { - "Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin", - "Hazuki", "Umiko", "Nene", "Nenecchi" - }; - - // Convert to the input format - std::vector keys(keys_buf.size()); - for (size_t i = 0; i < keys.size(); ++i) { - keys[i] = std::string_view{keys_buf[i]}; - } - - // Input data must be sorted. - std::sort(std::begin(keys), std::end(keys)); - - // Dictionary class - using Trie = xcdat::Trie; - - try { - // Builds a dictionary from the keys - Trie trie = xcdat::TrieBuilder::build(keys); // move - - // Writes the dictionary to a file. 
- std::ofstream ofs{"sample.bin"}; - trie.write(ofs); - } catch (const xcdat::TrieBuilder::Exception& ex) { - // Abort if something went wrong... - std::cerr << ex.what() << std::endl; - return 1; - } - - // Creates an empty dictionary - Trie trie; - { - // Reads the dictionary to the file. - std::ifstream ifs{"sample.bin"}; - trie = Trie{ifs}; // move - } - - std::cout << "Performing basic operations..." << std::endl; - { - // lookup() obtains the unique ID for a given key - xcdat::id_type key_id = trie.lookup("Rin"); - // access() decodes the key from a given ID - std::cout << key_id << " : " << trie.access(key_id) << std::endl; - - // Given an unregistered key, lookup() returns NOT_FOUND. - if (trie.lookup("Hotaru") == Trie::NOT_FOUND) { - std::cout << "? : " << "Hotaru" << std::endl; - } - } - - std::cout << "Performing a common prefix operation..." << std::endl; - { - // Common prefix operation is implemented using PrefixIterator, created by - // make_prefix_iterator(). - Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi"); - - // next() continues to obtain the next key until false is returned. - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - std::cout << "Performing a predictive operation..." << std::endl; - { - // Predictive operation is implemented using PredictiveIterator, created by - // make_predictive_iterator(). - Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha"); - - // next() continues to obtain the next key until false is returned in - // lexicographical order. - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - std::cout << "Enumerating all registered keys..." << std::endl; - { - // PredictiveIterator for an empty string provides enumeration of all - // registered keys in lexicographical order. 
- Trie::PredictiveIterator it = trie.make_predictive_iterator(""); - while (it.next()) { - std::cout << it.id() << " : " << it.key() << std::endl; - } - } - - return 0; -} diff --git a/src/BitVector.cpp b/src/BitVector.cpp deleted file mode 100644 index f33bcce..0000000 --- a/src/BitVector.cpp +++ /dev/null @@ -1,314 +0,0 @@ -#include - -#include "xcdat/BitVector.hpp" - -namespace xcdat { - -// inspired by marisa-trie -constexpr uint8_t SELECT_TABLE[9][256] = { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }, - { - 8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 6, 1, 2, 1, 3, 1, 
2, 1, 4, 1, 2, 1, 3, 1, 2, 1, - 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1 - }, - { - 8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2, - 8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, - 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2, - 7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, - 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2, - 8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, - 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2, - 7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, - 7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, - 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2 - }, - { - 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3, - 8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3, - 8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3, - 8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, - 8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3, - 8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3, - 8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3, - 7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, - 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3, - 8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3, - 8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3, - 8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, - 8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3, - 8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3, - 8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3, - 7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3 - }, - { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, - 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, - 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, - 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, - 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 
5, 4, - 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, - 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, - 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, - 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, - 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, - 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, - 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, - 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4 - }, - { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, - 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, - 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5 - }, - { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6 - }, - { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 - }, - { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 - } -}; - -uint32_t pop_count(uint32_t x) { -#ifdef XCDAT_USE_POPCNT - return static_cast(_mm_popcnt_u32(x)); -#else - x = ((x & 0xAAAAAAAA) >> 1) + (x & 0x55555555); - x = ((x & 0xCCCCCCCC) >> 2) + (x & 0x33333333); - x = ((x >> 4) + x) & 0x0F0F0F0F; - x += x 
>> 8; - x += x >> 16; - return x & 0x3F; -#endif -} - -BitVector::BitVector(std::istream& is) { - bits_ = Vector(is); - rank_tips_ = Vector(is); - select_tips_ = Vector(is); - size_ = read_value(is); - num_1s_ = read_value(is); -} - -BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag, - bool select_flag) { - if (!builder.size()) { - return; - } - - bits_ = Vector(builder.bits_); - size_ = builder.size_; - num_1s_ = builder.num_1s_; - - // builds rank_tips_ - if (rank_flag) { - std::vector rank_tips(size_ / BITS_IN_R1 + 1); - id_type count = 0; - for (id_type i = 0; i < rank_tips.size(); ++i) { - auto& tip = rank_tips[i]; - tip.L1 = count; - for (id_type offset = 0; offset < R1_PER_R2; ++offset) { - tip.L2[offset] = static_cast(count - tip.L1); - auto pos_in_bits = i * R1_PER_R2 + offset; - if (pos_in_bits < bits_.size()) { - count += pop_count(bits_[pos_in_bits]); - } - } - } - rank_tips_ = Vector(rank_tips); - } - - // builds select_tips_ - if (rank_flag && select_flag) { - std::vector select_tips{0}; - auto count = ONES_PER_TIP; - for (id_type i = 0; i < rank_tips_.size(); ++i) { - if (count < rank_tips_[i].L1) { - select_tips.push_back(i - 1); - count += ONES_PER_TIP; - } - } - select_tips.push_back(static_cast(rank_tips_.size() - 1)); - select_tips_ = Vector(select_tips); - } -} - -id_type BitVector::rank(id_type i) const { - auto& hint = rank_tips_[i / BITS_IN_R1]; - return hint.L1 + hint.L2[i / BITS_IN_R2 % R1_PER_R2] - + pop_count(bits_[i / 32] & ((1U << (i % 32)) - 1)); -} - -id_type BitVector::select(id_type i) const { - id_type left = 0, right = static_cast(rank_tips_.size()); - - if (!select_tips_.is_empty()) { - auto select_tip_id = static_cast(i / ONES_PER_TIP); - left = select_tips_[select_tip_id]; - right = select_tips_[select_tip_id + 1] + 1; - } - - while (left + 1 < right) { - const auto center = (left + right) / 2; - if (i < rank_tips_[center].L1) { - right = center; - } else { - left = center; - } - } - - i += 1; // for i+1 th - i -= 
rank_tips_[left].L1; - - uint32_t offset = 1; - for (; offset < R1_PER_R2; ++offset) { - if (i <= rank_tips_[left].L2[offset]) { - break; - } - } - i -= rank_tips_[left].L2[--offset]; - - auto ret = (left * BITS_IN_R1) + (offset * BITS_IN_R2); - auto bits = bits_[ret / 32]; - - { - auto _count = pop_count(bits % 65536); - if (_count < i) { - bits >>= 16; - ret += 16; - i -= _count; - } - } - { - auto _count = pop_count(bits % 256); - if (_count < i) { - bits >>= 8; - ret += 8; - i -= _count; - } - } - - ret += SELECT_TABLE[i][bits % 256]; - return ret - 1; -} - -size_t BitVector::size_in_bytes() const { - size_t ret = 0; - ret += bits_.size_in_bytes(); - ret += rank_tips_.size_in_bytes(); - ret += select_tips_.size_in_bytes(); - ret += sizeof(size_); - ret += sizeof(num_1s_); - return ret; -} - -void BitVector::write(std::ostream& os) const { - bits_.write(os); - rank_tips_.write(os); - select_tips_.write(os); - write_value(size_, os); - write_value(num_1s_, os); -} - -} //namespace - xcdat diff --git a/src/DacBc.cpp b/src/DacBc.cpp deleted file mode 100644 index f6a31f3..0000000 --- a/src/DacBc.cpp +++ /dev/null @@ -1,141 +0,0 @@ -#include - -#include "xcdat/DacBc.hpp" - -namespace xcdat { - -DacBc::DacBc(std::istream& is) { - for (size_t i = 0; i < sizeof(id_type); ++i) { - values_[i] = Vector(is); - } - for (size_t i = 0; i < sizeof(id_type) - 1; ++i) { - flags_[i] = BitVector(is); - } - leaf_flags_ = BitVector(is); - links_ = FitVector(is); - max_level_ = read_value(is); - num_free_nodes_ = read_value(is); -} - -DacBc::DacBc(const std::vector& bc, BitVectorBuilder& leaf_flags) { - if (bc.empty()) { - return; - } - - std::vector values[sizeof(id_type)]; - BitVectorBuilder flags[sizeof(id_type)]; - std::vector links; - - leaf_flags_ = BitVector(leaf_flags, true, false); - - values[0].reserve(bc.size() * 2); - flags[0].reserve(bc.size() * 2); - links.reserve(bc.size()); - - max_level_ = 0; - - auto append = [&](id_type value) { - uint8_t level = 0; - 
values[level].push_back(static_cast(value & 0xFF)); - flags[level].push_back(true); - value >>= 8; - while (value) { - ++level; - values[level].push_back(static_cast(value & 0xFF)); - flags[level].push_back(true); - value >>= 8; - } - flags[level].set_bit(flags[level].size() - 1, false); - max_level_ = std::max(max_level_, level); - }; - - auto append_leaf = [&](id_type value) { - links.push_back(value >> 8); - values[0].push_back(static_cast(value & 0xFF)); - flags[0].push_back(false); - }; - - for (id_type i = 0; i < bc.size(); ++i) { - if (leaf_flags_[i]) { - append_leaf(bc[i].base); - } else { - append(bc[i].base ^ i); - } - append(bc[i].check ^ i); - if (bc[i].check == i) { - ++num_free_nodes_; - } - } - - // release - for (uint8_t i = 0; i < max_level_; ++i) { - values_[i] = Vector(values[i]); - flags_[i] = BitVector(flags[i], true, false); - } - values_[max_level_] = Vector(values[max_level_]); - links_ = FitVector(links); -} - -size_t DacBc::size_in_bytes() const { - size_t ret = 0; - for (auto& values : values_) { - ret += values.size_in_bytes(); - } - for (auto& flags : flags_) { - ret += flags.size_in_bytes(); - } - ret += leaf_flags_.size_in_bytes(); - ret += links_.size_in_bytes(); - ret += sizeof(max_level_); - ret += sizeof(num_free_nodes_); - return ret; -} - -void DacBc::show_stat(std::ostream& os) const { - const auto total_size = size_in_bytes(); - os << "basic statistics of xcdat::DacBc" << std::endl; - show_size("\tnum links: ", links_.size(), os); - show_size("\tbytes per node:", double(total_size) / num_nodes(), os); - os << "member size statistics of xcdat::DacBc" << std::endl; - for (int i = 0; i <= max_level_; ++i) { - std::ostringstream oss; - oss << "\tvalues_L" << i << ":"; - show_size_ratio(oss.str().c_str(), values_[i].size_in_bytes(), total_size, os); - } - for (int i = 0; i < max_level_; ++i) { - std::ostringstream oss; - oss << "\tflags_L" << i << ": "; - show_size_ratio(oss.str().c_str(), flags_[i].size_in_bytes(), total_size, 
os); - } - show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os); - show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os); -} - -void DacBc::write(std::ostream& os) const { - for (auto& values : values_) { - values.write(os); - } - for (auto& flags : flags_) { - flags.write(os); - } - leaf_flags_.write(os); - links_.write(os); - write_value(max_level_, os); - write_value(num_free_nodes_, os); -} - -id_type DacBc::access_(id_type i) const { - uint8_t level = 0; - id_type value = values_[level][i]; - while (level < max_level_) { - if (!flags_[level][i]) { - break; - } - i = flags_[level].rank(i); - ++level; - value |= static_cast(values_[level][i]) << (level * 8); - } - return value; -} - -} //namespace - xcdat diff --git a/src/FastDacBc.cpp b/src/FastDacBc.cpp deleted file mode 100644 index 4077570..0000000 --- a/src/FastDacBc.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "xcdat/FastDacBc.hpp" - -namespace xcdat { - -FastDacBc::FastDacBc(std::istream& is) { - values_L1_ = Vector(is); - values_L2_ = Vector(is); - values_L3_ = Vector(is); -#ifdef XCDAT_X64 - values_L4_ = Vector(is); -#endif - for (size_t i = 0; i < LAYERS - 1; ++i) { - ranks_[i] = Vector(is); - } - leaf_flags_ = BitVector(is); - links_ = FitVector(is); - num_free_nodes_ = read_value(is); -} - -FastDacBc::FastDacBc(const std::vector& bc, - BitVectorBuilder& leaf_flags) { - if (bc.empty()) { - return; - } - - std::vector values_L1; - std::vector values_L2; - std::vector values_L3; -#ifdef XCDAT_X64 - std::vector values_L4; -#endif - std::vector ranks[LAYERS - 1]; - std::vector links; - leaf_flags_ = BitVector(leaf_flags, true, false); - - ranks[0].reserve((bc.size() * 2) / 128); - - auto append = [&](id_type value) { - if ((values_L1.size() % BLOCK_SIZE_L1) == 0) { - ranks[0].push_back(static_cast(values_L2.size())); - } - if ((value / BLOCK_SIZE_L1) == 0) { - values_L1.push_back(static_cast(0 | (value << 1))); - return; - } else { - auto pos = values_L2.size() - 
ranks[0].back(); - values_L1.push_back(static_cast(1 | (pos << 1))); - } - - if ((values_L2.size() % BLOCK_SIZE_L2) == 0) { - ranks[1].push_back(static_cast(values_L3.size())); - } - if ((value / BLOCK_SIZE_L2) == 0) { - values_L2.push_back(static_cast(0 | (value << 1))); - return; - } else { - auto pos = values_L3.size() - ranks[1].back(); - values_L2.push_back(static_cast(1 | (pos << 1))); - } - -#ifdef XCDAT_X64 - if ((values_L3.size() % BLOCK_SIZE_L3) == 0) { - ranks[1].push_back(static_cast(values_L4.size())); - } - if ((value / BLOCK_SIZE_L3) == 0) { - values_L3.push_back(static_cast(0 | (value << 1))); - return; - } else { - auto pos = values_L4.size() - ranks[1].back(); - values_L3.push_back(static_cast(1 | (pos << 1))); - } - values_L4.push_back(value); -#else - values_L3.push_back(value); -#endif - }; - - auto append_leaf = [&](id_type value) { - if ((values_L1.size() % BLOCK_SIZE_L1) == 0) { - ranks[0].push_back(static_cast(values_L2.size())); - } - values_L1.push_back(static_cast(value & 0xFF)); - links.push_back(value >> 8); - }; - - - for (id_type i = 0; i < bc.size(); ++i) { - if (leaf_flags_[i]) { - append_leaf(bc[i].base); - } else { - append(bc[i].base ^ i); - } - append(bc[i].check ^ i); - if (bc[i].check == i) { - ++num_free_nodes_; - } - } - - // release - values_L1_ = Vector(values_L1); - values_L2_ = Vector(values_L2); - values_L3_ = Vector(values_L3); -#ifdef XCDAT_X64 - values_L4_ = Vector(values_L4); -#endif - for (uint8_t j = 0; j < LAYERS - 1; ++j) { - ranks_[j] = Vector(ranks[j]); - } - links_ = FitVector(links); -} - -size_t FastDacBc::size_in_bytes() const { - size_t ret = 0; - ret += values_L1_.size_in_bytes(); - ret += values_L2_.size_in_bytes(); - ret += values_L3_.size_in_bytes(); -#ifdef XCDAT_X64 - ret += values_L4_.size_in_bytes(); -#endif - for (auto& ranks : ranks_) { - ret += ranks.size_in_bytes(); - } - ret += leaf_flags_.size_in_bytes(); - ret += links_.size_in_bytes(); - ret += sizeof(num_free_nodes_); - return ret; -} - 
-void FastDacBc::show_stat(std::ostream& os) const { - const auto total_size = size_in_bytes(); - os << "basic statistics of xcdat::FastDacBc" << std::endl; - show_size("\tnum links: ", links_.size(), os); - show_size("\tbytes per node:", double(total_size) / num_nodes(), os); - os << "member size statistics of xcdat::FastDacBc" << std::endl; - show_size_ratio("\tvalues_L1:", values_L1_.size_in_bytes(), total_size, os); - show_size_ratio("\tvalues_L2:", values_L2_.size_in_bytes(), total_size, os); - show_size_ratio("\tvalues_L3:", values_L3_.size_in_bytes(), total_size, os); -#ifdef XCDAT_X64 - show_size_ratio("\tvalues_L4:", values_L4_.size_in_bytes(), total_size, os); -#endif - show_size_ratio("\tranks_L1: ", ranks_[0].size_in_bytes(), total_size, os); - show_size_ratio("\tranks_L2: ", ranks_[1].size_in_bytes(), total_size, os); -#ifdef XCDAT_X64 - show_size_ratio("\tranks_L3: ", ranks_[2].size_in_bytes(), total_size, os); -#endif - show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os); - show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os); -} - -void FastDacBc::write(std::ostream& os) const { - values_L1_.write(os); - values_L2_.write(os); - values_L3_.write(os); -#ifdef XCDAT_X64 - values_L4_.write(os); -#endif - for (auto& ranks : ranks_) { - ranks.write(os); - } - leaf_flags_.write(os); - links_.write(os); - write_value(num_free_nodes_, os); -} - -id_type FastDacBc::access_(id_type i) const { - uint32_t value = values_L1_[i] >> 1; - if ((values_L1_[i] & 1U) == 0) { - return value; - } - i = ranks_[0][i / BLOCK_SIZE_L1] + value; - value = values_L2_[i] >> 1; - if ((values_L2_[i] & 1U) == 0) { - return value; - } - i = ranks_[1][i / BLOCK_SIZE_L2] + value; -#ifdef XCDAT_X64 - value = values_L3_[i] >> 1; - if ((values_L3_[i] & 1U) == 0) { - return value; - } - i = ranks_[2][i / BLOCK_SIZE_L3] + value; - return values_L4_[i]; -#else - return values_L3_[i]; -#endif -} - -} //namespace - xcdat diff --git a/src/FitVector.cpp 
b/src/FitVector.cpp deleted file mode 100644 index 8ea1d42..0000000 --- a/src/FitVector.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "xcdat/FitVector.hpp" - -namespace xcdat { - -FitVector::FitVector(std::istream& is) { - chunks_ = Vector(is); - size_= read_value(is); - width_ = read_value(is); - mask_ = read_value(is); -} - -FitVector::FitVector(const std::vector& values) { - if (values.empty()) { - return; - } - - width_ = 0; - auto max_value = *std::max_element(std::begin(values), std::end(values)); - do { - ++width_; - max_value >>= 1; - } while (max_value); - - size_ = values.size(); - mask_ = (1U << width_) - 1; - std::vector chunks(size_ * width_ / CHUNK_WIDTH + 1, 0); - - for (id_type i = 0; i < size_; ++i) { - const auto chunk_pos = static_cast(i * width_ / CHUNK_WIDTH); - const auto offset = static_cast(i * width_ % CHUNK_WIDTH); - - chunks[chunk_pos] &= ~(mask_ << offset); - chunks[chunk_pos] |= (values[i] & mask_) << offset; - - if (CHUNK_WIDTH < offset + width_) { - chunks[chunk_pos + 1] &= ~(mask_ >> (CHUNK_WIDTH - offset)); - chunks[chunk_pos + 1] |= (values[i] & mask_) >> (CHUNK_WIDTH - offset); - } - } - - chunks_ = Vector(chunks); -} - -size_t FitVector::size_in_bytes() const { - size_t ret = 0; - ret += chunks_.size_in_bytes(); - ret += sizeof(size_); - ret += sizeof(width_); - ret += sizeof(mask_); - return ret; -} - -void FitVector::write(std::ostream& os) const { - chunks_.write(os); - write_value(size_, os); - write_value(width_, os); - write_value(mask_, os); -} - -} //namespace - xcdat diff --git a/src/TrieBuilder.cpp b/src/TrieBuilder.cpp deleted file mode 100644 index c3999d2..0000000 --- a/src/TrieBuilder.cpp +++ /dev/null @@ -1,317 +0,0 @@ -#include - -#include "xcdat/TrieBuilder.hpp" - -namespace xcdat { - -TrieBuilder::TrieBuilder(const std::vector& keys, - id_type width_L1, bool bin_mode) - : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1), - bin_mode_(bin_mode) { - if (keys_.empty()) { - throw 
TrieBuilder::Exception("The input data is empty."); - } - if (ID_MAX < keys_.size()) { - throw TrieBuilder::Exception("Key ID range error."); - } - - { - size_t init_capa = 1; - while (init_capa < keys_.size()) { - init_capa <<= 1; - } - - bc_.reserve(init_capa); - leaf_flags_.reserve(init_capa); - term_flags_.reserve(init_capa); - used_flags_.reserve(init_capa); - heads_.reserve(init_capa >> width_L1_); - } - - alphabet_.reserve(256); - edges_.reserve(256); - suffixes_.reserve(keys_.size()); - - // initialize an empty list. - for (id_type i = 0; i < 256; ++i) { - bc_.push_back({i + 1, i - 1}); - leaf_flags_.push_back(false); - term_flags_.push_back(false); - used_flags_.push_back(false); - } - bc_[255].base = 0; - bc_[0].check = 255; - - for (id_type i = 0; i < 256; i += block_size_) { - heads_.push_back(i); - } - - use_(0); - bc_[0].check = TABOO_ID; - used_flags_[TABOO_ID] = true; - heads_[TABOO_ID >> width_L1_] = bc_[TABOO_ID].base; - - build_table_(); - build_bc_(0, keys_.size(), 0, 0); - build_tail_(); -} - -void TrieBuilder::build_table_() { - using tb_type = std::pair; - tb_type table_builder[256]; - - for (uint32_t i = 0; i < 256; ++i) { - table_builder[i] = {static_cast(i), 0}; - } - - max_length_ = 0; - for (auto& key : keys_) { - for (char c : key) { - ++table_builder[static_cast(c)].second; - } - max_length_ = std::max(max_length_, key.length()); - } - - if (table_builder[0].second != 0) { // including '\0' - bin_mode_ = true; - } - - for (const auto& item : table_builder) { - if (item.second != 0) { - alphabet_.push_back(item.first); - } - } - alphabet_.shrink_to_fit(); - - std::sort(std::begin(table_builder), std::end(table_builder), - [](const tb_type& lhs, const tb_type& rhs) { - return lhs.second > rhs.second; - }); - - for (uint32_t i = 0; i < 256; ++i) { - table_[table_builder[i].first] = static_cast(i); - } - - for (uint32_t i = 0; i < 256; ++i) { - table_[table_[i] + 256] = static_cast(i); - } -} - -void TrieBuilder::build_bc_(size_t begin, 
size_t end, size_t depth, - id_type node_id) { - if (keys_[begin].length() == depth) { - term_flags_.set_bit(node_id, true); - if (++begin == end) { // without link? - bc_[node_id].base = 0; // with an empty suffix - leaf_flags_.set_bit(node_id, true); - return; - } - } else if (begin + 1 == end) { // leaf? - term_flags_.set_bit(node_id, true); - leaf_flags_.set_bit(node_id, true); - auto& key = keys_[begin]; - suffixes_.push_back( - {{key.data() + depth, key.length() - depth}, node_id} - ); - return; - } - - { // fetching edges - edges_.clear(); - auto label = static_cast(keys_[begin][depth]); - for (auto str_id = begin + 1; str_id < end; ++str_id) { - const auto _label = static_cast(keys_[str_id][depth]); - if (label != _label) { - if (_label < label) { - throw TrieBuilder::Exception( - "The input data is not in lexicographical order." - ); - } - edges_.push_back(label); - label = _label; - } - } - edges_.push_back(label); - } - - const auto base = find_base_(node_id >> width_L1_); - if (bc_.size() <= base) { - expand_(); - } - - // defining new edges - bc_[node_id].base = base; - for (const auto label : edges_) { - const auto child_id = base ^ table_[label]; - use_(child_id); - bc_[child_id].check = node_id; - } - - // following the children - auto _begin = begin; - auto label = static_cast(keys_[begin][depth]); - for (auto _end = begin + 1; _end < end; ++_end) { - const auto _label = static_cast(keys_[_end][depth]); - if (label != _label) { - build_bc_(_begin, _end, depth + 1, base ^ table_[label]); - label = _label; - _begin = _end; - } - } - build_bc_(_begin, end, depth + 1, base ^ table_[label]); -} - -// The algorithm is inspired by marisa-trie -void TrieBuilder::build_tail_() { - std::sort(std::begin(suffixes_), std::end(suffixes_), - [](const Suffix& lhs, const Suffix& rhs) { - return std::lexicographical_compare( - std::rbegin(lhs), std::rend(lhs), - std::rbegin(rhs), std::rend(rhs)); - }); - - // For empty suffixes - tail_.emplace_back('\0'); - if 
(bin_mode_) { - boundary_flags_.push_back(false); - } - - const Suffix dummy = {{nullptr, 0}, 0}; - const Suffix* prev_suf = &dummy; - - for (size_t i = suffixes_.size(); i > 0; --i) { - const auto& cur_suf = suffixes_[i - 1]; - if (cur_suf.length() == 0) { - throw TrieBuilder::Exception("A suffix is empty."); - } - - size_t match = 0; - while ((match < cur_suf.length()) && (match < prev_suf->length()) - && ((*prev_suf)[match] == cur_suf[match])) { - ++match; - } - - if ((match == cur_suf.length()) && (prev_suf->length() != 0)) { // sharing - bc_[cur_suf.node_id].base = static_cast( - bc_[prev_suf->node_id].base + (prev_suf->length() - match) - ); - } else { // append - bc_[cur_suf.node_id].base = static_cast(tail_.size()); - std::copy(std::begin(cur_suf.str), std::end(cur_suf.str), - std::back_inserter(tail_)); - if (bin_mode_) { - for (size_t j = 1; j < cur_suf.length(); ++j) { - boundary_flags_.push_back(false); - } - boundary_flags_.push_back(true); - } else { - tail_.emplace_back('\0'); - } - if (ID_MAX < tail_.size()) { - throw TrieBuilder::Exception("TAIL address range error."); - } - } - prev_suf = &cur_suf; - } -} - -void TrieBuilder::expand_() { - if (ID_MAX < bc_.size() + 256) { - throw TrieBuilder::Exception("Node ID range error."); - } - - const auto old_size = static_cast(bc_.size()); - const auto new_size = old_size + 256; - - for (auto i = old_size; i < new_size; ++i) { - bc_.push_back({i + 1, i - 1}); - leaf_flags_.push_back(false); - term_flags_.push_back(false); - used_flags_.push_back(false); - } - - { - const auto last = bc_[TABOO_ID].check; - bc_[old_size].check = last; - bc_[last].base = old_size; - bc_[new_size - 1].base = TABOO_ID; - bc_[TABOO_ID].check = new_size - 1; - } - - for (auto i = old_size; i < new_size; i += block_size_) { - heads_.push_back(i); - } - - const auto block_id = old_size / 256; - if (FREE_BLOCKS <= block_id) { - close_block_(block_id - FREE_BLOCKS); - } -} - -void TrieBuilder::use_(id_type node_id) { - 
used_flags_[node_id] = true; - - const auto next = bc_[node_id].base; - const auto prev = bc_[node_id].check; - bc_[prev].base = next; - bc_[next].check = prev; - - const auto block_id = node_id >> width_L1_; - if (heads_[block_id] == node_id) { - heads_[block_id] = (block_id != next >> width_L1_) ? TABOO_ID : next; - } -} - -void TrieBuilder::close_block_(id_type block_id) { - const auto begin = block_id * 256; - const auto end = begin + 256; - - for (auto i = begin; i < end; ++i) { - if (!used_flags_[i]) { - use_(i); - bc_[i].base = i; - bc_[i].check = i; - used_flags_[i] = false; - } - } - - for (auto i = begin; i < end; i += block_size_) { - heads_[i >> width_L1_] = TABOO_ID; - } -} - -id_type TrieBuilder::find_base_(id_type block_id) const { - if (bc_[TABOO_ID].base == TABOO_ID) { // Full? - return static_cast(bc_.size()) ^ table_[edges_[0]]; - } - - // search in the same block - for (auto i = heads_[block_id]; - i != TABOO_ID && i >> width_L1_ == block_id; - i = bc_[i].base) { - const auto base = i ^ table_[edges_[0]]; - if (is_target_(base)) { - return base; // base / block_size_ == block_id - } - } - - for (auto i = bc_[TABOO_ID].base; i != TABOO_ID; i = bc_[i].base) { - const auto base = i ^ table_[edges_[0]]; - if (is_target_(base)) { - return base; // base / block_size_ != block_id - } - } - - return static_cast(bc_.size()) ^ table_[edges_[0]]; -} - -bool TrieBuilder::is_target_(id_type base) const { - for (const auto label : edges_) { - if (used_flags_[base ^ table_[label]]) { - return false; - } - } - return true; -} - -} //namespace - xcdat diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt deleted file mode 100644 index 76644fc..0000000 --- a/test/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ - -file(GLOB TEST_SOURCES *_test.cpp) -foreach(TEST_SOURCE ${TEST_SOURCES}) - get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE) - add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE}) - target_link_libraries(${TEST_SOURCE_NAME} xcdat) - 
add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME}) -endforeach() diff --git a/test/tries_test.cpp b/test/tries_test.cpp deleted file mode 100644 index 4de4cb5..0000000 --- a/test/tries_test.cpp +++ /dev/null @@ -1,265 +0,0 @@ -#undef NDEBUG - -#include -#include -#include -#include -#include - -#include "xcdat.hpp" - -using namespace xcdat; - -namespace { - -constexpr size_t NUM_KEYS = 1U << 10; -constexpr size_t MAX_LENGTH = 20; - -void to_set(std::vector& keys) { - std::sort(std::begin(keys), std::end(keys)); - keys.erase(std::unique(std::begin(keys), std::end(keys)), std::end(keys)); -} - -std::string make_key() { - std::random_device rnd; - - std::string key; - size_t length = (rnd() % MAX_LENGTH) + 1; - for (size_t j = 0; j < length; ++j) { - key += 'A' + (rnd() % 26); - } - - return key; -} - -std::vector make_keys() { - std::vector keys; - keys.reserve(NUM_KEYS); - - for (size_t i = 0; i < NUM_KEYS; ++i) { - keys.push_back(make_key()); - } - - to_set(keys); - return keys; -} - -std::vector make_other_keys(const std::vector& keys) { - std::vector others; - - for (size_t i = 0; i < NUM_KEYS; ++i) { - auto string = make_key(); - if (std::find(std::begin(keys), std::end(keys), string) == std::end(keys)) { - others.push_back(string); - } - } - - to_set(others); - return others; -} - -template -Trie test_build(const std::vector& keys, - bool bin_mode) { - std::cerr << "Construction -> build()\n"; - - auto trie = TrieBuilder::build(keys, bin_mode); - assert(trie.num_keys() == keys.size()); - - return trie; -} - -template -void test_basic_operations(const Trie& trie, - const std::vector& keys, - const std::vector& others) { - std::cerr << "Basic operations -> lookup() and access()\n"; - - for (auto& key : keys) { - auto id = trie.lookup(key); - assert(id != Trie::NOT_FOUND); - - auto dec = trie.access(id); - assert(dec == key); - } - - for (auto& other : others) { - const auto id = trie.lookup(other); - assert(id == Trie::NOT_FOUND); - } -} - -template -void 
test_prefix_operations(const Trie& trie, - const std::vector& keys, - const std::vector& others) { - std::cerr << "Prefix operations -> PrefixIterator\n"; - - for (auto& key : keys) { - size_t num_results = 0; - - auto it = trie.make_prefix_iterator(key); - while (it.next()) { - auto id = it.id(); - auto dec = it.key(); - - assert(dec.length() <= key.length()); - - auto dec2 = trie.access(id); - assert(dec == dec2); - - ++num_results; - } - - assert(1 <= num_results); - assert(num_results <= key.length()); - } - - for (auto& other : others) { - size_t num_results = 0; - - auto it = trie.make_prefix_iterator(other); - while (it.next()) { - auto id = it.id(); - auto dec = it.key(); - - assert(dec.length() < other.length()); - - auto dec2 = trie.access(id); - assert(dec == dec2); - - ++num_results; - } - - assert(num_results < other.length()); - } -} - -template -void test_predictive_operations(const Trie& trie, - const std::vector& keys, - const std::vector& others) { - std::cerr << "Predictive operations -> PredictiveIterator\n"; - - for (auto& key : keys) { - size_t num_results = 0; - - auto it = trie.make_predictive_iterator(key); - while (it.next()) { - auto id = it.id(); - auto dec = it.key(); - - assert(key.length() <= dec.length()); - - auto dec2 = trie.access(id); - assert(dec == dec2); - - ++num_results; - } - - assert(1 <= num_results); - } - - for (auto& other : others) { - auto it = trie.make_predictive_iterator(other); - while (it.next()) { - auto id = it.id(); - auto dec = it.key(); - - assert(other.length() < dec.length()); - - auto dec2 = trie.access(id); - - assert(dec == dec2); - } - } - - { // all enumeration - size_t num_results = 0; - - auto it = trie.make_predictive_iterator(std::string_view{}); - while (it.next()) { - auto id = it.id(); - auto dec = it.key(); - - assert(0 <= dec.length()); - - auto dec2 = trie.access(id); - assert(dec == dec2); - - ++num_results; - } - - assert(num_results == trie.num_keys()); - } -} - -template -void 
test_io(const Trie& trie) { - std::cerr << "File I/O -> write() and read()\n"; - - const char* file_name = "index"; - { - std::ofstream ofs{file_name}; - trie.write(ofs); - } - { - std::ifstream ifs{file_name}; - auto size = static_cast(ifs.seekg(0, std::ios::end).tellg()); - assert(size == trie.size_in_bytes()); - } - - Trie _trie; - { - std::ifstream ifs{file_name}; - _trie = Trie(ifs); - } - - assert(trie.num_keys() == _trie.num_keys()); - assert(trie.bin_mode() == _trie.bin_mode()); - assert(trie.alphabet_size() == _trie.alphabet_size()); - assert(trie.num_nodes() == _trie.num_nodes()); - assert(trie.num_used_nodes() == _trie.num_used_nodes()); - assert(trie.num_free_nodes() == _trie.num_free_nodes()); - assert(trie.size_in_bytes() == _trie.size_in_bytes()); -} - -template -void test_trie(const std::vector& strings, - const std::vector& others) { - for (int i = 0; i < 2; ++i) { - std::cerr << "** " << (i % 2 ? "Binary" : "Text") << " Mode **\n"; - std::cerr << "Testing xcdat::Trie<" << (Fast ? 
"true" : "false") << ">\n"; - - auto trie = test_build(strings, i % 2 != 0); - - test_basic_operations(trie, strings, others); - test_prefix_operations(trie, strings, others); - test_predictive_operations(trie, strings, others); - test_io(trie); - - std::cerr << "--> No problem (☝ ՞ਊ ՞)☝" << std::endl << std::endl; - } -} - -} // namespace - -int main() { - auto keys_buffer = make_keys(); - auto others_buffer = make_other_keys(keys_buffer); - - std::vector keys(keys_buffer.size()); - for (size_t i = 0; i < keys.size(); ++i) { - keys[i] = std::string_view{keys_buffer[i]}; - } - - std::vector others(others_buffer.size()); - for (size_t i = 0; i < others.size(); ++i) { - others[i] = std::string_view{others_buffer[i]}; - } - - test_trie(keys, others); - test_trie(keys, others); - - return 0; -} diff --git a/test/vectors_test.cpp b/test/vectors_test.cpp deleted file mode 100644 index d6ad68a..0000000 --- a/test/vectors_test.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#undef NDEBUG - -#include -#include -#include - -#include "xcdat/BitVector.hpp" -#include "xcdat/FitVector.hpp" - -using namespace xcdat; - -namespace { - -constexpr size_t SIZE = 1U << 10; - -void test_vector() { - std::vector orig_vec(SIZE); - { - std::random_device rnd; - for (size_t i = 0; i < SIZE; ++i) { - orig_vec[i] = rnd(); - } - } - - auto copied_vec = orig_vec; // copy - Vector vec(copied_vec); - - assert(copied_vec.empty()); - - for (size_t i = 0; i < SIZE; ++i) { - assert(orig_vec[i] == vec[i]); - } - - Vector swapped_vec; - vec.swap(swapped_vec); - - assert(vec.is_empty()); - - for (size_t i = 0; i < SIZE; ++i) { - assert(orig_vec[i] == swapped_vec[i]); - } -} - -void test_bit_vector() { - std::vector orig_bit_vector; - { - std::random_device rnd; - for (size_t i = 0; i < SIZE; ++i) { - orig_bit_vector.push_back(rnd() % 2 == 0); - } - } - - BitVector bit_vector; - { - BitVectorBuilder builder; - for (size_t i = 0; i < SIZE; ++i) { - builder.push_back(orig_bit_vector[i]); - } - bit_vector = 
BitVector(builder, true, true); - } - - assert(bit_vector.size() == SIZE); - - id_type sum = 0; - for (id_type i = 0; i < SIZE; ++i) { - assert(bit_vector[i] == orig_bit_vector[i]); - if (bit_vector[i]) { - assert(sum == bit_vector.rank(i)); - assert(i == bit_vector.select(sum)); - ++sum; - } - } - - assert(bit_vector.num_1s() == sum); - assert(bit_vector.num_0s() == SIZE - sum); -} - -void test_small_vector() { - std::vector orig_vector; - { - std::random_device rnd; - for (size_t i = 0; i < SIZE; ++i) { - orig_vector.push_back(rnd() & UINT16_MAX); - } - } - - FitVector small_vector(orig_vector); - assert(orig_vector.size() == small_vector.size()); - - for (size_t i = 0; i < SIZE; ++i) { - assert(orig_vector[i] == small_vector[i]); - } -} - -} // namespace - -int main() { - test_vector(); - test_bit_vector(); - test_small_vector(); - return 0; -} diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt deleted file mode 100644 index 53e6b89..0000000 --- a/tool/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ - -add_executable(xcdat-exe xcdat.cpp) -set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat) -target_link_libraries(xcdat-exe xcdat) - -install(TARGETS xcdat-exe RUNTIME DESTINATION bin) \ No newline at end of file diff --git a/tool/xcdat.cpp b/tool/xcdat.cpp deleted file mode 100644 index 95dbcff..0000000 --- a/tool/xcdat.cpp +++ /dev/null @@ -1,322 +0,0 @@ -#include -#include -#include - -#include "xcdat.hpp" - -using namespace xcdat; - -namespace { - -constexpr int RUNS = 10; - -class StopWatch { -public: - using hrc = std::chrono::high_resolution_clock; - - StopWatch() : tp_{hrc::now()} {} - - double sec() const { - const auto tp = hrc::now() - tp_; - return std::chrono::duration(tp).count(); - } - double milli_sec() const { - const auto tp = hrc::now() - tp_; - return std::chrono::duration(tp).count(); - } - double micro_sec() const { - const auto tp = hrc::now() - tp_; - return std::chrono::duration(tp).count(); - } - -private: - hrc::time_point tp_; 
-}; - -size_t read_keys(const char* file_name, std::vector& keys) { - std::ifstream ifs{file_name}; - if (!ifs) { - return 0; - } - - size_t size = 0; - for (std::string line; std::getline(ifs, line);) { - keys.push_back(line); - size += line.length() + 1; // with terminator - } - - return size; -} - -std::vector -extract_views(const std::vector& keys) { - std::vector views(keys.size()); - for (size_t i = 0; i < keys.size(); ++i) { - views[i] = keys[i]; - } - return views; -}; - -void show_usage(std::ostream& os) { - os << "xcdat build \n"; - os << "\t\t1: DACs, 2: FDACs\n"; - os << "\t \tInput file name of a set of keys (must be sorted)\n"; - os << "\t\tOutput file name of the dictionary (optional)\n"; - os << "\t \tIf omitted, .dacs or .fdacs is output\n"; - os << "xcdat query \n"; - os << "\t \t1: DACs, 2: FDACs\n"; - os << "\t \tInput file name of the dictionary\n"; - os << "\t\tLimit of #results (optional, default=10)\n"; - os << "xcdat bench \n"; - os << "\t\t1: DACs, 2: FDACs\n"; - os << "\t\tInput file name of the dictionary\n"; - os << "\t \tInput file name of keys for benchmark\n"; - os.flush(); -} - -template -int build(std::vector& args) { - if (args.size() != 3 && args.size() != 4) { - show_usage(std::cerr); - return 1; - } - - std::vector keys_buffer; - auto raw_size = read_keys(args[2].c_str(), keys_buffer); - - if (raw_size == 0) { - std::cerr << "open error : " << args[2] << std::endl; - return 1; - } - - auto keys = extract_views(keys_buffer); - - Trie trie; - try { - StopWatch sw; - trie = TrieBuilder::build(keys); - std::cout << "constr. time:\t" << sw.sec() << " sec" << std::endl; - } catch (const xcdat::TrieBuilder::Exception& ex) { - std::cerr << ex.what() << std::endl; - return 1; - } - - std::cout << "cmpr. 
ratio:\t" - << static_cast(trie.size_in_bytes()) / raw_size - << " over the raw size" << std::endl; - - std::cout << std::endl; - trie.show_stat(std::cout); - std::cout << std::endl; - - std::string out_name; - if (args.size() == 4) { - out_name = args[3]; - } else { - out_name = args[2] + (Fast ? ".fdac" : ".dac"); - } - - std::ofstream ofs{out_name}; - if (!ofs) { - std::cerr << "open error : " << out_name << std::endl; - return 1; - } - trie.write(ofs); - - std::cout << "output -> " << out_name << std::endl; - - return 0; -} - -template -int query(std::vector& args) { - if (args.size() != 3 && args.size() != 4) { - show_usage(std::cerr); - return 1; - } - - Trie trie; - { - std::ifstream ifs(args[2]); - if (!ifs) { - std::cerr << "open error : " << args[2] << std::endl; - return 1; - } - trie = Trie(ifs); - } - - size_t limit = 10; - if (args.size() == 4) { - limit = std::stoull(args.back()); - } - - std::string query; - - while (true){ - std::cout << "> " << std::flush; - std::getline(std::cin, query); - if (query.empty()){ - break; - } - - std::cout << "Lookup" << std::endl; - auto id = trie.lookup(query); - if (id == Trie::NOT_FOUND) { - std::cout << "not found" << std::endl; - } else { - std::cout << id << '\t' << query << std::endl; - } - - std::cout << "Common Prefix Lookup" << std::endl; - { - size_t N = 0; - auto it = trie.make_prefix_iterator(query); - while (N < limit && it.next()) { - std::cout << it.id() << '\t' << it.key() << std::endl; - ++N; - } - - size_t M = 0; - while (it.next()) { - ++M; - } - - if (M != 0) { - std::cout << "and more..." << std::endl; - } - std::cout << N + M << " found" << std::endl; - } - - std::cout << "Predictive Lookup" << std::endl; - { - size_t N = 0; - auto it = trie.make_predictive_iterator(query); - while (N < limit && it.next()) { - std::cout << it.id() << '\t' << it.key() << std::endl; - ++N; - } - - size_t M = 0; - while (it.next()) { - ++M; - } - - if (M != 0) { - std::cout << "and more..." 
<< std::endl; - } - std::cout << N + M << " found" << std::endl; - } - } - - return 0; -} - -template -int bench(std::vector& args) { - if (args.size() != 4) { - show_usage(std::cerr); - return 1; - } - - Trie trie; - { - std::ifstream ifs(args[2]); - if (!ifs) { - std::cerr << "open error : " << args[2] << std::endl; - return 1; - } - trie = Trie(ifs); - } - - std::vector keys_buffer; - if (read_keys(args[3].c_str(), keys_buffer) == 0) { - std::cerr << "open error : " << args[3] << std::endl; - return 1; - } - - auto keys = extract_views(keys_buffer); - std::vector ids(keys.size()); - - std::cout << "Warm up" << std::endl; - - for (size_t i = 0; i < keys.size(); ++i) { - ids[i] = trie.lookup(keys[i]); - if (ids[i] == Trie::NOT_FOUND) { - std::cerr << "A non-registered key is included, " - << keys_buffer[i] << std::endl; - return 1; - } - } - - { - std::cout << "Lookup benchmark on " << RUNS << " runs" << std::endl; - - StopWatch sw; - for (uint32_t r = 0; r < RUNS; ++r) { - for (size_t i = 0; i < keys.size(); ++i) { - if (trie.lookup(keys[i]) != ids[i]) { - std::cerr << "Critical lookup error ʅ( ՞ਊ՞)ʃ" << std::endl; - return 1; - } - } - } - - std::cout << sw.micro_sec() / RUNS / keys.size() - << " us per str" << std::endl; - } - - { - std::cout << "Access benchmark on " << RUNS << " runs" << std::endl; - - StopWatch sw; - for (uint32_t r = 0; r < RUNS; ++r) { - for (auto id : ids) { - auto dec = trie.access(id); - if (dec.empty()) { - std::cerr << "Critical access error ʅ( ՞ਊ՞)ʃ" << std::endl; - return 1; - } - } - } - - std::cout << sw.micro_sec() / RUNS / ids.size() - << " us per ID" << std::endl; - } - - return 0; -} - -} // namespace - -int main(int argc, const char* argv[]) { - if (argc < 3) { - show_usage(std::cerr); - return 1; - } - - std::vector args; - for (int i = 1; i < argc; ++i) { - args.emplace_back(std::string{argv[i]}); - } - - bool is_fast; - if (args[1][0] == '1') { - is_fast = false; - } else if (args[1][0] == '2') { - is_fast = true; - } else 
{ - show_usage(std::cerr); - return 1; - } - - if (args[0] == "build") { - return is_fast ? build(args) : build(args); - } else if (args[0] == "query") { - return is_fast ? query(args) : query(args); - } else if (args[0] == "bench") { - return is_fast ? bench(args) : bench(args); - } - - show_usage(std::cerr); - return 1; -} diff --git a/xcdat_config.hpp.in b/xcdat_config.hpp.in deleted file mode 100644 index f084264..0000000 --- a/xcdat_config.hpp.in +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef XCDAT_CONFIG_HPP -#define XCDAT_CONFIG_HPP - -#cmakedefine XCDAT_X64 -#cmakedefine XCDAT_USE_POPCNT - -#endif // XCDAT_CONFIG_HPP \ No newline at end of file