rm old files and add bit_vector
This commit is contained in:
parent
9de3d4348a
commit
96e039bda7
113
.clang-format
Normal file
113
.clang-format
Normal file
|
@ -0,0 +1,113 @@
|
|||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -2
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
||||
AllowShortIfStatementsOnASingleLine: true
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 120
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '^<ext/.*\.h>'
|
||||
Priority: 2
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IndentCaseLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: false
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Latest
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
...
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
cmake_minimum_required(VERSION 3.1)
|
||||
project(XCDAT)
|
||||
cmake_minimum_required(VERSION 3.0)
|
||||
project(xcdat VERSION 1.0.0 LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
|
||||
|
@ -7,55 +7,30 @@ if (NOT CMAKE_BUILD_TYPE)
|
|||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif ()
|
||||
|
||||
configure_file(
|
||||
${XCDAT_SOURCE_DIR}/xcdat_config.hpp.in
|
||||
${XCDAT_SOURCE_DIR}/include/xcdat/xcdat_config.hpp
|
||||
)
|
||||
|
||||
message(STATUS "XCDAT_SOURCE_DIR is ${XCDAT_SOURCE_DIR}")
|
||||
|
||||
option(XCDAT_X64
|
||||
"Use 64-bit integers for node representation."
|
||||
OFF)
|
||||
|
||||
option(XCDAT_USE_POPCNT
|
||||
"Use popcount intrinsic. Available on x86-64 since SSE4.2."
|
||||
OFF)
|
||||
|
||||
if (XCDAT_USE_POPCNT)
|
||||
if (UNIX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
|
||||
endif ()
|
||||
if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
|
||||
set(CMAKE_COMPILER_IS_CLANGXX 1)
|
||||
endif ()
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set(CMAKE_COMPILER_IS_GNUCXX 1)
|
||||
endif ()
|
||||
|
||||
# C++17 compiler check
|
||||
if ((CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 7.0) OR (CMAKE_COMPILER_IS_CLANGXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.0))
|
||||
message(FATAL_ERROR "Your C++ compiler does not support C++17. Please install g++ 7.0 (or greater) or clang 4.0 (or greater)")
|
||||
else ()
|
||||
message(STATUS "Compiler is recent enough to support C++17.")
|
||||
endif ()
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1z -pthread -Wall")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -O3")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer -O0 -g -DDEBUG")
|
||||
|
||||
message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
|
||||
message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}")
|
||||
message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}")
|
||||
message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
message(STATUS "XCDAT_X64 is ${XCDAT_X64}")
|
||||
message(STATUS "XCDAT_USE_POPCNT is ${XCDAT_USE_POPCNT}")
|
||||
|
||||
file(GLOB HEADER_FILES include/xcdat/*.hpp)
|
||||
file(GLOB SOURCE_FILES src/*.cpp)
|
||||
|
||||
include_directories(include)
|
||||
add_library(xcdat STATIC ${HEADER_FILES} ${SOURCE_FILES})
|
||||
|
||||
add_subdirectory(tool)
|
||||
add_subdirectory(sample)
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
|
||||
install(FILES include/xcdat.hpp DESTINATION include)
|
||||
install(FILES ${HEADER_FILES} DESTINATION include/xcdat)
|
||||
|
||||
install(TARGETS xcdat
|
||||
EXPORT xcdat-targets
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib
|
||||
RUNTIME DESTINATION bin)
|
||||
|
||||
install(EXPORT xcdat-targets
|
||||
FILE xcdat-config.cmake
|
||||
DESTINATION lib/cmake/xcdat)
|
||||
|
|
308
docs/document.md
308
docs/document.md
|
@ -1,308 +0,0 @@
|
|||
% Xcdat: XOR-compressed double-array trie
|
||||
% Shunsuke Kanda
|
||||
% 2017
|
||||
|
||||
## What is Xcdat?
|
||||
|
||||
Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie.
|
||||
|
||||
The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article.
|
||||
|
||||
> Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [[doi](https://doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://sites.google.com/site/shnskknd/KAIS2016.pdf)]
|
||||
|
||||
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, thanks to the double-array structure, its lookup speed is relatively fast among compressed data structures.
|
||||
|
||||
Xcdat is available at the [GitHub repository](https://github.com/kampersanda/xcdat).
|
||||
|
||||
## Features
|
||||
|
||||
- **Compressed Data Structure**: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (but, depending on datasets). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space.
|
||||
- **Two Compression Approaches**: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions.
|
||||
- **64-bit Version**: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining `XCDAT_X64`; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal.
|
||||
- **NULL Character**: The dictionary can be constructed from keys including the NULL character by setting the second parameter of `xcdat::TrieBuilder::build()` to `true`.
|
||||
- **Dictionary Encoding**: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing, such as described in (Martínez-Prieto et al., 2016).
|
||||
- **Fast Operations**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
|
||||
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
|
||||
|
||||
## Build Instructions
|
||||
|
||||
You can download and compile Xcdat using the following commands.
|
||||
|
||||
```
|
||||
$ git clone https://github.com/kampersanda/xcdat.git
|
||||
$ cd xcdat
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ..
|
||||
$ make
|
||||
$ make install
|
||||
```
|
||||
|
||||
If you want to use a 64-bit setting, please add `-DXCDAT_X64=ON` to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding `-DXCDAT_USE_POPCNT=ON` for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library supports only UNIX-compatible operating systems.
|
||||
|
||||
|
||||
## Command Line Tools
|
||||
|
||||
`xcdat` is a general-purpose command line tool to provide three modes as follows.
|
||||
|
||||
```
|
||||
$ xcdat
|
||||
xcdat build <type> <key> <dict>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<key> Input file name of a set of keys (must be sorted)
|
||||
<dict> Output file name of the dictionary (optional)
|
||||
If omitted, <key>.dacs or <key>.fdacs is output
|
||||
xcdat query <type> <dict> <limit>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<dict> Input file name of the dictionary
|
||||
<limit> Limit of #results (optional, default=10)
|
||||
xcdat bench <type> <dict> <key>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<dict> Input file name of the dictionary
|
||||
<key> Input file name of keys for benchmark
|
||||
```
|
||||
|
||||
### Example 1: Construction
|
||||
|
||||
Command `xcdat build [params...]` builds Xcdat dictionaries from a given dataset and saves it to a file, as follows.
|
||||
|
||||
```
|
||||
$ xcdat build 1 jawiki-all-titles
|
||||
constr. time: 1.58574 sec
|
||||
cmpr. ratio: 0.524287 over the raw size
|
||||
|
||||
basic statistics of xcdat::Trie<false>
|
||||
num keys: 1738995
|
||||
alphabet size: 189
|
||||
num nodes: 4042496
|
||||
num used nodes: 4034357
|
||||
num free nodes: 8139
|
||||
size in bytes: 20546967
|
||||
member size statistics of xcdat::Trie<false>
|
||||
bc: 13879098 0.675482
|
||||
terminal_flags: 708448 0.0344794
|
||||
tail: 5958655 0.290002
|
||||
boundary_flags: 40 1.94676e-06
|
||||
basic statistics of xcdat::DacBc
|
||||
num links: 1499605
|
||||
bytes per node: 3.4333
|
||||
member size statistics of xcdat::DacBc
|
||||
values_L0: 8085000 0.582531
|
||||
values_L1: 746760 0.0538046
|
||||
values_L2: 22581 0.00162698
|
||||
flags_L0: 1389660 0.100126
|
||||
flags_L1: 128400 0.00925132
|
||||
leaves: 694856 0.0500649
|
||||
links: 2811784 0.202591
|
||||
|
||||
output -> jawiki-all-titles.dac
|
||||
```
|
||||
|
||||
### Example 2: Query Processing
|
||||
|
||||
Command `xcdat query [params...]` loads a dictionary file and tests lookup operations, as follows.
|
||||
|
||||
```
|
||||
$ xcdat query 1 jawiki-all-titles.dac
|
||||
> NEW_GAME!
|
||||
Lookup
|
||||
125989 NEW_GAME!
|
||||
Common Prefix Lookup
|
||||
28 N
|
||||
124185 NE
|
||||
125428 NEW
|
||||
125988 NEW_GAME
|
||||
125989 NEW_GAME!
|
||||
5 found
|
||||
Predictive Lookup
|
||||
125989 NEW_GAME!
|
||||
126003 NEW_GAME!!
|
||||
126059 NEW_GAME!_-THE_CHALLENGE_STAGE!-
|
||||
3 found
|
||||
```
|
||||
|
||||
### Example 3: Benchmark Test
|
||||
|
||||
Command `xcdat bench [params...]` tests time performances of a given dictionary, as follows.
|
||||
|
||||
```
|
||||
$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd
|
||||
Warm up
|
||||
Lookup benchmark on 10 runs
|
||||
1.5065 us per str
|
||||
Access benchmark on 10 runs
|
||||
1.81289 us per ID
|
||||
```
|
||||
|
||||
## Sample Usage
|
||||
|
||||
The following code shows an easy routine sample.
|
||||
|
||||
```cpp
|
||||
#include <iostream>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
std::vector<std::string> keys_buf = {
|
||||
"Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin",
|
||||
"Hazuki", "Umiko", "Nene", "Nenecchi"
|
||||
};
|
||||
|
||||
// Convert to the input format
|
||||
std::vector<std::string_view> keys(keys_buf.size());
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
keys[i] = std::string_view{keys_buf[i]};
|
||||
}
|
||||
|
||||
// Input data must be sorted.
|
||||
std::sort(std::begin(keys), std::end(keys));
|
||||
|
||||
// Dictionary class
|
||||
using Trie = xcdat::Trie<true>;
|
||||
|
||||
try {
|
||||
// Builds a dictionary from the keys
|
||||
Trie trie = xcdat::TrieBuilder::build<true>(keys); // move
|
||||
|
||||
// Writes the dictionary to a file.
|
||||
std::ofstream ofs{"sample.bin"};
|
||||
trie.write(ofs);
|
||||
} catch (const xcdat::TrieBuilder::Exception& ex) {
|
||||
// Abort if something went wrong...
|
||||
std::cerr << ex.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Creates an empty dictionary
|
||||
Trie trie;
|
||||
{
|
||||
// Reads the dictionary from the file.
|
||||
std::ifstream ifs{"sample.bin"};
|
||||
trie = Trie{ifs}; // move
|
||||
}
|
||||
|
||||
std::cout << "Performing basic operations..." << std::endl;
|
||||
{
|
||||
// lookup() obtains the unique ID for a given key
|
||||
xcdat::id_type key_id = trie.lookup("Rin");
|
||||
// access() decodes the key from a given ID
|
||||
std::cout << key_id << " : " << trie.access(key_id) << std::endl;
|
||||
|
||||
// Given an unregistered key, lookup() returns NOT_FOUND.
|
||||
if (trie.lookup("Hotaru") == Trie::NOT_FOUND) {
|
||||
std::cout << "? : " << "Hotaru" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Performing a common prefix operation..." << std::endl;
|
||||
{
|
||||
// Common prefix operation is implemented using PrefixIterator, created by
|
||||
// make_prefix_iterator().
|
||||
Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi");
|
||||
|
||||
// next() continues to obtain the next key until false is returned.
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Performing a predictive operation..." << std::endl;
|
||||
{
|
||||
// Predictive operation is implemented using PredictiveIterator, created by
|
||||
// make_predictive_iterator().
|
||||
Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha");
|
||||
|
||||
// next() continues to obtain the next key until false is returned in
|
||||
// lexicographical order.
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Enumerating all registered keys..." << std::endl;
|
||||
{
|
||||
// PredictiveIterator for an empty string provides enumeration of all
|
||||
// registered keys in lexicographical order.
|
||||
Trie::PredictiveIterator it = trie.make_predictive_iterator("");
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
The standard output is as follows.
|
||||
|
||||
```
|
||||
Performing basic operations...
|
||||
7 : Rin
|
||||
? : Hotaru
|
||||
Performing a common prefix operation...
|
||||
4 : Nene
|
||||
6 : Nenecchi
|
||||
Performing a predictive operation...
|
||||
3 : Hajime
|
||||
5 : Hazuki
|
||||
Enumerating all registered keys...
|
||||
0 : Aoba
|
||||
3 : Hajime
|
||||
5 : Hazuki
|
||||
1 : Hihumi
|
||||
2 : Kou
|
||||
4 : Nene
|
||||
6 : Nenecchi
|
||||
7 : Rin
|
||||
8 : Umiko
|
||||
9 : Yun
|
||||
```
|
||||
|
||||
As shown in the output, `xcdat::Trie` assigns a unique integer ID to each registered key. The ID order is random, depending on node arrangement.
|
||||
|
||||
## API
|
||||
|
||||
You can build a dictionary using static member function `xcdat::TrieBuilder::build()`.
|
||||
This function receives a set of keywords and returns the resulting class object of `xcdat::Trie`.
|
||||
For the usage, refer to the header comments of [`xcdat::TrieBuilder`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/TrieBuilder.hpp).
|
||||
Also for the usage of `xcdat::Trie`, refer to the header comments of [`xcdat::Trie`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/Trie.hpp).
|
||||
|
||||
The detailed descriptions of the API are under construction...
|
||||
|
||||
## Benchmark
|
||||
|
||||
Work in progress...
|
||||
|
||||
## To Do
|
||||
|
||||
- Show benchmarks
|
||||
- Create API descriptions
|
||||
|
||||
## Licensing
|
||||
|
||||
This library is free software provided under the MIT License.
|
||||
|
||||
## Citation
|
||||
|
||||
If you use the library in academic settings, please cite the following paper.
|
||||
|
||||
```bibtex
|
||||
@article{kanda2017compressed,
|
||||
title={Compressed double-array tries for string dictionaries supporting fast lookup},
|
||||
author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},
|
||||
journal={Knowledge and Information Systems},
|
||||
volume={51},
|
||||
number={3},
|
||||
pages={1023--1042},
|
||||
year={2017},
|
||||
publisher={Springer}
|
||||
}
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.
|
||||
- N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.
|
||||
- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017.
|
||||
- M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016.
|
||||
- S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.
|
348
docs/index.html
348
docs/index.html
|
@ -1,348 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="generator" content="pandoc" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
|
||||
<meta name="author" content="Shunsuke Kanda" />
|
||||
<meta name="dcterms.date" content="2017-01-01" />
|
||||
<title>Xcdat: XOR-compressed double-array trie</title>
|
||||
<style type="text/css">
|
||||
code{white-space: pre-wrap;}
|
||||
span.smallcaps{font-variant: small-caps;}
|
||||
div.line-block{white-space: pre-line;}
|
||||
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||||
</style>
|
||||
<style type="text/css">
|
||||
div.sourceLine, a.sourceLine { display: inline-block; min-height: 1.25em; }
|
||||
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
|
||||
.sourceCode { overflow: visible; }
|
||||
code.sourceCode { white-space: pre; }
|
||||
@media print {
|
||||
code.sourceCode { white-space: pre-wrap; }
|
||||
div.sourceLine, a.sourceLine { text-indent: -1em; padding-left: 1em; }
|
||||
}
|
||||
pre.numberSource div.sourceLine, .numberSource a.sourceLine
|
||||
{ position: relative; }
|
||||
pre.numberSource div.sourceLine::before, .numberSource a.sourceLine::before
|
||||
{ content: attr(data-line-number);
|
||||
position: absolute; left: -5em; text-align: right; vertical-align: baseline;
|
||||
border: none; pointer-events: all;
|
||||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||||
-khtml-user-select: none; -moz-user-select: none;
|
||||
-ms-user-select: none; user-select: none;
|
||||
padding: 0 4px; width: 4em; }
|
||||
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; color: #aaaaaa; padding-left: 4px; }
|
||||
@media screen {
|
||||
a.sourceLine::before { text-decoration: underline; color: initial; }
|
||||
}
|
||||
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
|
||||
code span.dt { color: #902000; } /* DataType */
|
||||
code span.dv { color: #40a070; } /* DecVal */
|
||||
code span.bn { color: #40a070; } /* BaseN */
|
||||
code span.fl { color: #40a070; } /* Float */
|
||||
code span.ch { color: #4070a0; } /* Char */
|
||||
code span.st { color: #4070a0; } /* String */
|
||||
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
|
||||
code span.ot { color: #007020; } /* Other */
|
||||
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
|
||||
code span.fu { color: #06287e; } /* Function */
|
||||
code span.er { color: #ff0000; font-weight: bold; } /* Error */
|
||||
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
|
||||
code span.cn { color: #880000; } /* Constant */
|
||||
code span.sc { color: #4070a0; } /* SpecialChar */
|
||||
code span.vs { color: #4070a0; } /* VerbatimString */
|
||||
code span.ss { color: #bb6688; } /* SpecialString */
|
||||
code span.im { } /* Import */
|
||||
code span.va { color: #19177c; } /* Variable */
|
||||
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
|
||||
code span.op { color: #666666; } /* Operator */
|
||||
code span.bu { } /* BuiltIn */
|
||||
code span.ex { } /* Extension */
|
||||
code span.pp { color: #bc7a00; } /* Preprocessor */
|
||||
code span.at { color: #7d9029; } /* Attribute */
|
||||
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
|
||||
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
|
||||
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
|
||||
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
|
||||
</style>
|
||||
<link rel="stylesheet" href="style.css">
|
||||
<!--[if lt IE 9]>
|
||||
<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
|
||||
<![endif]-->
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1 class="title">Xcdat: XOR-compressed double-array trie</h1>
|
||||
<p align="center">Created by <a href="https://github.com/kampersanda">Shunsuke Kanda</a></p>
|
||||
</header>
|
||||
<h2>Contents</h2>
|
||||
<nav id="TOC">
|
||||
<ul>
|
||||
<li><a href="#what-is-xcdat">What is Xcdat?</a></li>
|
||||
<li><a href="#features">Features</a></li>
|
||||
<li><a href="#build-instructions">Build Instructions</a></li>
|
||||
<li><a href="#command-line-tools">Command Line Tools</a></li>
|
||||
<li><a href="#sample-usage">Sample Usage</a></li>
|
||||
<li><a href="#api">API</a></li>
|
||||
<li><a href="#benchmark">Benchmark</a></li>
|
||||
<li><a href="#to-do">To Do</a></li>
|
||||
<li><a href="#licensing">Licensing</a></li>
|
||||
<li><a href="#citation">Citation</a></li>
|
||||
<li><a href="#references">References</a></li>
|
||||
</ul>
|
||||
</nav>
|
||||
<h2 id="what-is-xcdat">What is Xcdat?</h2>
|
||||
<p>Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie.</p>
|
||||
<p>The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article.</p>
|
||||
<blockquote>
|
||||
<p>Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [<a href="https://doi.org/10.1007/s10115-016-0999-8">doi</a>] [<a href="https://sites.google.com/site/shnskknd/KAIS2016.pdf">pdf</a>]</p>
|
||||
</blockquote>
|
||||
<p>Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, its lookup speed is relatively fast among compressed data structures, thanks to the advantages of the double array.</p>
|
||||
<p>Xcdat is available in the <a href="https://github.com/kampersanda/xcdat">GitHub repository</a>.</p>
|
||||
<h2 id="features">Features</h2>
|
||||
<ul>
|
||||
<li><strong>Compressed Data Structure</strong>: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (but, depending on datasets). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space.</li>
|
||||
<li><strong>Two Compression Approaches</strong>: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions.</li>
|
||||
<li><strong>64-bit Version</strong>: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining <code>XCDAT_X64</code>; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal.</li>
|
||||
<li><strong>NULL Character</strong>: The dictionary can be constructed from keys including the NULL character by setting the second parameter of <code>xcdat::TrieBuilder::build()</code> to <code>true</code>.</li>
|
||||
<li><strong>Dictionary Encoding</strong>: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing, such as described in (Martínez-Prieto et al., 2016).</li>
|
||||
<li><strong>Fast Operations</strong>: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.</li>
|
||||
<li><strong>Prefix-based Lookup Operations</strong>: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.</li>
|
||||
</ul>
|
||||
<h2 id="build-instructions">Build Instructions</h2>
|
||||
<p>You can download and compile Xcdat with the following commands.</p>
|
||||
<pre><code>$ git clone https://github.com/kampersanda/xcdat.git
|
||||
$ cd xcdat
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ..
|
||||
$ make
|
||||
$ make install</code></pre>
|
||||
<p>If you want to use a 64-bit setting, please add <code>-DXCDAT_X64=ON</code> to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding <code>-DXCDAT_USE_POPCNT=ON</code> for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS.</p>
|
||||
<h2 id="command-line-tools">Command Line Tools</h2>
|
||||
<p><code>xcdat</code> is a general-purpose command line tool to provide three modes as follows.</p>
|
||||
<pre><code>$ xcdat
|
||||
xcdat build <type> <key> <dict>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<key> Input file name of a set of keys (must be sorted)
|
||||
<dict> Output file name of the dictionary (optional)
|
||||
If omitted, <key>.dacs or <key>.fdacs is output
|
||||
xcdat query <type> <dict> <limit>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<dict> Input file name of the dictionary
|
||||
<limit> Limit of #results (optional, default=10)
|
||||
xcdat bench <type> <dict> <key>
|
||||
<type> 1: DACs, 2: FDACs
|
||||
<dict> Input file name of the dictionary
|
||||
<key> Input file name of keys for benchmark</code></pre>
|
||||
<h3 id="example-1-construction">Example 1: Construction</h3>
|
||||
<p>Command <code>xcdat build [params...]</code> builds an Xcdat dictionary from a given dataset and saves it to a file, as follows.</p>
|
||||
<pre><code>$ xcdat build 1 jawiki-all-titles
|
||||
constr. time: 1.58574 sec
|
||||
cmpr. ratio: 0.524287 over the raw size
|
||||
|
||||
basic statistics of xcdat::Trie<false>
|
||||
num keys: 1738995
|
||||
alphabet size: 189
|
||||
num nodes: 4042496
|
||||
num used nodes: 4034357
|
||||
num free nodes: 8139
|
||||
size in bytes: 20546967
|
||||
member size statistics of xcdat::Trie<false>
|
||||
bc: 13879098 0.675482
|
||||
terminal_flags: 708448 0.0344794
|
||||
tail: 5958655 0.290002
|
||||
boundary_flags: 40 1.94676e-06
|
||||
basic statistics of xcdat::DacBc
|
||||
num links: 1499605
|
||||
bytes per node: 3.4333
|
||||
member size statistics of xcdat::DacBc
|
||||
values_L0: 8085000 0.582531
|
||||
values_L1: 746760 0.0538046
|
||||
values_L2: 22581 0.00162698
|
||||
flags_L0: 1389660 0.100126
|
||||
flags_L1: 128400 0.00925132
|
||||
leaves: 694856 0.0500649
|
||||
links: 2811784 0.202591
|
||||
|
||||
output -> jawiki-all-titles.dac</code></pre>
|
||||
<h3 id="example-2-query-processing">Example 2: Query Processing</h3>
|
||||
<p>Command <code>xcdat query [params...]</code> loads a dictionary file and tests lookup operations, as follows.</p>
|
||||
<pre><code>$ xcdat query 1 jawiki-all-titles.dac
|
||||
> NEW_GAME!
|
||||
Lookup
|
||||
125989 NEW_GAME!
|
||||
Common Prefix Lookup
|
||||
28 N
|
||||
124185 NE
|
||||
125428 NEW
|
||||
125988 NEW_GAME
|
||||
125989 NEW_GAME!
|
||||
5 found
|
||||
Predictive Lookup
|
||||
125989 NEW_GAME!
|
||||
126003 NEW_GAME!!
|
||||
126059 NEW_GAME!_-THE_CHALLENGE_STAGE!-
|
||||
3 found</code></pre>
|
||||
<h3 id="example-3-benchmark-test">Example 3: Benchmark Test</h3>
|
||||
<p>Command <code>xcdat bench [params...]</code> tests time performances of a given dictionary, as follows.</p>
|
||||
<pre><code>$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd
|
||||
Warm up
|
||||
Lookup benchmark on 10 runs
|
||||
1.5065 us per str
|
||||
Access benchmark on 10 runs
|
||||
1.81289 us per ID</code></pre>
|
||||
<h2 id="sample-usage">Sample Usage</h2>
|
||||
<p>The following code shows an easy routine sample.</p>
|
||||
<pre class="sourceCode cpp" id="cb6"><code class="sourceCode cpp"><div class="sourceLine" id="cb6-1" data-line-number="1"><span class="pp">#include </span><span class="im"><iostream></span></div>
|
||||
<div class="sourceLine" id="cb6-2" data-line-number="2"><span class="pp">#include </span><span class="im"><xcdat.hpp></span></div>
|
||||
<div class="sourceLine" id="cb6-3" data-line-number="3"></div>
|
||||
<div class="sourceLine" id="cb6-4" data-line-number="4"><span class="dt">int</span> main() {</div>
|
||||
<div class="sourceLine" id="cb6-5" data-line-number="5"> <span class="bu">std::</span>vector<<span class="bu">std::</span>string> keys_buf = {</div>
|
||||
<div class="sourceLine" id="cb6-6" data-line-number="6"> <span class="st">"Aoba"</span>, <span class="st">"Yun"</span>, <span class="st">"Hajime"</span>, <span class="st">"Hihumi"</span>, <span class="st">"Kou"</span>, <span class="st">"Rin"</span>,</div>
|
||||
<div class="sourceLine" id="cb6-7" data-line-number="7"> <span class="st">"Hazuki"</span>, <span class="st">"Umiko"</span>, <span class="st">"Nene"</span>, <span class="st">"Nenecchi"</span></div>
|
||||
<div class="sourceLine" id="cb6-8" data-line-number="8"> };</div>
|
||||
<div class="sourceLine" id="cb6-9" data-line-number="9"></div>
|
||||
<div class="sourceLine" id="cb6-10" data-line-number="10"> <span class="co">// Convert to the input format</span></div>
|
||||
<div class="sourceLine" id="cb6-11" data-line-number="11"> <span class="bu">std::</span>vector<<span class="bu">std::</span>string_view> keys(keys_buf.size());</div>
|
||||
<div class="sourceLine" id="cb6-12" data-line-number="12"> <span class="cf">for</span> (<span class="dt">size_t</span> i = <span class="dv">0</span>; i < keys.size(); ++i) {</div>
|
||||
<div class="sourceLine" id="cb6-13" data-line-number="13"> keys[i] = <span class="bu">std::</span>string_view{keys_buf[i]};</div>
|
||||
<div class="sourceLine" id="cb6-14" data-line-number="14"> }</div>
|
||||
<div class="sourceLine" id="cb6-15" data-line-number="15"></div>
|
||||
<div class="sourceLine" id="cb6-16" data-line-number="16"> <span class="co">// Input data must be sorted.</span></div>
|
||||
<div class="sourceLine" id="cb6-17" data-line-number="17"> <span class="bu">std::</span>sort(<span class="bu">std::</span>begin(keys), <span class="bu">std::</span>end(keys));</div>
|
||||
<div class="sourceLine" id="cb6-18" data-line-number="18"></div>
|
||||
<div class="sourceLine" id="cb6-19" data-line-number="19"> <span class="co">// Dictionary class</span></div>
|
||||
<div class="sourceLine" id="cb6-20" data-line-number="20"> <span class="kw">using</span> Trie = xcdat::Trie<<span class="kw">true</span>>;</div>
|
||||
<div class="sourceLine" id="cb6-21" data-line-number="21"></div>
|
||||
<div class="sourceLine" id="cb6-22" data-line-number="22"> <span class="cf">try</span> {</div>
|
||||
<div class="sourceLine" id="cb6-23" data-line-number="23"> <span class="co">// Builds a dictionary from the keys</span></div>
|
||||
<div class="sourceLine" id="cb6-24" data-line-number="24"> Trie trie = xcdat::TrieBuilder::build<<span class="kw">true</span>>(keys); <span class="co">// move</span></div>
|
||||
<div class="sourceLine" id="cb6-25" data-line-number="25"></div>
|
||||
<div class="sourceLine" id="cb6-26" data-line-number="26"> <span class="co">// Writes the dictionary to a file.</span></div>
|
||||
<div class="sourceLine" id="cb6-27" data-line-number="27"> <span class="bu">std::</span>ofstream ofs{<span class="st">"sample.bin"</span>};</div>
|
||||
<div class="sourceLine" id="cb6-28" data-line-number="28"> trie.write(ofs);</div>
|
||||
<div class="sourceLine" id="cb6-29" data-line-number="29"> } <span class="cf">catch</span> (<span class="at">const</span> xcdat::TrieBuilder::Exception& ex) {</div>
|
||||
<div class="sourceLine" id="cb6-30" data-line-number="30"> <span class="co">// Abort if something went wrong...</span></div>
|
||||
<div class="sourceLine" id="cb6-31" data-line-number="31"> <span class="bu">std::</span>cerr << ex.what() << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-32" data-line-number="32"> <span class="cf">return</span> <span class="dv">1</span>;</div>
|
||||
<div class="sourceLine" id="cb6-33" data-line-number="33"> }</div>
|
||||
<div class="sourceLine" id="cb6-34" data-line-number="34"></div>
|
||||
<div class="sourceLine" id="cb6-35" data-line-number="35"> <span class="co">// Creates an empty dictionary</span></div>
|
||||
<div class="sourceLine" id="cb6-36" data-line-number="36"> Trie trie;</div>
|
||||
<div class="sourceLine" id="cb6-37" data-line-number="37"> {</div>
|
||||
<div class="sourceLine" id="cb6-38" data-line-number="38"> <span class="co">// Reads the dictionary from the file.</span></div>
|
||||
<div class="sourceLine" id="cb6-39" data-line-number="39"> <span class="bu">std::</span>ifstream ifs{<span class="st">"sample.bin"</span>};</div>
|
||||
<div class="sourceLine" id="cb6-40" data-line-number="40"> trie = Trie{ifs}; <span class="co">// move</span></div>
|
||||
<div class="sourceLine" id="cb6-41" data-line-number="41"> }</div>
|
||||
<div class="sourceLine" id="cb6-42" data-line-number="42"></div>
|
||||
<div class="sourceLine" id="cb6-43" data-line-number="43"> <span class="bu">std::</span>cout << <span class="st">"Performing basic operations..."</span> << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-44" data-line-number="44"> {</div>
|
||||
<div class="sourceLine" id="cb6-45" data-line-number="45"> <span class="co">// lookup() obtains the unique ID for a given key</span></div>
|
||||
<div class="sourceLine" id="cb6-46" data-line-number="46"> xcdat::<span class="dt">id_type</span> key_id = trie.lookup(<span class="st">"Rin"</span>);</div>
|
||||
<div class="sourceLine" id="cb6-47" data-line-number="47"> <span class="co">// access() decodes the key from a given ID</span></div>
|
||||
<div class="sourceLine" id="cb6-48" data-line-number="48"> <span class="bu">std::</span>cout << key_id << <span class="st">" : "</span> << trie.access(key_id) << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-49" data-line-number="49"></div>
|
||||
<div class="sourceLine" id="cb6-50" data-line-number="50"> <span class="co">// Given an unregistered key, lookup() returns NOT_FOUND.</span></div>
|
||||
<div class="sourceLine" id="cb6-51" data-line-number="51"> <span class="cf">if</span> (trie.lookup(<span class="st">"Hotaru"</span>) == Trie::NOT_FOUND) {</div>
|
||||
<div class="sourceLine" id="cb6-52" data-line-number="52"> <span class="bu">std::</span>cout << <span class="st">"? : "</span> << <span class="st">"Hotaru"</span> << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-53" data-line-number="53"> }</div>
|
||||
<div class="sourceLine" id="cb6-54" data-line-number="54"> }</div>
|
||||
<div class="sourceLine" id="cb6-55" data-line-number="55"></div>
|
||||
<div class="sourceLine" id="cb6-56" data-line-number="56"> <span class="bu">std::</span>cout << <span class="st">"Performing a common prefix operation..."</span> << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-57" data-line-number="57"> {</div>
|
||||
<div class="sourceLine" id="cb6-58" data-line-number="58"> <span class="co">// Common prefix operation is implemented using PrefixIterator, created by</span></div>
|
||||
<div class="sourceLine" id="cb6-59" data-line-number="59"> <span class="co">// make_prefix_iterator().</span></div>
|
||||
<div class="sourceLine" id="cb6-60" data-line-number="60"> Trie::PrefixIterator it = trie.make_prefix_iterator(<span class="st">"Nenecchi"</span>);</div>
|
||||
<div class="sourceLine" id="cb6-61" data-line-number="61"></div>
|
||||
<div class="sourceLine" id="cb6-62" data-line-number="62"> <span class="co">// next() continues to obtain the next key until false is returned.</span></div>
|
||||
<div class="sourceLine" id="cb6-63" data-line-number="63"> <span class="cf">while</span> (it.next()) {</div>
|
||||
<div class="sourceLine" id="cb6-64" data-line-number="64"> <span class="bu">std::</span>cout << it.id() << <span class="st">" : "</span> << it.key() << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-65" data-line-number="65"> }</div>
|
||||
<div class="sourceLine" id="cb6-66" data-line-number="66"> }</div>
|
||||
<div class="sourceLine" id="cb6-67" data-line-number="67"></div>
|
||||
<div class="sourceLine" id="cb6-68" data-line-number="68"> <span class="bu">std::</span>cout << <span class="st">"Performing a predictive operation..."</span> << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-69" data-line-number="69"> {</div>
|
||||
<div class="sourceLine" id="cb6-70" data-line-number="70"> <span class="co">// Predictive operation is implemented using PredictiveIterator, created by</span></div>
|
||||
<div class="sourceLine" id="cb6-71" data-line-number="71"> <span class="co">// make_predictive_iterator().</span></div>
|
||||
<div class="sourceLine" id="cb6-72" data-line-number="72"> Trie::PredictiveIterator it = trie.make_predictive_iterator(<span class="st">"Ha"</span>);</div>
|
||||
<div class="sourceLine" id="cb6-73" data-line-number="73"></div>
|
||||
<div class="sourceLine" id="cb6-74" data-line-number="74"> <span class="co">// next() continues to obtain the next key until false is returned in</span></div>
|
||||
<div class="sourceLine" id="cb6-75" data-line-number="75"> <span class="co">// lexicographical order.</span></div>
|
||||
<div class="sourceLine" id="cb6-76" data-line-number="76"> <span class="cf">while</span> (it.next()) {</div>
|
||||
<div class="sourceLine" id="cb6-77" data-line-number="77"> <span class="bu">std::</span>cout << it.id() << <span class="st">" : "</span> << it.key() << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-78" data-line-number="78"> }</div>
|
||||
<div class="sourceLine" id="cb6-79" data-line-number="79"> }</div>
|
||||
<div class="sourceLine" id="cb6-80" data-line-number="80"></div>
|
||||
<div class="sourceLine" id="cb6-81" data-line-number="81"> <span class="bu">std::</span>cout << <span class="st">"Enumerating all registered keys..."</span> << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-82" data-line-number="82"> {</div>
|
||||
<div class="sourceLine" id="cb6-83" data-line-number="83"> <span class="co">// PredictiveIterator for an empty string provides enumeration of all</span></div>
|
||||
<div class="sourceLine" id="cb6-84" data-line-number="84"> <span class="co">// registered keys in lexicographical order.</span></div>
|
||||
<div class="sourceLine" id="cb6-85" data-line-number="85"> Trie::PredictiveIterator it = trie.make_predictive_iterator(<span class="st">""</span>);</div>
|
||||
<div class="sourceLine" id="cb6-86" data-line-number="86"> <span class="cf">while</span> (it.next()) {</div>
|
||||
<div class="sourceLine" id="cb6-87" data-line-number="87"> <span class="bu">std::</span>cout << it.id() << <span class="st">" : "</span> << it.key() << <span class="bu">std::</span>endl;</div>
|
||||
<div class="sourceLine" id="cb6-88" data-line-number="88"> }</div>
|
||||
<div class="sourceLine" id="cb6-89" data-line-number="89"> }</div>
|
||||
<div class="sourceLine" id="cb6-90" data-line-number="90"></div>
|
||||
<div class="sourceLine" id="cb6-91" data-line-number="91"> <span class="cf">return</span> <span class="dv">0</span>;</div>
|
||||
<div class="sourceLine" id="cb6-92" data-line-number="92">}</div></code></pre>
|
||||
<p>The standard output is as follows.</p>
|
||||
<pre><code>Performing basic operations...
|
||||
7 : Rin
|
||||
? : Hotaru
|
||||
Performing a common prefix operation...
|
||||
4 : Nene
|
||||
6 : Nenecchi
|
||||
Performing a predictive operation...
|
||||
3 : Hajime
|
||||
5 : Hazuki
|
||||
Enumerating all registered keys...
|
||||
0 : Aoba
|
||||
3 : Hajime
|
||||
5 : Hazuki
|
||||
1 : Hihumi
|
||||
2 : Kou
|
||||
4 : Nene
|
||||
6 : Nenecchi
|
||||
7 : Rin
|
||||
8 : Umiko
|
||||
9 : Yun</code></pre>
|
||||
<p>As shown in the output, <code>xcdat::Trie</code> assigns unique integer IDs to each registered key. The ID order is random, depending on node arrangement.</p>
|
||||
<h2 id="api">API</h2>
|
||||
<p>You can build a dictionary using static member function <code>xcdat::TrieBuilder::build()</code>. This function receives a set of keywords and returns the resulting class object of <code>xcdat::Trie</code>. For the usage, refer to the header comments of <a href="https://github.com/kampersanda/xcdat/blob/master/include/xcdat/TrieBuilder.hpp"><code>xcdat::TrieBuilder.hpp</code></a>. Also for the usage of <code>xcdat::Trie</code>, refer to the header comments of <a href="https://github.com/kampersanda/xcdat/blob/master/include/xcdat/Trie.hpp"><code>xcdat::Trie</code></a>.</p>
|
||||
<p>The detailed descriptions of the API are under construction…</p>
|
||||
<h2 id="benchmark">Benchmark</h2>
|
||||
<p>Work in progress…</p>
|
||||
<h2 id="to-do">To Do</h2>
|
||||
<ul>
|
||||
<li>Show benchmarks</li>
|
||||
<li>Create API descriptions</li>
|
||||
</ul>
|
||||
<h2 id="licensing">Licensing</h2>
|
||||
<p>This library is free software provided under the MIT License.</p>
|
||||
<h2 id="citation">Citation</h2>
|
||||
<p>If you use the library in academic settings, please cite the following paper.</p>
|
||||
<pre class="sourceCode bibtex" id="cb8"><code class="sourceCode bibtex"><div class="sourceLine" id="cb8-1" data-line-number="1"><span class="va">@article</span>{<span class="ot">kanda2017compressed</span>,</div>
|
||||
<div class="sourceLine" id="cb8-2" data-line-number="2"> <span class="dt">title</span>={Compressed double-array tries for string dictionaries supporting fast lookup},</div>
|
||||
<div class="sourceLine" id="cb8-3" data-line-number="3"> <span class="dt">author</span>={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},</div>
|
||||
<div class="sourceLine" id="cb8-4" data-line-number="4"> <span class="dt">journal</span>={Knowledge and Information Systems},</div>
|
||||
<div class="sourceLine" id="cb8-5" data-line-number="5"> <span class="dt">volume</span>={51},</div>
|
||||
<div class="sourceLine" id="cb8-6" data-line-number="6"> <span class="dt">number</span>={3},</div>
|
||||
<div class="sourceLine" id="cb8-7" data-line-number="7"> <span class="dt">pages</span>={1023--1042},</div>
|
||||
<div class="sourceLine" id="cb8-8" data-line-number="8"> <span class="dt">year</span>={2017},</div>
|
||||
<div class="sourceLine" id="cb8-9" data-line-number="9"> <span class="dt">publisher</span>={Springer}</div>
|
||||
<div class="sourceLine" id="cb8-10" data-line-number="10">}</div></code></pre>
|
||||
<h2 id="references">References</h2>
|
||||
<ul>
|
||||
<li>J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.</li>
|
||||
<li>N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.</li>
|
||||
<li>S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017.</li>
|
||||
<li>M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016.</li>
|
||||
<li>S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.</li>
|
||||
</ul>
|
||||
<footer>
|
||||
<p>Copyright © 2017 Shunsuke Kanda, All Rights Reserved.</p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
|
@ -1,3 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
pandoc --template=template.html -o index.html document.md -c style.css --toc --toc-depth=2
|
163
docs/style.css
163
docs/style.css
|
@ -1,163 +0,0 @@
|
|||
@import url('https://fonts.googleapis.com/css?family=Comfortaa');
|
||||
@import url('https://fonts.googleapis.com/css?family=Source+Code+Pro');
|
||||
@import url('https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css');
|
||||
|
||||
body {
|
||||
background: #fff;
|
||||
color: #545454;
|
||||
font-family: 'Comfortaa';
|
||||
font-size: 16px;
|
||||
line-height: 1.5;
|
||||
margin: 0 auto;
|
||||
max-width: 800px;
|
||||
padding: 2em 2em 2em;
|
||||
}
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3,
|
||||
h4,
|
||||
h5,
|
||||
h6 {
|
||||
color: #494949;
|
||||
font-weight: 600;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
h1 {
|
||||
line-height: 1.7;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
h2 {
|
||||
margin-top: 1.3em;
|
||||
padding: 0.25em 0.5em;
|
||||
color: #494949;
|
||||
background: transparent;
|
||||
border-left: solid 5px #7db4e6;
|
||||
}
|
||||
|
||||
h3 {
|
||||
margin-top: 1.3em;
|
||||
padding: 0.25em 0.0em;
|
||||
}
|
||||
|
||||
h4 {
|
||||
margin-top: 1.3em;
|
||||
padding: 0.25em 0.0em;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #0083e8;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
b,
|
||||
strong {
|
||||
font-weight: 600;
|
||||
background: linear-gradient(transparent 75%, #a7d6ff 70%);
|
||||
}
|
||||
|
||||
img {
|
||||
animation: colorize 2s cubic-bezier(0, 0, .78, .36) 1;
|
||||
background: transparent;
|
||||
border: 10px solid rgba(0, 0, 0, 0.12);
|
||||
border-radius: 4px;
|
||||
display: block;
|
||||
margin: 1.3em auto;
|
||||
max-width: 95%;
|
||||
}
|
||||
|
||||
blockquote {
|
||||
position: relative;
|
||||
padding: 10px 15px 10px 60px;
|
||||
box-sizing: border-box;
|
||||
background: #f5f5f5;
|
||||
color: #777777;
|
||||
border-left: 4px solid #9dd4ff;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.14);
|
||||
}
|
||||
|
||||
blockquote:before {
|
||||
display: inline-block;
|
||||
position: absolute;
|
||||
top: 15px;
|
||||
left: 15px;
|
||||
vertical-align: middle;
|
||||
content: "\f10d";
|
||||
font-family: FontAwesome;
|
||||
color: #9dd4ff;
|
||||
font-size: 30px;
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
blockquote p {
|
||||
padding: 0;
|
||||
margin: 7px 0;
|
||||
}
|
||||
|
||||
blockquote cite {
|
||||
display: block;
|
||||
text-align: right;
|
||||
color: #888888;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
ul {
|
||||
padding: 0 0.5em;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
ul li {
|
||||
line-height: 1.5;
|
||||
padding: 0.2em 0 0.5em 1.5em;
|
||||
border-bottom: 2px solid white;
|
||||
list-style-type: none!important;
|
||||
}
|
||||
|
||||
ul li:before {
|
||||
font-family: FontAwesome;
|
||||
content: "\f00c";
|
||||
position: absolute;
|
||||
left: 0.5em;
|
||||
color: #9dd4ff;
|
||||
}
|
||||
|
||||
ul li:last-of-type {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
pre,
|
||||
code {
|
||||
background: #f5f5f5;
|
||||
font-family: 'Source Code Pro', monospace;
|
||||
}
|
||||
|
||||
p code {
|
||||
padding: 0.1em 0.5em;
|
||||
}
|
||||
|
||||
pre {
|
||||
font-size: 0.95rem;
|
||||
padding: 1em;
|
||||
overflow: auto;
|
||||
white-space: pre;
|
||||
}
|
||||
|
||||
pre.sourceCode {
|
||||
font-size: 0.95rem;
|
||||
padding: 1em;
|
||||
overflow: auto;
|
||||
white-space: pre;
|
||||
}
|
||||
|
||||
footer {
|
||||
font-size: 14px;
|
||||
color: #8f9296;
|
||||
text-align: center;
|
||||
margin-top: 40px;
|
||||
}
|
|
@ -1,68 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="$lang$" xml:lang="$lang$"$if(dir)$ dir="$dir$"$endif$>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="generator" content="pandoc" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
|
||||
$for(author-meta)$
|
||||
<meta name="author" content="$author-meta$" />
|
||||
$endfor$
|
||||
$if(date-meta)$
|
||||
<meta name="dcterms.date" content="$date-meta$" />
|
||||
$endif$
|
||||
$if(keywords)$
|
||||
<meta name="keywords" content="$for(keywords)$$keywords$$sep$, $endfor$" />
|
||||
$endif$
|
||||
<title>$if(title-prefix)$$title-prefix$ – $endif$$pagetitle$</title>
|
||||
<style type="text/css">
|
||||
code{white-space: pre-wrap;}
|
||||
span.smallcaps{font-variant: small-caps;}
|
||||
div.line-block{white-space: pre-line;}
|
||||
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||||
$if(quotes)$
|
||||
q { quotes: "“" "”" "‘" "’"; }
|
||||
$endif$
|
||||
</style>
|
||||
$if(highlighting-css)$
|
||||
<style type="text/css">
|
||||
$highlighting-css$
|
||||
</style>
|
||||
$endif$
|
||||
$for(css)$
|
||||
<link rel="stylesheet" href="$css$">
|
||||
$endfor$
|
||||
$if(math)$
|
||||
$math$
|
||||
$endif$
|
||||
<!--[if lt IE 9]>
|
||||
<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
|
||||
<![endif]-->
|
||||
$for(header-includes)$
|
||||
$header-includes$
|
||||
$endfor$
|
||||
</head>
|
||||
<body>
|
||||
$for(include-before)$
|
||||
$include-before$
|
||||
$endfor$
|
||||
$if(title)$
|
||||
<header>
|
||||
<h1 class="title">$title$</h1>
|
||||
<p align="center">Created by <a href="https://github.com/kampersanda">$author$</a></p>
|
||||
</header>
|
||||
$endif$
|
||||
$if(toc)$
|
||||
<h2>Contents</h2>
|
||||
<nav id="$idprefix$TOC">
|
||||
$table-of-contents$
|
||||
</nav>
|
||||
$endif$
|
||||
$body$
|
||||
$for(include-after)$
|
||||
$include-after$
|
||||
$endfor$
|
||||
<footer>
|
||||
<p>Copyright © $date$ $author$, All Rights Reserved.</p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
|
@ -1,6 +1 @@
|
|||
#ifndef XCDAT_XCDAT_HPP_
|
||||
#define XCDAT_XCDAT_HPP_
|
||||
|
||||
#include "xcdat/TrieBuilder.hpp"
|
||||
|
||||
#endif //XCDAT_XCDAT_HPP_
|
||||
#pragma once
|
||||
|
|
|
@ -1,73 +0,0 @@
|
|||
#ifndef XCDAT_BIT_VECTOR_HPP_
|
||||
#define XCDAT_BIT_VECTOR_HPP_
|
||||
|
||||
#include "BitVectorBuilder.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Bit vector supporting Rank/Select operations.
|
||||
class BitVector {
|
||||
public:
|
||||
BitVector() = default;
|
||||
explicit BitVector(std::istream &is);
|
||||
BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag);
|
||||
|
||||
~BitVector() = default;
|
||||
|
||||
bool operator[](size_t i) const {
|
||||
return (bits_[i / 32] & (1U << (i % 32))) != 0;
|
||||
}
|
||||
|
||||
// the number of 1s in B[0,i).
|
||||
id_type rank(id_type i) const;
|
||||
// the position of the i+1 th occurrence.
|
||||
id_type select(id_type i) const;
|
||||
|
||||
size_t num_1s() const {
|
||||
return num_1s_;
|
||||
}
|
||||
size_t num_0s() const {
|
||||
return size_ - num_1s_;
|
||||
}
|
||||
|
||||
// the number of bits
|
||||
size_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
size_t size_in_bytes() const;
|
||||
|
||||
void write(std::ostream &os) const;
|
||||
|
||||
void swap(BitVector& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
BitVector(const BitVector&) = delete;
|
||||
BitVector& operator=(const BitVector&) = delete;
|
||||
|
||||
BitVector(BitVector&&) noexcept = default;
|
||||
BitVector& operator=(BitVector&&) noexcept = default;
|
||||
|
||||
private:
|
||||
static constexpr id_type BITS_IN_R1 {256};
|
||||
static constexpr id_type BITS_IN_R2 {32};
|
||||
static constexpr id_type R1_PER_R2 {BITS_IN_R1 / BITS_IN_R2}; // 8
|
||||
static constexpr id_type ONES_PER_TIP {512};
|
||||
|
||||
struct RankTip {
|
||||
id_type L1;
|
||||
uint8_t L2[R1_PER_R2];
|
||||
};
|
||||
|
||||
Vector<uint32_t> bits_ {};
|
||||
Vector<RankTip> rank_tips_ {};
|
||||
Vector<id_type> select_tips_ {};
|
||||
size_t size_ {};
|
||||
size_t num_1s_ {};
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_BIT_VECTOR_HPP_
|
|
@ -1,64 +0,0 @@
|
|||
#ifndef XCDAT_BIT_VECTOR_BUILDER_HPP_
|
||||
#define XCDAT_BIT_VECTOR_BUILDER_HPP_
|
||||
|
||||
#include "xcdat_basics.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Bit pool for building BitVector.
//
// Bits are packed 32 per word in bits_. The builder also tracks the number of
// set bits (num_1s_), which BitVector (a friend) can read after construction.
class BitVectorBuilder {
  public:
    friend class BitVector;

    BitVectorBuilder() = default;
    ~BitVectorBuilder() = default;

    // Creates a pool of `size` bits, all cleared.
    explicit BitVectorBuilder(size_t size) {
        resize(size);
    }

    // Appends one bit at position size().
    void push_back(bool bit) {
        if (size_ % 32 == 0) {
            bits_.push_back(0);
        }
        if (bit) {
            set_bit(size_, true);
        }
        ++size_;
    }

    // Sets bit i to `bit`. The 1s counter is adjusted only when the stored bit
    // actually changes, so repeated sets do not double-count and clearing an
    // already-clear bit does not underflow the counter.
    void set_bit(size_t i, bool bit) {
        uint32_t& word = bits_[i / 32];
        const uint32_t mask = 1U << (i % 32);
        const bool was_set = (word & mask) != 0;
        if (was_set == bit) {
            return;
        }
        if (bit) {
            word |= mask;
            ++num_1s_;
        } else {
            word &= ~mask;
            --num_1s_;
        }
    }

    // Resizes the pool to `size` bits; newly added words are zero-filled.
    // NOTE(review): shrinking does not re-count num_1s_ — confirm callers only grow.
    void resize(size_t size) {
        bits_.resize(size / 32 + 1, 0);
        size_ = size;
    }

    // Reserves storage for at least `capacity` bits.
    void reserve(size_t capacity) {
        bits_.reserve(capacity / 32 + 1);
    }

    // Number of bits in the pool.
    size_t size() const {
        return size_;
    }

    // Number of bits currently set to 1.
    size_t num_1s() const {
        return num_1s_;
    }

    BitVectorBuilder(const BitVectorBuilder&) = delete;
    BitVectorBuilder& operator=(const BitVectorBuilder&) = delete;

  private:
    std::vector<uint32_t> bits_{};
    size_t size_{};
    size_t num_1s_{};
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_BIT_VECTOR_BUILDER_HPP_
|
|
@ -1,75 +0,0 @@
|
|||
#ifndef XCDAT_DAC_BC_HPP_
|
||||
#define XCDAT_DAC_BC_HPP_
|
||||
|
||||
#include "BitVector.hpp"
|
||||
#include "FitVector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// BASE/CHECK representation using byte-oriented DACs.
|
||||
class DacBc {
|
||||
public:
|
||||
static constexpr id_type WIDTH_L1 {8};
|
||||
|
||||
DacBc() = default;
|
||||
~DacBc() = default;
|
||||
|
||||
explicit DacBc(std::istream &is);
|
||||
explicit DacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags);
|
||||
|
||||
id_type base(id_type i) const {
|
||||
return access_(i * 2) ^ i;
|
||||
}
|
||||
id_type link(id_type i) const {
|
||||
return values_[0][i * 2] | (links_[leaf_flags_.rank(i)] << 8);
|
||||
}
|
||||
id_type check(id_type i) const {
|
||||
return access_(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
bool is_leaf(id_type i) const {
|
||||
return leaf_flags_[i];
|
||||
}
|
||||
bool is_used(id_type i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
size_t num_nodes() const {
|
||||
return values_[0].size() / 2;
|
||||
}
|
||||
size_t num_used_nodes() const {
|
||||
return num_nodes() - num_free_nodes_;
|
||||
}
|
||||
size_t num_free_nodes() const {
|
||||
return num_free_nodes_;
|
||||
}
|
||||
|
||||
size_t size_in_bytes() const;
|
||||
void show_stat(std::ostream &os) const;
|
||||
|
||||
void write(std::ostream &os) const;
|
||||
|
||||
void swap(DacBc& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
DacBc(const DacBc&) = delete;
|
||||
DacBc& operator=(const DacBc&) = delete;
|
||||
|
||||
DacBc(DacBc&&) noexcept = default;
|
||||
DacBc& operator=(DacBc&&) noexcept = default;
|
||||
|
||||
private:
|
||||
Vector<uint8_t> values_[sizeof(id_type)] {};
|
||||
BitVector flags_[sizeof(id_type) - 1] {};
|
||||
BitVector leaf_flags_ {};
|
||||
FitVector links_ {};
|
||||
uint8_t max_level_ {};
|
||||
size_t num_free_nodes_ {};
|
||||
|
||||
id_type access_(id_type i) const;
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_DAC_BC_HPP_
|
|
@ -1,93 +0,0 @@
|
|||
#ifndef XCDAT_FAST_DAC_BC_HPP_
|
||||
#define XCDAT_FAST_DAC_BC_HPP_
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "BitVector.hpp"
|
||||
#include "FitVector.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// BASE/CHECK representation using pointer-based byte-oriented DACs.
|
||||
class FastDacBc {
|
||||
public:
|
||||
static constexpr id_type WIDTH_L1 = 7;
|
||||
#ifdef XCDAT_X64
|
||||
static constexpr uint8_t LAYERS = 4;
|
||||
#else
|
||||
static constexpr uint8_t LAYERS = 3;
|
||||
#endif
|
||||
|
||||
static constexpr id_type BLOCK_SIZE_L1 = 1U << 7;
|
||||
static constexpr id_type BLOCK_SIZE_L2 = 1U << 15;
|
||||
#ifdef XCDAT_X64
|
||||
static constexpr id_type BLOCK_SIZE_L3 = 1U << 31;
|
||||
#endif
|
||||
|
||||
FastDacBc() = default;
|
||||
explicit FastDacBc(std::istream& is);
|
||||
explicit FastDacBc(const std::vector<BcPair>& bc,
|
||||
BitVectorBuilder& leaf_flags);
|
||||
|
||||
~FastDacBc() = default;
|
||||
|
||||
id_type base(id_type i) const {
|
||||
return access_(i * 2) ^ i;
|
||||
}
|
||||
id_type link(id_type i) const {
|
||||
return values_L1_[i * 2] | (links_[leaf_flags_.rank(i)] << 8);
|
||||
}
|
||||
id_type check(id_type i) const {
|
||||
return access_(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
bool is_leaf(id_type i) const {
|
||||
return leaf_flags_[i];
|
||||
}
|
||||
bool is_used(id_type i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
size_t num_nodes() const {
|
||||
return values_L1_.size() / 2;
|
||||
}
|
||||
size_t num_used_nodes() const {
|
||||
return num_nodes() - num_free_nodes_;
|
||||
}
|
||||
size_t num_free_nodes() const {
|
||||
return num_free_nodes_;
|
||||
}
|
||||
|
||||
size_t size_in_bytes() const;
|
||||
void show_stat(std::ostream& os) const;
|
||||
void write(std::ostream& os) const;
|
||||
|
||||
void swap(FastDacBc& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
FastDacBc(const FastDacBc&) = delete;
|
||||
FastDacBc& operator=(const FastDacBc&) = delete;
|
||||
|
||||
FastDacBc(FastDacBc&&) noexcept = default;
|
||||
FastDacBc& operator=(FastDacBc&&) noexcept = default;
|
||||
|
||||
private:
|
||||
Vector <uint8_t> values_L1_{};
|
||||
Vector <uint16_t> values_L2_{};
|
||||
Vector <uint32_t> values_L3_{};
|
||||
#ifdef XCDAT_X64
|
||||
Vector<uint64_t> values_L4_ {};
|
||||
#endif
|
||||
Vector <id_type> ranks_[LAYERS - 1]{};
|
||||
BitVector leaf_flags_{};
|
||||
FitVector links_{};
|
||||
size_t num_free_nodes_{};
|
||||
|
||||
id_type access_(id_type i) const;
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_FAST_DAC_BC_HPP_
|
|
@ -1,56 +0,0 @@
|
|||
#ifndef XCDAT_SMALL_VECTOR_HPP_
|
||||
#define XCDAT_SMALL_VECTOR_HPP_
|
||||
|
||||
#include "Vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Compacted integer vector.
|
||||
class FitVector {
|
||||
public:
|
||||
static constexpr id_type CHUNK_WIDTH = sizeof(id_type) * 8;
|
||||
|
||||
FitVector() = default;
|
||||
explicit FitVector(std::istream &is);
|
||||
explicit FitVector(const std::vector<id_type>& values);
|
||||
|
||||
~FitVector() = default;
|
||||
|
||||
id_type operator[](size_t i) const {
|
||||
auto chunk_pos = static_cast<id_type>(i * width_ / CHUNK_WIDTH);
|
||||
auto offset = static_cast<id_type>(i * width_ % CHUNK_WIDTH);
|
||||
if (offset + width_ <= CHUNK_WIDTH) {
|
||||
return (chunks_[chunk_pos] >> offset) & mask_;
|
||||
} else {
|
||||
return ((chunks_[chunk_pos] >> offset)
|
||||
| (chunks_[chunk_pos + 1] << (CHUNK_WIDTH - offset))) & mask_;
|
||||
}
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return size_;
|
||||
}
|
||||
size_t size_in_bytes() const;
|
||||
|
||||
void write(std::ostream &os) const;
|
||||
|
||||
void swap(FitVector& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
FitVector(const FitVector&) = delete;
|
||||
FitVector& operator=(const FitVector&) = delete;
|
||||
|
||||
FitVector(FitVector&&) noexcept = default;
|
||||
FitVector& operator=(FitVector&&) noexcept = default;
|
||||
|
||||
private:
|
||||
Vector<id_type> chunks_ {};
|
||||
size_t size_ {};
|
||||
id_type width_ {};
|
||||
id_type mask_ {};
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_SMALL_VECTOR_HPP_
|
|
@ -1,514 +0,0 @@
|
|||
#ifndef XCDAT_TRIE_HPP_
|
||||
#define XCDAT_TRIE_HPP_
|
||||
|
||||
#include <string_view>
|
||||
#include <xcdat/Trie.hpp>
|
||||
|
||||
#include "Trie.hpp"
|
||||
#include "DacBc.hpp"
|
||||
#include "FastDacBc.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Compressed string dictionary using an improved double-array trie. There are
|
||||
// two versions of DACs to represent BASE/CHECK arrays in small space. The
|
||||
// versions can be chosen using the Fast parameter.
|
||||
template<bool Fast>
|
||||
class Trie {
|
||||
public:
|
||||
using trie_type = Trie<Fast>;
|
||||
using bc_type = typename std::conditional<Fast, FastDacBc, DacBc>::type;
|
||||
|
||||
static constexpr auto NOT_FOUND = ID_MAX;
|
||||
|
||||
// Generic constructor.
|
||||
Trie() = default;
|
||||
|
||||
// Reads the dictionary from an std::istream.
|
||||
explicit Trie(std::istream& is) {
|
||||
bc_ = bc_type(is);
|
||||
terminal_flags_ = BitVector(is);
|
||||
tail_ = Vector<char>(is);
|
||||
boundary_flags_ = BitVector(is);
|
||||
alphabet_ = Vector<uint8_t>(is);
|
||||
is.read(reinterpret_cast<char*>(table_), 512);
|
||||
num_keys_ = read_value<size_t>(is);
|
||||
max_length_ = read_value<size_t>(is);
|
||||
bin_mode_ = read_value<bool>(is);
|
||||
}
|
||||
|
||||
// Generic destructor.
|
||||
~Trie() = default;
|
||||
|
||||
// Lookups the ID of a given key. If the key is not registered, otherwise
|
||||
// returns NOT_FOUND.
|
||||
id_type lookup(std::string_view key) const {
|
||||
size_t pos = 0;
|
||||
id_type node_id = 0;
|
||||
|
||||
while (!bc_.is_leaf(node_id)) {
|
||||
if (pos == key.length()) {
|
||||
return terminal_flags_[node_id] ? to_key_id_(node_id) : NOT_FOUND;
|
||||
}
|
||||
|
||||
const auto child_id = bc_.base(node_id) ^code_(key[pos++]);
|
||||
if (bc_.check(child_id) != node_id) {
|
||||
return NOT_FOUND;
|
||||
}
|
||||
|
||||
node_id = child_id;
|
||||
}
|
||||
|
||||
size_t tail_pos = bc_.link(node_id);
|
||||
if (!match_suffix_(key, pos, tail_pos)) {
|
||||
return NOT_FOUND;
|
||||
}
|
||||
|
||||
return to_key_id_(node_id);
|
||||
}
|
||||
|
||||
// Decodes the key associated with a given ID.
|
||||
std::string access(id_type id) const {
|
||||
if (num_keys_ <= id) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string dec;
|
||||
dec.reserve(max_length_);
|
||||
|
||||
auto node_id = to_node_id_(id);
|
||||
auto tail_pos = bc_.is_leaf(node_id) ? bc_.link(node_id) : NOT_FOUND;
|
||||
|
||||
while (node_id) {
|
||||
const auto parent_id = bc_.check(node_id);
|
||||
dec += edge_(parent_id, node_id);
|
||||
node_id = parent_id;
|
||||
}
|
||||
|
||||
std::reverse(std::begin(dec), std::end(dec));
|
||||
|
||||
if (tail_pos != 0 && tail_pos != NOT_FOUND) {
|
||||
if (bin_mode_) {
|
||||
do {
|
||||
dec += tail_[tail_pos];
|
||||
} while (!boundary_flags_[tail_pos++]);
|
||||
} else {
|
||||
do {
|
||||
dec += tail_[tail_pos++];
|
||||
} while (tail_[tail_pos]);
|
||||
}
|
||||
}
|
||||
|
||||
return dec;
|
||||
}
|
||||
|
||||
// Iterator for enumerating the keys and IDs included as prefixes of a given
|
||||
// key, that is, supporting so-called common prefix lookup. It is created by
|
||||
// using make_prefix_iterator().
|
||||
class PrefixIterator {
|
||||
public:
|
||||
PrefixIterator() = default;
|
||||
|
||||
// Scans the next key. If it does not exist, returns false.
|
||||
bool next() {
|
||||
return trie_ != nullptr && trie_->next_prefix_(this);
|
||||
}
|
||||
|
||||
// Gets the key.
|
||||
std::string_view key() const {
|
||||
return {key_.data(), pos_};
|
||||
};
|
||||
// Gets the ID.
|
||||
id_type id() const {
|
||||
return id_;
|
||||
}
|
||||
|
||||
private:
|
||||
const trie_type* trie_{};
|
||||
const std::string_view key_{};
|
||||
|
||||
size_t pos_{0};
|
||||
id_type node_id_{0};
|
||||
id_type id_{};
|
||||
|
||||
bool begin_flag_{true};
|
||||
bool end_flag_{false};
|
||||
|
||||
PrefixIterator(const trie_type* trie, std::string_view key)
|
||||
: trie_{trie}, key_{key} {}
|
||||
|
||||
friend class Trie;
|
||||
};
|
||||
|
||||
// Makes PrefixIterator from a given key.
|
||||
PrefixIterator make_prefix_iterator(std::string_view key) const {
|
||||
return PrefixIterator{this, key};
|
||||
}
|
||||
|
||||
// Iterator class for enumerating the keys and IDs starting with prefixes of
|
||||
// a given key, that is, supporting so-called predictive lookup. It is in
|
||||
// lexicographical order. It is created by using make_predictive_iterator().
|
||||
class PredictiveIterator {
|
||||
public:
|
||||
PredictiveIterator() = default;
|
||||
|
||||
// Scans the next key. If it does not exist, returns false.
|
||||
bool next() {
|
||||
return trie_ != nullptr && trie_->next_predictive_(this);
|
||||
}
|
||||
|
||||
// Gets the key.
|
||||
std::string_view key() const {
|
||||
return {buf_.data(), buf_.size()};
|
||||
};
|
||||
// Gets the ID.
|
||||
id_type id() const {
|
||||
return id_;
|
||||
}
|
||||
|
||||
private:
|
||||
const trie_type* trie_{};
|
||||
const std::string_view key_{};
|
||||
|
||||
bool begin_flag_{true};
|
||||
bool end_flag_{false};
|
||||
|
||||
struct stack_t {
|
||||
size_t depth;
|
||||
char c;
|
||||
id_type node_id;
|
||||
};
|
||||
|
||||
std::vector<stack_t> stack_{};
|
||||
std::string buf_{};
|
||||
id_type id_{};
|
||||
|
||||
PredictiveIterator(const trie_type* trie, std::string_view key)
|
||||
: trie_{trie}, key_{key} {
|
||||
buf_.reserve(trie->max_length_);
|
||||
}
|
||||
|
||||
friend class Trie;
|
||||
};
|
||||
|
||||
// Makes PredictiveIterator from a given key.
|
||||
PredictiveIterator make_predictive_iterator(std::string_view key) const {
|
||||
return {this, key};
|
||||
}
|
||||
|
||||
// Gets the number of registered keys in the dictionary
|
||||
size_t num_keys() const {
|
||||
return num_keys_;
|
||||
}
|
||||
|
||||
// Gets whether a binary mode or not.
|
||||
bool bin_mode() const {
|
||||
return bin_mode_;
|
||||
}
|
||||
|
||||
// Gets the size of alphabet drawing keys in the dictionary.
|
||||
size_t alphabet_size() const {
|
||||
return alphabet_.size();
|
||||
}
|
||||
|
||||
// Gets the number of nodes including free nodes.
|
||||
size_t num_nodes() const {
|
||||
return bc_.num_nodes();
|
||||
}
|
||||
|
||||
// Gets the number of nodes in the original trie.
|
||||
size_t num_used_nodes() const {
|
||||
return bc_.num_used_nodes();
|
||||
}
|
||||
|
||||
// Gets the number of free nodes corresponding to empty elements.
|
||||
size_t num_free_nodes() const {
|
||||
return bc_.num_free_nodes();
|
||||
}
|
||||
|
||||
// Computes the output dictionary size in bytes.
|
||||
size_t size_in_bytes() const {
|
||||
size_t ret = 0;
|
||||
ret += bc_.size_in_bytes();
|
||||
ret += terminal_flags_.size_in_bytes();
|
||||
ret += tail_.size_in_bytes();
|
||||
ret += boundary_flags_.size_in_bytes();
|
||||
ret += alphabet_.size_in_bytes();
|
||||
ret += sizeof(table_);
|
||||
ret += sizeof(num_keys_);
|
||||
ret += sizeof(max_length_);
|
||||
ret += sizeof(bin_mode_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Reports the dictionary statistics into an ostream.
|
||||
void show_stat(std::ostream& os) const {
|
||||
const auto total_size = size_in_bytes();
|
||||
os << "basic statistics of xcdat::Trie<"
|
||||
<< (Fast ? "true" : "false") << ">" << std::endl;
|
||||
show_size("\tnum keys: ", num_keys(), os);
|
||||
show_size("\talphabet size: ", alphabet_size(), os);
|
||||
show_size("\tnum nodes: ", num_nodes(), os);
|
||||
show_size("\tnum used nodes:", num_used_nodes(), os);
|
||||
show_size("\tnum free nodes:", num_free_nodes(), os);
|
||||
show_size("\tsize in bytes: ", size_in_bytes(), os);
|
||||
os << "member size statistics of xcdat::Trie<"
|
||||
<< (Fast ? "true" : "false") << ">" << std::endl;
|
||||
show_size_ratio("\tbc: ", bc_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tterminal_flags:", terminal_flags_.size_in_bytes(),
|
||||
total_size, os);
|
||||
show_size_ratio("\ttail: ", tail_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tboundary_flags:", boundary_flags_.size_in_bytes(),
|
||||
total_size, os);
|
||||
bc_.show_stat(os);
|
||||
}
|
||||
|
||||
// Writes the dictionary into an ostream.
|
||||
void write(std::ostream& os) const {
|
||||
bc_.write(os);
|
||||
terminal_flags_.write(os);
|
||||
tail_.write(os);
|
||||
boundary_flags_.write(os);
|
||||
alphabet_.write(os);
|
||||
os.write(reinterpret_cast<const char*>(table_), 512);
|
||||
write_value(num_keys_, os);
|
||||
write_value(max_length_, os);
|
||||
write_value(bin_mode_, os);
|
||||
}
|
||||
|
||||
// Swap
|
||||
void swap(Trie& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
Trie(const Trie&) = delete;
|
||||
Trie& operator=(const Trie&) = delete;
|
||||
|
||||
Trie(Trie&&) noexcept = default;
|
||||
Trie& operator=(Trie&&) noexcept = default;
|
||||
|
||||
private:
|
||||
bc_type bc_{};
|
||||
BitVector terminal_flags_{};
|
||||
Vector<char> tail_{};
|
||||
BitVector boundary_flags_{}; // used if binary_mode_ == true
|
||||
Vector<uint8_t> alphabet_{};
|
||||
uint8_t table_[512]{}; // table[table[c] + 256] = c
|
||||
|
||||
size_t num_keys_{};
|
||||
size_t max_length_{};
|
||||
bool bin_mode_{};
|
||||
|
||||
id_type to_key_id_(id_type node_id) const {
|
||||
return terminal_flags_.rank(node_id);
|
||||
};
|
||||
id_type to_node_id_(id_type string_id) const {
|
||||
return terminal_flags_.select(string_id);
|
||||
};
|
||||
id_type code_(char c) const {
|
||||
return table_[static_cast<uint8_t>(c)];
|
||||
}
|
||||
char edge_(id_type node_id, id_type child_id) const {
|
||||
return static_cast<char>(table_[(bc_.base(node_id) ^ child_id) + 256]);
|
||||
}
|
||||
|
||||
bool match_suffix_(std::string_view key, size_t pos, size_t tail_pos) const {
|
||||
assert(pos <= key.length());
|
||||
|
||||
if (pos == key.length()) {
|
||||
return tail_pos == 0;
|
||||
}
|
||||
|
||||
if (bin_mode_) {
|
||||
do {
|
||||
if (key[pos] != tail_[tail_pos]) {
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
if (boundary_flags_[tail_pos]) {
|
||||
return pos == key.length();
|
||||
}
|
||||
++tail_pos;
|
||||
} while (pos < key.length());
|
||||
return false;
|
||||
} else {
|
||||
do {
|
||||
if (!tail_[tail_pos] || key[pos] != tail_[tail_pos]) {
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
++tail_pos;
|
||||
} while (pos < key.length());
|
||||
return !tail_[tail_pos];
|
||||
}
|
||||
}
|
||||
|
||||
void extract_suffix_(size_t tail_pos, std::string& dec) const {
|
||||
if (bin_mode_) {
|
||||
if (tail_pos != 0) {
|
||||
do {
|
||||
dec += tail_[tail_pos];
|
||||
} while (!boundary_flags_[tail_pos++]);
|
||||
}
|
||||
} else {
|
||||
while (tail_[tail_pos] != '\0') {
|
||||
dec += tail_[tail_pos];
|
||||
++tail_pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool next_prefix_(PrefixIterator* it) const {
|
||||
if (it->end_flag_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (it->begin_flag_) {
|
||||
it->begin_flag_ = false;
|
||||
if (terminal_flags_[it->node_id_]) {
|
||||
it->id_ = to_key_id_(it->node_id_);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
while (!bc_.is_leaf(it->node_id_)) {
|
||||
id_type child_id = bc_.base(it->node_id_) ^code_(it->key_[it->pos_++]);
|
||||
if (bc_.check(child_id) != it->node_id_) {
|
||||
it->end_flag_ = true;
|
||||
it->id_ = NOT_FOUND;
|
||||
return false;
|
||||
}
|
||||
it->node_id_ = child_id;
|
||||
if (!bc_.is_leaf(it->node_id_) && terminal_flags_[it->node_id_]) {
|
||||
it->id_ = to_key_id_(it->node_id_);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
it->end_flag_ = true;
|
||||
size_t tail_pos = bc_.link(it->node_id_);
|
||||
|
||||
if (!match_suffix_(it->key_, it->pos_, tail_pos)) {
|
||||
it->id_ = NOT_FOUND;
|
||||
return false;
|
||||
}
|
||||
|
||||
it->pos_ = it->key_.length();
|
||||
it->id_ = to_key_id_(it->node_id_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool next_predictive_(PredictiveIterator* it) const {
|
||||
if (it->end_flag_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (it->begin_flag_) {
|
||||
it->begin_flag_ = false;
|
||||
|
||||
id_type node_id = 0;
|
||||
size_t pos = 0;
|
||||
|
||||
for (; pos < it->key_.length(); ++pos) {
|
||||
if (bc_.is_leaf(node_id)) {
|
||||
it->end_flag_ = true;
|
||||
|
||||
size_t tail_pos = bc_.link(node_id);
|
||||
if (tail_pos == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (bin_mode_) {
|
||||
do {
|
||||
if (it->key_[pos] != tail_[tail_pos]) {
|
||||
return false;
|
||||
}
|
||||
it->buf_ += it->key_[pos++];
|
||||
if (boundary_flags_[tail_pos]) {
|
||||
if (pos == it->key_.length()) {
|
||||
it->id_ = to_key_id_(node_id);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
++tail_pos;
|
||||
} while (pos < it->key_.length());
|
||||
} else {
|
||||
do {
|
||||
if (it->key_[pos] != tail_[tail_pos] || !tail_[tail_pos]) {
|
||||
return false;
|
||||
}
|
||||
it->buf_ += it->key_[pos++];
|
||||
++tail_pos;
|
||||
} while (pos < it->key_.length());
|
||||
}
|
||||
|
||||
it->id_ = to_key_id_(node_id);
|
||||
extract_suffix_(tail_pos, it->buf_);
|
||||
return true;
|
||||
}
|
||||
|
||||
id_type child_id = bc_.base(node_id) ^code_(it->key_[pos]);
|
||||
|
||||
if (bc_.check(child_id) != node_id) {
|
||||
it->end_flag_ = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
node_id = child_id;
|
||||
it->buf_ += it->key_[pos];
|
||||
}
|
||||
|
||||
if (!it->buf_.empty()) {
|
||||
it->stack_.push_back({pos, it->buf_.back(), node_id});
|
||||
} else {
|
||||
it->stack_.push_back({pos, '\0', node_id});
|
||||
}
|
||||
}
|
||||
|
||||
while (!it->stack_.empty()) {
|
||||
id_type node_id = it->stack_.back().node_id;
|
||||
size_t depth = it->stack_.back().depth;
|
||||
uint8_t c = it->stack_.back().c;
|
||||
it->stack_.pop_back();
|
||||
|
||||
if (0 < depth) {
|
||||
it->buf_.resize(depth);
|
||||
it->buf_.back() = c;
|
||||
}
|
||||
|
||||
if (bc_.is_leaf(node_id)) {
|
||||
it->id_ = to_key_id_(node_id);
|
||||
extract_suffix_(bc_.link(node_id), it->buf_);
|
||||
return true;
|
||||
}
|
||||
|
||||
const id_type base = bc_.base(node_id);
|
||||
|
||||
// For lex sort
|
||||
for (auto rit = std::rbegin(alphabet_);
|
||||
rit != std::rend(alphabet_); ++rit) {
|
||||
const id_type child_id = base ^code_(*rit);
|
||||
if (bc_.check(child_id) == node_id) {
|
||||
it->stack_.push_back(
|
||||
{depth + 1, static_cast<char>(*rit), child_id}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (terminal_flags_[node_id]) {
|
||||
it->id_ = to_key_id_(node_id);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
it->end_flag_ = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
friend class TrieBuilder;
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_TRIE_HPP_
|
|
@ -1,117 +0,0 @@
|
|||
#ifndef XCDAT_TRIE_BUILDER_HPP_
|
||||
#define XCDAT_TRIE_BUILDER_HPP_
|
||||
|
||||
#include "Trie.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Double-array trie builder.
|
||||
class TrieBuilder {
|
||||
public:
|
||||
// Builds the dictionary from given string keys. The keys must be sorted in
|
||||
// lexicographical order without duplication. Any error in construction is
|
||||
// reported by TrieBuilder::Exception. If the keys include the ASCII zero
|
||||
// code, pass bin_mode = true.
|
||||
template<bool Fast>
|
||||
static Trie<Fast>
|
||||
build(const std::vector<std::string_view>& keys, bool bin_mode = false) {
|
||||
TrieBuilder builder(keys, Trie<Fast>::bc_type::WIDTH_L1, bin_mode);
|
||||
|
||||
Trie<Fast> trie;
|
||||
|
||||
trie.bc_ = typename Trie<Fast>::bc_type(builder.bc_, builder.leaf_flags_);
|
||||
trie.terminal_flags_ = BitVector(builder.term_flags_, true, true);
|
||||
trie.tail_ = Vector<char>(builder.tail_);
|
||||
trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false);
|
||||
trie.alphabet_ = Vector<uint8_t>(builder.alphabet_);
|
||||
std::swap(trie.table_, builder.table_);
|
||||
|
||||
trie.num_keys_ = keys.size();
|
||||
trie.max_length_ = builder.max_length_;
|
||||
trie.bin_mode_ = builder.bin_mode_;
|
||||
|
||||
return trie;
|
||||
}
|
||||
|
||||
// Exception class for xcdat::TrieBuilder
|
||||
class Exception : public std::exception {
|
||||
public:
|
||||
explicit Exception(std::string message) : message_(std::move(message)) {}
|
||||
~Exception() throw() override {};
|
||||
|
||||
// overrides what() of std::exception.
|
||||
const char* what() const throw() override {
|
||||
return message_.c_str();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string message_;
|
||||
};
|
||||
|
||||
TrieBuilder(const TrieBuilder&) = delete;
|
||||
TrieBuilder& operator=(const TrieBuilder&) = delete;
|
||||
|
||||
private:
|
||||
struct Suffix {
|
||||
std::string_view str;
|
||||
id_type node_id;
|
||||
|
||||
size_t length() const {
|
||||
return str.length();
|
||||
}
|
||||
char operator[](size_t i) const {
|
||||
return str[length() - i - 1];
|
||||
}
|
||||
|
||||
std::reverse_iterator<const char*> rbegin() const {
|
||||
return std::make_reverse_iterator(str.data() + str.length());
|
||||
}
|
||||
std::reverse_iterator<const char*> rend() const {
|
||||
return std::make_reverse_iterator(str.data());
|
||||
}
|
||||
};
|
||||
|
||||
// To avoid undefined traversal
|
||||
static constexpr id_type TABOO_ID = 1;
|
||||
// From darts-clone setting
|
||||
static constexpr id_type FREE_BLOCKS = 16;
|
||||
|
||||
const std::vector<std::string_view>& keys_;
|
||||
const id_type block_size_;
|
||||
const id_type width_L1_;
|
||||
|
||||
bool bin_mode_{};
|
||||
|
||||
std::vector<BcPair> bc_{};
|
||||
BitVectorBuilder leaf_flags_{};
|
||||
BitVectorBuilder term_flags_{};
|
||||
std::vector<char> tail_{};
|
||||
BitVectorBuilder boundary_flags_{};
|
||||
std::vector<uint8_t> alphabet_{};
|
||||
uint8_t table_[512]{};
|
||||
|
||||
std::vector<bool> used_flags_{};
|
||||
std::vector<uint8_t> edges_{};
|
||||
std::vector<id_type> heads_{};
|
||||
std::vector<Suffix> suffixes_{};
|
||||
|
||||
size_t max_length_{};
|
||||
|
||||
TrieBuilder(const std::vector<std::string_view>& keys,
|
||||
id_type width_L1, bool bin_mode);
|
||||
~TrieBuilder() = default;
|
||||
|
||||
void build_table_();
|
||||
void build_bc_(size_t begin, size_t end, size_t depth, id_type node_id);
|
||||
void build_tail_();
|
||||
|
||||
void expand_();
|
||||
void use_(id_type node_id);
|
||||
void close_block_(id_type block_id);
|
||||
id_type find_base_(id_type block_id) const;
|
||||
bool is_target_(id_type base) const;
|
||||
};
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_TRIE_BUILDER_HPP_
|
|
@ -1,91 +0,0 @@
|
|||
#ifndef XCDAT_VECTOR_HPP
|
||||
#define XCDAT_VECTOR_HPP
|
||||
|
||||
#include "xcdat_basics.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Simple vector of a POD type
|
||||
template<typename T>
|
||||
class Vector {
|
||||
public:
|
||||
static_assert(!std::is_same<T, bool>::value, "Type bool is not supported.");
|
||||
static_assert(std::is_pod<T>::value, "T is not POD.");
|
||||
|
||||
Vector() = default;
|
||||
|
||||
explicit Vector(std::istream& is) {
|
||||
size_ = read_value<size_t>(is);
|
||||
vec_.resize(size_);
|
||||
is.read(reinterpret_cast<char*>(&vec_[0]), sizeof(T) * size_);
|
||||
data_ = vec_.data();
|
||||
}
|
||||
|
||||
explicit Vector(std::vector<T>& vec) {
|
||||
if (vec.size() != vec.capacity()) {
|
||||
vec.shrink_to_fit();
|
||||
}
|
||||
vec_ = std::move(vec);
|
||||
data_ = vec_.data();
|
||||
size_ = vec_.size();
|
||||
}
|
||||
|
||||
~Vector() = default;
|
||||
|
||||
const T& operator[](size_t i) const {
|
||||
return data_[i];
|
||||
}
|
||||
const T* data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
const T* begin() const {
|
||||
return data_;
|
||||
}
|
||||
const T* end() const {
|
||||
return data_ + size_;
|
||||
}
|
||||
|
||||
std::reverse_iterator<const T*> rbegin() const {
|
||||
return std::make_reverse_iterator(end());
|
||||
}
|
||||
std::reverse_iterator<const T*> rend() const {
|
||||
return std::make_reverse_iterator(begin());
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return size_ == 0;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
size_t size_in_bytes() const {
|
||||
return size_ * sizeof(T) + sizeof(size_);
|
||||
}
|
||||
|
||||
void write(std::ostream& os) const {
|
||||
write_value(size_, os);
|
||||
os.write(reinterpret_cast<const char*>(data_), sizeof(T) * size_);
|
||||
}
|
||||
|
||||
void swap(Vector& rhs) {
|
||||
std::swap(*this, rhs);
|
||||
}
|
||||
|
||||
Vector(const Vector&) = delete;
|
||||
Vector& operator=(const Vector&) = delete;
|
||||
|
||||
Vector(Vector&&) noexcept = default;
|
||||
Vector& operator=(Vector&&) noexcept = default;
|
||||
|
||||
private:
|
||||
const T* data_ {};
|
||||
size_t size_ {};
|
||||
std::vector<T> vec_ {};
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif //XCDAT_VECTOR_HPP
|
114
include/xcdat/bit_tools.hpp
Normal file
114
include/xcdat/bit_tools.hpp
Normal file
|
@ -0,0 +1,114 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <nmmintrin.h>
|
||||
|
||||
// From https://github.com/ot/succinct
|
||||
namespace xcdat::bit_tools {
|
||||
|
||||
// Broadword masks. ones_step_K has bit 0 of every K-bit field set (ones_step_9
// covers seven 9-bit fields); msbs_step_K has the top bit of every K-bit field
// set. Declared `inline constexpr` so that every translation unit including
// this header refers to the same entity.
inline constexpr std::uint64_t ones_step_4 = 0x1111111111111111ULL;
inline constexpr std::uint64_t ones_step_8 = 0x0101010101010101ULL;
inline constexpr std::uint64_t ones_step_9 = 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 |  //
                                             1ULL << 36 | 1ULL << 45 | 1ULL << 54;
inline constexpr std::uint64_t msbs_step_8 = 0x80ULL * ones_step_8;
inline constexpr std::uint64_t msbs_step_9 = 0x100ULL * ones_step_9;
|
||||
|
||||
// Counts the set bits of x. Uses the compiler builtin when SSE4.2 is enabled,
// otherwise a branch-free parallel bit count.
inline std::uint64_t popcount(std::uint64_t x) {
#ifdef __SSE4_2__
    const auto ones = __builtin_popcountll(x);
    return static_cast<std::uint64_t>(ones);
#else
    constexpr std::uint64_t pair_mask = 0x5555555555555555ULL;
    constexpr std::uint64_t nibble_mask = 0x3333333333333333ULL;
    constexpr std::uint64_t byte_mask = 0x0F0F0F0F0F0F0F0FULL;
    constexpr std::uint64_t byte_ones = 0x0101010101010101ULL;

    x -= (x >> 1) & pair_mask;                             // 2-bit partial sums
    x = (x & nibble_mask) + ((x >> 2) & nibble_mask);     // 4-bit partial sums
    x = (x + (x >> 4)) & byte_mask;                        // 8-bit partial sums
    return (byte_ones * x) >> 56;                          // total lands in the top byte
#endif
}
|
||||
|
||||
// Returns the index (0..63) of the highest set bit of x.
// x == 0 has no set bit and yields 0 (the builtin is undefined for zero).
inline std::uint64_t msb(std::uint64_t x) {
    if (x == 0) {
        return 0;
    }
    const int leading_zeros = __builtin_clzll(x);
    return 63 - leading_zeros;
}
|
||||
|
||||
inline std::uint64_t uleq_step_9(std::uint64_t x, std::uint64_t y) {
|
||||
return (((((y | msbs_step_9) - (x & ~msbs_step_9)) | (x ^ y)) ^ (x & ~y)) & msbs_step_9) >> 8;
|
||||
}
|
||||
|
||||
// Computes the population count of every byte of x in parallel: byte i of the
// result holds the number of set bits in byte i of x.
inline std::uint64_t byte_counts(std::uint64_t x) {
    constexpr std::uint64_t odd_bits = 0xa * ones_step_4;     // 0xAAAA...
    constexpr std::uint64_t pair_mask = 3 * ones_step_4;      // 0x3333...
    constexpr std::uint64_t nibble_mask = 0x0f * ones_step_8;  // 0x0F0F...

    x -= (x & odd_bits) >> 1;                      // 2-bit partial sums
    x = (x & pair_mask) + ((x >> 2) & pair_mask);  // 4-bit partial sums
    return (x + (x >> 4)) & nibble_mask;           // per-byte sums
}
|
||||
|
||||
static constexpr std::uint8_t select_in_byte[2048] = {
|
||||
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1,
|
||||
0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,
|
||||
1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,
|
||||
0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
|
||||
2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1,
|
||||
0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0,
|
||||
1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8,
|
||||
4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1,
|
||||
4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2,
|
||||
1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2,
|
||||
2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
|
||||
2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1,
|
||||
4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3,
|
||||
1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8,
|
||||
8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8,
|
||||
6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2,
|
||||
6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4,
|
||||
2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3,
|
||||
3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6,
|
||||
5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8,
|
||||
5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6,
|
||||
6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8,
|
||||
8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5,
|
||||
8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
|
||||
3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5,
|
||||
5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8,
|
||||
6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8,
|
||||
8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8,
|
||||
7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6,
|
||||
8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
|
||||
8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
|
||||
|
||||
inline std::uint64_t select_in_word(const std::uint64_t x, const std::uint64_t k) {
|
||||
#ifdef __BMI2__
|
||||
return _tzcnt_u64(_pdep_u64(1ULL << k, x));
|
||||
#else
|
||||
const std::uint64_t byte_sums = byte_counts(x) * ones_step_8;
|
||||
const std::uint64_t k_step_8 = k * ones_step_8;
|
||||
const std::uint64_t geq_k_step_8 = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8);
|
||||
const std::uint64_t place = popcount(geq_k_step_8) * 8;
|
||||
const std::uint64_t byte_rank = k - (((byte_sums << 8) >> place) & 0xFFULL);
|
||||
return place + select_in_byte[((x >> place) & 0xFF) | (byte_rank << 8)];
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace xcdat::bit_tools
|
250
include/xcdat/bit_vector.hpp
Normal file
250
include/xcdat/bit_vector.hpp
Normal file
|
@ -0,0 +1,250 @@
|
|||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <numeric>
|
||||
|
||||
#include "mm_vector.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
//! Rank9 implementatoin
|
||||
class bit_vector {
|
||||
public:
|
||||
class builder {
|
||||
private:
|
||||
std::vector<std::uint64_t> m_bits;
|
||||
std::uint64_t m_size = 0;
|
||||
|
||||
public:
|
||||
builder() = default;
|
||||
|
||||
builder(std::uint64_t size) {
|
||||
resize(size);
|
||||
}
|
||||
|
||||
inline void push_back(bool x) {
|
||||
if (m_size % 64 == 0) {
|
||||
m_bits.push_back(0);
|
||||
}
|
||||
if (x) {
|
||||
set_bit(m_size, true);
|
||||
}
|
||||
m_size += 1;
|
||||
}
|
||||
|
||||
inline bool operator[](std::uint64_t i) const {
|
||||
return m_bits[i / 64] & (1ULL << (i % 64));
|
||||
}
|
||||
|
||||
inline void set_bit(std::uint64_t i, bool x = true) {
|
||||
if (x) {
|
||||
m_bits[i / 64] |= (1ULL << (i % 64));
|
||||
} else {
|
||||
m_bits[i / 64] &= (~(1ULL << (i % 64)));
|
||||
}
|
||||
}
|
||||
|
||||
inline void resize(std::uint64_t size) {
|
||||
m_bits.resize(utils::words_for_bits(size), 0ULL);
|
||||
m_size = size;
|
||||
}
|
||||
|
||||
inline void reserve(std::uint64_t capacity) {
|
||||
m_bits.reserve(utils::words_for_bits(capacity));
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
friend class bit_vector;
|
||||
};
|
||||
|
||||
static constexpr std::uint64_t block_size = 8; // i.e., 64 * 8 bits
|
||||
static constexpr std::uint64_t selects_per_hint = 64 * block_size * 2;
|
||||
|
||||
private:
|
||||
mm_vector<std::uint64_t> m_bits;
|
||||
mm_vector<std::uint64_t> m_rank_hints;
|
||||
mm_vector<std::uint64_t> m_select_hints;
|
||||
std::uint64_t m_size = 0;
|
||||
std::uint64_t m_num_ones = 0;
|
||||
|
||||
public:
|
||||
bit_vector() = default;
|
||||
|
||||
bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) {
|
||||
build(b, enable_rank, enable_select);
|
||||
}
|
||||
|
||||
virtual ~bit_vector() = default;
|
||||
|
||||
void reset() {
|
||||
m_bits.reset();
|
||||
m_rank_hints.reset();
|
||||
m_select_hints.reset();
|
||||
m_size = 0;
|
||||
m_num_ones = 0;
|
||||
}
|
||||
|
||||
void build(builder& b, bool enable_rank = false, bool enable_select = false) {
|
||||
reset();
|
||||
m_bits.steal(b.m_bits);
|
||||
m_size = b.m_size;
|
||||
m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL,
|
||||
[](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); });
|
||||
if (enable_rank) {
|
||||
build_rank_hints();
|
||||
}
|
||||
if (enable_rank and enable_select) {
|
||||
build_select_hints();
|
||||
}
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_ones() const {
|
||||
return m_num_ones;
|
||||
}
|
||||
|
||||
inline bool operator[](std::uint64_t i) const {
|
||||
return m_bits[i / 64] & (1ULL << (i % 64));
|
||||
}
|
||||
|
||||
// The number of 1s in B[0..i)
|
||||
inline std::uint64_t rank(std::uint64_t i) const {
|
||||
if (i == size()) {
|
||||
return num_ones();
|
||||
}
|
||||
const auto [wi, wj] = utils::decompose<64>(i);
|
||||
return rank_for_word(wi) + (wj != 0 ? bit_tools::popcount(m_bits[wi] << (64 - wj)) : 0);
|
||||
}
|
||||
|
||||
// The largest position
|
||||
inline std::uint64_t select(std::uint64_t n) const {
|
||||
const std::uint64_t bi = select_for_block(n);
|
||||
assert(bi < num_blocks());
|
||||
|
||||
std::uint64_t curr_rank = rank_for_block(bi);
|
||||
assert(curr_rank <= n);
|
||||
|
||||
std::uint64_t rank_in_block_parallel = (n - curr_rank) * bit_tools::ones_step_9;
|
||||
std::uint64_t sub_ranks = ranks_in_block(bi);
|
||||
std::uint64_t sub_block_offset =
|
||||
bit_tools::uleq_step_9(sub_ranks, rank_in_block_parallel) * bit_tools::ones_step_9 >> 54 & 0x7;
|
||||
curr_rank += sub_ranks >> (7 - sub_block_offset) * 9 & 0x1FF;
|
||||
assert(curr_rank <= n);
|
||||
|
||||
std::uint64_t word_offset = (bi * block_size) + sub_block_offset;
|
||||
return word_offset * 64 + bit_tools::select_in_word(m_bits[word_offset], n - curr_rank);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t num_blocks() const {
|
||||
return m_rank_hints.size() / 2 - 1;
|
||||
}
|
||||
|
||||
// Absolute rank until the bi-th block
|
||||
inline std::uint64_t rank_for_block(std::uint64_t bi) const {
|
||||
return m_rank_hints[bi * 2];
|
||||
}
|
||||
|
||||
// Packed ranks in the bi-th block
|
||||
inline std::uint64_t ranks_in_block(std::uint64_t bi) const {
|
||||
return m_rank_hints[bi * 2 + 1];
|
||||
}
|
||||
|
||||
// Absolute rank until the wi-th word
|
||||
inline std::uint64_t rank_for_word(std::uint64_t wi) const {
|
||||
const auto [bi, bj] = utils::decompose<block_size>(wi);
|
||||
return rank_for_block(bi) + rank_in_block(bi, bj);
|
||||
}
|
||||
|
||||
// Relative rank in the bi-th block
|
||||
inline std::uint64_t rank_in_block(std::uint64_t bi, std::uint64_t bj) const {
|
||||
return ranks_in_block(bi) >> ((7 - bj) * 9) & 0x1FF;
|
||||
}
|
||||
|
||||
inline std::uint64_t select_for_block(std::uint64_t n) const {
|
||||
auto [a, b] = select_with_hint(n);
|
||||
while (b - a > 1) {
|
||||
const std::uint64_t lb = a + (b - a) / 2;
|
||||
if (rank_for_block(lb) <= n) {
|
||||
a = lb;
|
||||
} else {
|
||||
b = lb;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
inline std::tuple<std::uint64_t, std::uint64_t> select_with_hint(std::uint64_t n) const {
|
||||
const std::uint64_t i = n / selects_per_hint;
|
||||
return {i != 0 ? m_select_hints[i - 1] : 0, m_select_hints[i] + 1};
|
||||
}
|
||||
|
||||
void build_rank_hints() {
|
||||
std::uint64_t curr_num_ones = 0;
|
||||
std::uint64_t curr_num_ones_in_block = 0;
|
||||
std::uint64_t curr_ranks_in_block = 0;
|
||||
|
||||
const std::uint64_t num_words = m_bits.size();
|
||||
std::vector<std::uint64_t> rank_hints = {curr_num_ones};
|
||||
|
||||
for (std::uint64_t wi = 0; wi < num_words; wi++) {
|
||||
const std::uint64_t bi = wi % block_size; // Relative position in the block
|
||||
const std::uint64_t num_ones_in_word = bit_tools::popcount(m_bits[wi]);
|
||||
|
||||
if (bi != 0) {
|
||||
curr_ranks_in_block <<= 9;
|
||||
curr_ranks_in_block |= curr_num_ones_in_block;
|
||||
}
|
||||
|
||||
curr_num_ones += num_ones_in_word;
|
||||
curr_num_ones_in_block += num_ones_in_word;
|
||||
|
||||
if (bi == block_size - 1) {
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
rank_hints.push_back(curr_num_ones);
|
||||
curr_num_ones_in_block = 0;
|
||||
curr_ranks_in_block = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Padding the remaining hints
|
||||
const std::uint64_t remain = block_size - (num_words % block_size);
|
||||
for (std::uint64_t wi = 0; wi < remain; wi++) {
|
||||
curr_ranks_in_block <<= 9;
|
||||
curr_ranks_in_block |= curr_num_ones_in_block;
|
||||
}
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
|
||||
// Sentinel
|
||||
if (num_words % block_size != 0) {
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
rank_hints.push_back(0);
|
||||
}
|
||||
|
||||
// Release
|
||||
m_rank_hints.steal(rank_hints);
|
||||
}
|
||||
|
||||
void build_select_hints() {
|
||||
std::vector<std::uint64_t> select_hints;
|
||||
std::uint64_t threshold = selects_per_hint;
|
||||
for (std::uint64_t bi = 0; bi < num_blocks(); ++bi) {
|
||||
if (rank_for_block(bi + 1) > threshold) {
|
||||
select_hints.push_back(bi);
|
||||
threshold += selects_per_hint;
|
||||
}
|
||||
}
|
||||
select_hints.push_back(num_blocks());
|
||||
m_select_hints.steal(select_hints);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
58
include/xcdat/mm_vector.hpp
Normal file
58
include/xcdat/mm_vector.hpp
Normal file
|
@ -0,0 +1,58 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
//! Read-only wrapper around a std::vector<T> whose contents are taken over
//! ("stolen") from a caller-provided vector.
template <class T>
class mm_vector {
  private:
    std::vector<T> m_vec;

  public:
    mm_vector() = default;

    virtual ~mm_vector() = default;

    // NOTE: The input vector is stolen.
    mm_vector(std::vector<T>& vec) {
        m_vec.swap(vec);
        m_vec.shrink_to_fit();
    }

    //! Takes over the contents of `vec`; `vec` receives the previous contents of this object.
    void steal(std::vector<T>& vec) {
        m_vec.swap(vec);
        m_vec.shrink_to_fit();
    }

    //! Drops the contents and the allocated storage.
    void reset() {
        std::vector<T>().swap(m_vec);
    }

    //! Returns the number of elements.
    inline std::uint64_t size() const {
        return m_vec.size();
    }

    //! Returns a const iterator to the first element.
    inline auto begin() const {
        return m_vec.cbegin();
    }

    //! Returns a const iterator past the last element.
    inline auto end() const {
        return m_vec.cend();
    }

    //! Returns the i-th element (no bounds check).
    inline const T& operator[](std::uint64_t i) const {
        return m_vec[i];
    }

    //! Returns a pointer to the underlying contiguous storage.
    inline const T* data() const {
        return m_vec.data();
    }

    //! Passes the underlying vector to `visitor.visit()`.
    template <typename Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_vec);
    }
};
|
||||
|
||||
} // namespace xcdat
|
24
include/xcdat/utils.hpp
Normal file
24
include/xcdat/utils.hpp
Normal file
|
@ -0,0 +1,24 @@
|
|||
#pragma once
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "bit_tools.hpp"
|
||||
|
||||
namespace xcdat::utils {
|
||||
|
||||
//! Splits x into (x / N, x % N).
template <std::uint64_t N>
constexpr std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
    const std::uint64_t quotient = x / N;
    const std::uint64_t remainder = x % N;
    return {quotient, remainder};
}
|
||||
|
||||
//! Returns how many words of type T are needed to hold nbits bits (rounded up).
template <class T = std::uint64_t>
constexpr std::uint64_t words_for_bits(std::uint64_t nbits) {
    constexpr std::uint64_t bits_per_word = 8 * sizeof(T);
    return (nbits + (bits_per_word - 1)) / bits_per_word;
}
|
||||
|
||||
inline std::uint64_t bits_for_int(std::uint64_t x) {
|
||||
return (x > 1) ? bit_tools::msb(x - 1) + 1 : 0;
|
||||
}
|
||||
|
||||
} // namespace xcdat::utils
|
|
@ -1,58 +0,0 @@
|
|||
#ifndef XCDAT_BASICS_HPP_
|
||||
#define XCDAT_BASICS_HPP_
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include "xcdat_config.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
#ifdef XCDAT_X64
|
||||
using id_type = uint64_t;
|
||||
#else
|
||||
using id_type = uint32_t;
|
||||
#endif
|
||||
|
||||
constexpr id_type ID_MAX = std::numeric_limits<id_type>::max();
|
||||
|
||||
struct BcPair {
|
||||
id_type base;
|
||||
id_type check;
|
||||
};
|
||||
|
||||
// Prints "<str>\t<size>" followed by a newline to `os` and flushes the stream.
inline void show_size(const char* str, double size, std::ostream& os) {
  os << str << "\t";
  os << size;
  os << std::endl;
}
|
||||
|
||||
// Prints "<str>\t<size>" followed by a newline to `os` and flushes the stream.
inline void show_size(const char* str, size_t size, std::ostream& os) {
  os << str << "\t";
  os << size;
  os << std::endl;
}
|
||||
|
||||
// Prints "<str>\t<size>\t<size / denom>" followed by a newline to `os` and
// flushes the stream. The ratio is computed in double precision.
inline void show_size_ratio(const char* str, size_t size, size_t denom, std::ostream& os) {
  const double ratio = static_cast<double>(size) / denom;
  os << str << "\t" << size << "\t" << ratio << std::endl;
}
|
||||
|
||||
// Writes the object representation of `val` (sizeof(T) raw bytes) to `os`.
template<typename T>
inline void write_value(const T val, std::ostream& os) {
  const char* raw_bytes = reinterpret_cast<const char*>(&val);
  os.write(raw_bytes, sizeof(T));
}
|
||||
|
||||
// Reads sizeof(T) raw bytes from `is` and returns them as a T.
// The value is zero-initialized first, so a failed read (e.g. at end of stream)
// returns T{} instead of an indeterminate value. A short read may still leave
// the value partially filled, so callers should check the stream state.
template<typename T>
inline T read_value(std::istream& is) {
  T val{};
  is.read(reinterpret_cast<char*>(&val), sizeof(val));
  return val;
}
|
||||
|
||||
} //namespace - xcdat
|
||||
|
||||
#endif //XCDAT_BASICS_HPP_
|
|
@ -1,3 +0,0 @@
|
|||
|
||||
add_executable(sample sample.cpp)
|
||||
target_link_libraries(sample xcdat)
|
|
@ -1,92 +0,0 @@
|
|||
#include <iostream>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
std::vector<std::string> keys_buf = {
|
||||
"Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin",
|
||||
"Hazuki", "Umiko", "Nene", "Nenecchi"
|
||||
};
|
||||
|
||||
// Convert to the input format
|
||||
std::vector<std::string_view> keys(keys_buf.size());
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
keys[i] = std::string_view{keys_buf[i]};
|
||||
}
|
||||
|
||||
// Input data must be sorted.
|
||||
std::sort(std::begin(keys), std::end(keys));
|
||||
|
||||
// Dictionary class
|
||||
using Trie = xcdat::Trie<true>;
|
||||
|
||||
try {
|
||||
// Builds a dictionary from the keys
|
||||
Trie trie = xcdat::TrieBuilder::build<true>(keys); // move
|
||||
|
||||
// Writes the dictionary to a file.
|
||||
std::ofstream ofs{"sample.bin"};
|
||||
trie.write(ofs);
|
||||
} catch (const xcdat::TrieBuilder::Exception& ex) {
|
||||
// Abort if something went wrong...
|
||||
std::cerr << ex.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Creates an empty dictionary
|
||||
Trie trie;
|
||||
{
|
||||
// Reads the dictionary to the file.
|
||||
std::ifstream ifs{"sample.bin"};
|
||||
trie = Trie{ifs}; // move
|
||||
}
|
||||
|
||||
std::cout << "Performing basic operations..." << std::endl;
|
||||
{
|
||||
// lookup() obtains the unique ID for a given key
|
||||
xcdat::id_type key_id = trie.lookup("Rin");
|
||||
// access() decodes the key from a given ID
|
||||
std::cout << key_id << " : " << trie.access(key_id) << std::endl;
|
||||
|
||||
// Given an unregistered key, lookup() returns NOT_FOUND.
|
||||
if (trie.lookup("Hotaru") == Trie::NOT_FOUND) {
|
||||
std::cout << "? : " << "Hotaru" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Performing a common prefix operation..." << std::endl;
|
||||
{
|
||||
// Common prefix operation is implemented using PrefixIterator, created by
|
||||
// make_prefix_iterator().
|
||||
Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi");
|
||||
|
||||
// next() continues to obtain the next key until false is returned.
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Performing a predictive operation..." << std::endl;
|
||||
{
|
||||
// Predictive operation is implemented using PredictiveIterator, created by
|
||||
// make_predictive_iterator().
|
||||
Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha");
|
||||
|
||||
// next() continues to obtain the next key until false is returned in
|
||||
// lexicographical order.
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Enumerating all registered keys..." << std::endl;
|
||||
{
|
||||
// PredictiveIterator for an empty string provides enumeration of all
|
||||
// registered keys in lexicographical order.
|
||||
Trie::PredictiveIterator it = trie.make_predictive_iterator("");
|
||||
while (it.next()) {
|
||||
std::cout << it.id() << " : " << it.key() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,314 +0,0 @@
|
|||
#include <popcntintrin.h>
|
||||
|
||||
#include "xcdat/BitVector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// inspired by marisa-trie
|
||||
constexpr uint8_t SELECT_TABLE[9][256] = {
|
||||
{
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
},
|
||||
{
|
||||
8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
|
||||
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1
|
||||
},
|
||||
{
|
||||
8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
|
||||
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2,
|
||||
7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
|
||||
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
|
||||
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2,
|
||||
7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
|
||||
7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
|
||||
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3,
|
||||
8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3,
|
||||
8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3,
|
||||
8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3,
|
||||
7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3,
|
||||
8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3,
|
||||
8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3,
|
||||
8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3,
|
||||
8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3,
|
||||
7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4,
|
||||
8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
|
||||
8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4,
|
||||
8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
|
||||
8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7
|
||||
},
|
||||
{
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the number of set bits in the 32-bit word x.
// Uses the POPCNT intrinsic when XCDAT_USE_POPCNT is defined; otherwise sums
// bits in parallel (pairs -> nibbles -> bytes) and folds the byte sums.
uint32_t pop_count(uint32_t x) {
#ifdef XCDAT_USE_POPCNT
  const auto hw_count = _mm_popcnt_u32(x);
  return static_cast<uint32_t>(hw_count);
#else
  const uint32_t pair_sums = ((x & 0xAAAAAAAA) >> 1) + (x & 0x55555555);
  const uint32_t nibble_sums = ((pair_sums & 0xCCCCCCCC) >> 2) + (pair_sums & 0x33333333);
  uint32_t folded = ((nibble_sums >> 4) + nibble_sums) & 0x0F0F0F0F;
  folded += folded >> 8;
  folded += folded >> 16;
  // Only the low 6 bits carry the total (at most 32).
  return folded & 0x3F;
#endif
}
|
||||
|
||||
// Deserializing constructor. Fields are read from `is` in this fixed order:
// bit words, rank tips, select tips, number of bits, number of 1s.
BitVector::BitVector(std::istream& is) {
  this->bits_ = Vector<uint32_t>(is);
  this->rank_tips_ = Vector<RankTip>(is);
  this->select_tips_ = Vector<id_type>(is);

  const auto num_bits = read_value<size_t>(is);
  this->size_ = num_bits;

  const auto num_ones = read_value<size_t>(is);
  this->num_1s_ = num_ones;
}
|
||||
|
||||
// Builds the bit vector from `builder`.
// An empty builder leaves the object default-initialized. Rank tips are built
// only if `rank_flag`; select tips only if both `rank_flag` and `select_flag`.
BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag,
                     bool select_flag) {
  if (builder.size() == 0) {
    return;
  }

  bits_ = Vector<uint32_t>(builder.bits_);
  size_ = builder.size_;
  num_1s_ = builder.num_1s_;

  // builds rank_tips_
  if (rank_flag) {
    std::vector<RankTip> tips(size_ / BITS_IN_R1 + 1);
    id_type ones_so_far = 0;
    for (id_type tip_id = 0; tip_id < tips.size(); ++tip_id) {
      auto& tip = tips[tip_id];
      // L1: absolute number of 1s before this tip.
      tip.L1 = ones_so_far;
      for (id_type offset = 0; offset < R1_PER_R2; ++offset) {
        // L2: number of 1s since the start of this tip, before word `offset`.
        tip.L2[offset] = static_cast<uint8_t>(ones_so_far - tip.L1);
        const auto word_id = tip_id * R1_PER_R2 + offset;
        if (word_id >= bits_.size()) {
          continue;  // past the last word: nothing more to count
        }
        ones_so_far += pop_count(bits_[word_id]);
      }
    }
    rank_tips_ = Vector<RankTip>(tips);
  }

  // builds select_tips_
  if (!rank_flag || !select_flag) {
    return;
  }

  std::vector<id_type> tips_for_select{0};
  auto threshold = ONES_PER_TIP;
  for (id_type tip_id = 0; tip_id < rank_tips_.size(); ++tip_id) {
    // Record the tip just before the one whose L1 first exceeds the threshold.
    if (threshold < rank_tips_[tip_id].L1) {
      tips_for_select.push_back(tip_id - 1);
      threshold += ONES_PER_TIP;
    }
  }
  tips_for_select.push_back(static_cast<id_type>(rank_tips_.size() - 1));
  select_tips_ = Vector<id_type>(tips_for_select);
}
|
||||
|
||||
// Returns the number of set bits in positions [0, i).
// The answer is assembled from the block count (L1), the in-block word count
// (L2) and a popcount of the bits below position i in the containing word.
id_type BitVector::rank(id_type i) const {
  const auto& tip = rank_tips_[i / BITS_IN_R1];

  // Keep only the bits strictly below position i inside its 32-bit word.
  const auto below_mask = (1U << (i % 32)) - 1;
  const auto ones_in_word = pop_count(bits_[i / 32] & below_mask);

  return tip.L1 + tip.L2[i / BITS_IN_R2 % R1_PER_R2] + ones_in_word;
}
|
||||
|
||||
// Returns the position of the (i+1)-th set bit (i is zero-based).
// Steps: narrow the block range with select_tips_ (if built), binary-search
// rank_tips_ for the block, scan the block's L2 counters for the word, then
// locate the bit inside the word via 16-bit / 8-bit halving and SELECT_TABLE.
id_type BitVector::select(id_type i) const {
  id_type lo = 0;
  id_type hi = static_cast<id_type>(rank_tips_.size());

  if (!select_tips_.is_empty()) {
    const auto tip_id = static_cast<id_type>(i / ONES_PER_TIP);
    lo = select_tips_[tip_id];
    hi = select_tips_[tip_id + 1] + 1;
  }

  // Binary search for the last block whose L1 is <= i.
  while (lo + 1 < hi) {
    const auto mid = (lo + hi) / 2;
    if (i < rank_tips_[mid].L1) {
      hi = mid;
    } else {
      lo = mid;
    }
  }

  const auto& tip = rank_tips_[lo];

  i += 1;  // switch to a one-based count of ones
  i -= tip.L1;

  // Find the first word whose preceding-ones counter reaches i, then step
  // back to the word that actually holds the target bit.
  uint32_t offset = 1;
  while (offset < R1_PER_R2 && tip.L2[offset] < i) {
    ++offset;
  }
  --offset;
  i -= tip.L2[offset];

  auto ret = (lo * BITS_IN_R1) + (offset * BITS_IN_R2);
  auto bits = bits_[ret / 32];

  // If the low `width` bits hold fewer than i ones, discard them.
  const auto skip_low_bits = [&](uint32_t width) {
    const auto low_ones = pop_count(bits & ((1U << width) - 1));
    if (low_ones < i) {
      bits >>= width;
      ret += width;
      i -= low_ones;
    }
  };
  skip_low_bits(16);
  skip_low_bits(8);

  // The table result is one-based, hence the final -1.
  ret += SELECT_TABLE[i][bits % 256];
  return ret - 1;
}
|
||||
|
||||
// Returns the sum of the sizes reported by the three member containers plus
// the sizes of the two scalar counters.
size_t BitVector::size_in_bytes() const {
  const size_t scalars = sizeof(size_) + sizeof(num_1s_);
  return scalars + bits_.size_in_bytes() + rank_tips_.size_in_bytes()
         + select_tips_.size_in_bytes();
}
|
||||
|
||||
// Serializes the bit vector to `os`.
// The emission order (bits, rank tips, select tips, size, number of ones) is
// the order the std::istream constructor reads back.
void BitVector::write(std::ostream& os) const {
  // Containers.
  bits_.write(os);
  rank_tips_.write(os);
  select_tips_.write(os);

  // Scalar counters.
  write_value(size_, os);
  write_value(num_1s_, os);
}
|
||||
|
||||
} //namespace - xcdat
|
141
src/DacBc.cpp
141
src/DacBc.cpp
|
@ -1,141 +0,0 @@
|
|||
#include <sstream>
|
||||
|
||||
#include "xcdat/DacBc.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Deserializes a DacBc from `is`: sizeof(id_type) byte levels, one fewer
// flag vectors, then the leaf flags, the links and the two scalar members.
DacBc::DacBc(std::istream& is) {
  constexpr size_t kNumLevels = sizeof(id_type);

  for (size_t level = 0; level < kNumLevels; ++level) {
    values_[level] = Vector<uint8_t>(is);
  }
  // The top level has no continuation flags, hence one vector fewer.
  for (size_t level = 0; level + 1 < kNumLevels; ++level) {
    flags_[level] = BitVector(is);
  }

  leaf_flags_ = BitVector(is);
  links_ = FitVector(is);
  max_level_ = read_value<uint8_t>(is);
  num_free_nodes_ = read_value<size_t>(is);
}
|
||||
|
||||
// Builds the byte-wise, multi-level encoding of the BASE/CHECK array `bc`.
// Each stored value is split into bytes; byte k goes to level k and a flag
// bit on every level but the last one used tells whether a higher byte
// follows. For nodes marked in `leaf_flags`, only the low byte of BASE is
// stored on level 0 and the remaining bits go to links_.
// An empty `bc` leaves the object untouched.
DacBc::DacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags) {
  if (bc.empty()) {
    return;
  }

  std::vector<uint8_t> level_bytes[sizeof(id_type)];
  BitVectorBuilder level_flags[sizeof(id_type)];
  std::vector<id_type> leaf_links;

  leaf_flags_ = BitVector(leaf_flags, true, false);

  // Every node contributes two entries (BASE and CHECK) to level 0.
  level_bytes[0].reserve(bc.size() * 2);
  level_flags[0].reserve(bc.size() * 2);
  leaf_links.reserve(bc.size());

  max_level_ = 0;

  // Appends `value` byte by byte, one level per byte, and clears the
  // continuation flag on the last level it occupies.
  const auto append = [&](id_type value) {
    uint8_t level = 0;
    for (;;) {
      level_bytes[level].push_back(static_cast<uint8_t>(value & 0xFF));
      level_flags[level].push_back(true);
      value >>= 8;
      if (!value) {
        break;
      }
      ++level;
    }
    level_flags[level].set_bit(level_flags[level].size() - 1, false);
    max_level_ = std::max(max_level_, level);
  };

  // Leaf BASE values: low byte on level 0, the rest in the link array.
  const auto append_leaf = [&](id_type value) {
    leaf_links.push_back(value >> 8);
    level_bytes[0].push_back(static_cast<uint8_t>(value & 0xFF));
    level_flags[0].push_back(false);
  };

  for (id_type node = 0; node < bc.size(); ++node) {
    const auto& pair = bc[node];

    if (leaf_flags_[node]) {
      append_leaf(pair.base);
    } else {
      append(pair.base ^ node);  // stored XOR-ed with the node id
    }
    append(pair.check ^ node);

    // A node whose CHECK equals its own id is counted as free.
    if (pair.check == node) {
      ++num_free_nodes_;
    }
  }

  // Freeze the per-level builders; the top level carries no flags.
  for (uint8_t level = 0; level < max_level_; ++level) {
    values_[level] = Vector<uint8_t>(level_bytes[level]);
    flags_[level] = BitVector(level_flags[level], true, false);
  }
  values_[max_level_] = Vector<uint8_t>(level_bytes[max_level_]);
  links_ = FitVector(leaf_links);
}
|
||||
|
||||
size_t DacBc::size_in_bytes() const {
|
||||
size_t ret = 0;
|
||||
for (auto& values : values_) {
|
||||
ret += values.size_in_bytes();
|
||||
}
|
||||
for (auto& flags : flags_) {
|
||||
ret += flags.size_in_bytes();
|
||||
}
|
||||
ret += leaf_flags_.size_in_bytes();
|
||||
ret += links_.size_in_bytes();
|
||||
ret += sizeof(max_level_);
|
||||
ret += sizeof(num_free_nodes_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void DacBc::show_stat(std::ostream& os) const {
|
||||
const auto total_size = size_in_bytes();
|
||||
os << "basic statistics of xcdat::DacBc" << std::endl;
|
||||
show_size("\tnum links: ", links_.size(), os);
|
||||
show_size("\tbytes per node:", double(total_size) / num_nodes(), os);
|
||||
os << "member size statistics of xcdat::DacBc" << std::endl;
|
||||
for (int i = 0; i <= max_level_; ++i) {
|
||||
std::ostringstream oss;
|
||||
oss << "\tvalues_L" << i << ":";
|
||||
show_size_ratio(oss.str().c_str(), values_[i].size_in_bytes(), total_size, os);
|
||||
}
|
||||
for (int i = 0; i < max_level_; ++i) {
|
||||
std::ostringstream oss;
|
||||
oss << "\tflags_L" << i << ": ";
|
||||
show_size_ratio(oss.str().c_str(), flags_[i].size_in_bytes(), total_size, os);
|
||||
}
|
||||
show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os);
|
||||
}
|
||||
|
||||
void DacBc::write(std::ostream& os) const {
|
||||
for (auto& values : values_) {
|
||||
values.write(os);
|
||||
}
|
||||
for (auto& flags : flags_) {
|
||||
flags.write(os);
|
||||
}
|
||||
leaf_flags_.write(os);
|
||||
links_.write(os);
|
||||
write_value(max_level_, os);
|
||||
write_value(num_free_nodes_, os);
|
||||
}
|
||||
|
||||
// Reassembles the value stored at position `i` of level 0.
// While the continuation flag is set, the position on the next level is the
// rank of the flag, and that level's byte supplies the next 8 bits.
id_type DacBc::access_(id_type i) const {
  uint8_t level = 0;
  id_type value = values_[level][i];

  while (level < max_level_ && flags_[level][i]) {
    i = flags_[level].rank(i);
    ++level;
    value |= static_cast<id_type>(values_[level][i]) << (level * 8);
  }

  return value;
}
|
||||
|
||||
} //namespace - xcdat
|
|
@ -1,188 +0,0 @@
|
|||
#include "xcdat/FastDacBc.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Deserializes a FastDacBc from `is`, reading the members in the order in
// which FastDacBc::write() emits them.
FastDacBc::FastDacBc(std::istream& is) {
  // Value layers (the 64-bit layer exists only in XCDAT_X64 builds).
  values_L1_ = Vector<uint8_t>(is);
  values_L2_ = Vector<uint16_t>(is);
  values_L3_ = Vector<uint32_t>(is);
#ifdef XCDAT_X64
  values_L4_ = Vector<uint64_t>(is);
#endif

  // One block-offset table per layer transition.
  for (size_t layer = 0; layer + 1 < LAYERS; ++layer) {
    ranks_[layer] = Vector<id_type>(is);
  }

  leaf_flags_ = BitVector(is);
  links_ = FitVector(is);
  num_free_nodes_ = read_value<size_t>(is);
}
|
||||
|
||||
// Builds the layered encoding of the BASE/CHECK array `bc`.
//
// Every stored value occupies one slot in layer 1. If the value fits in the
// slot (value / BLOCK_SIZE_Lk == 0) it is stored as `value << 1` with the low
// bit clear; otherwise the slot holds, with the low bit set, the position of
// the value in the next layer relative to the start of the current block.
// ranks[k] records, at every BLOCK_SIZE_L(k+1) boundary of layer k+1, the
// current size of layer k+2, which is what access_() adds back.
//
// For nodes marked in `leaf_flags`, the low byte of BASE is stored raw in
// layer 1 and the remaining bits go to links_.
// An empty `bc` leaves the object untouched.
//
// Fix: in XCDAT_X64 builds the layer-3 -> layer-4 step used ranks[1] (the
// layer-2 -> layer-3 table) instead of ranks[2], which corrupted ranks[1] and
// left ranks[2] empty although access_() reads ranks_[2].
FastDacBc::FastDacBc(const std::vector<BcPair>& bc,
                     BitVectorBuilder& leaf_flags) {
  if (bc.empty()) {
    return;
  }

  std::vector<uint8_t> values_L1;
  std::vector<uint16_t> values_L2;
  std::vector<uint32_t> values_L3;
#ifdef XCDAT_X64
  std::vector<uint64_t> values_L4;
#endif
  std::vector<id_type> ranks[LAYERS - 1];
  std::vector<id_type> links;
  leaf_flags_ = BitVector(leaf_flags, true, false);

  ranks[0].reserve((bc.size() * 2) / 128);

  auto append = [&](id_type value) {
    // Layer 1.
    if ((values_L1.size() % BLOCK_SIZE_L1) == 0) {
      ranks[0].push_back(static_cast<id_type>(values_L2.size()));
    }
    if ((value / BLOCK_SIZE_L1) == 0) {
      values_L1.push_back(static_cast<uint8_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L2.size() - ranks[0].back();
      values_L1.push_back(static_cast<uint8_t>(1 | (pos << 1)));
    }

    // Layer 2.
    if ((values_L2.size() % BLOCK_SIZE_L2) == 0) {
      ranks[1].push_back(static_cast<id_type>(values_L3.size()));
    }
    if ((value / BLOCK_SIZE_L2) == 0) {
      values_L2.push_back(static_cast<uint16_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L3.size() - ranks[1].back();
      values_L2.push_back(static_cast<uint16_t>(1 | (pos << 1)));
    }

#ifdef XCDAT_X64
    // Layer 3; its block offsets belong to ranks[2].
    if ((values_L3.size() % BLOCK_SIZE_L3) == 0) {
      ranks[2].push_back(static_cast<id_type>(values_L4.size()));
    }
    if ((value / BLOCK_SIZE_L3) == 0) {
      values_L3.push_back(static_cast<uint32_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L4.size() - ranks[2].back();
      values_L3.push_back(static_cast<uint32_t>(1 | (pos << 1)));
    }
    // Layer 4 stores the value verbatim.
    values_L4.push_back(value);
#else
    // Without the 64-bit layer, layer 3 stores the value verbatim.
    values_L3.push_back(value);
#endif
  };

  // Leaf BASE values: low byte in layer 1, the rest in the link array.
  auto append_leaf = [&](id_type value) {
    if ((values_L1.size() % BLOCK_SIZE_L1) == 0) {
      ranks[0].push_back(static_cast<id_type>(values_L2.size()));
    }
    values_L1.push_back(static_cast<uint8_t>(value & 0xFF));
    links.push_back(value >> 8);
  };

  for (id_type i = 0; i < bc.size(); ++i) {
    if (leaf_flags_[i]) {
      append_leaf(bc[i].base);
    } else {
      append(bc[i].base ^ i);  // stored XOR-ed with the node id
    }
    append(bc[i].check ^ i);
    // A node whose CHECK equals its own id is counted as free.
    if (bc[i].check == i) {
      ++num_free_nodes_;
    }
  }

  // Freeze the temporary builders into the member containers.
  values_L1_ = Vector<uint8_t>(values_L1);
  values_L2_ = Vector<uint16_t>(values_L2);
  values_L3_ = Vector<uint32_t>(values_L3);
#ifdef XCDAT_X64
  values_L4_ = Vector<uint64_t>(values_L4);
#endif
  for (uint8_t j = 0; j < LAYERS - 1; ++j) {
    ranks_[j] = Vector<id_type>(ranks[j]);
  }
  links_ = FitVector(links);
}
|
||||
|
||||
size_t FastDacBc::size_in_bytes() const {
|
||||
size_t ret = 0;
|
||||
ret += values_L1_.size_in_bytes();
|
||||
ret += values_L2_.size_in_bytes();
|
||||
ret += values_L3_.size_in_bytes();
|
||||
#ifdef XCDAT_X64
|
||||
ret += values_L4_.size_in_bytes();
|
||||
#endif
|
||||
for (auto& ranks : ranks_) {
|
||||
ret += ranks.size_in_bytes();
|
||||
}
|
||||
ret += leaf_flags_.size_in_bytes();
|
||||
ret += links_.size_in_bytes();
|
||||
ret += sizeof(num_free_nodes_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void FastDacBc::show_stat(std::ostream& os) const {
|
||||
const auto total_size = size_in_bytes();
|
||||
os << "basic statistics of xcdat::FastDacBc" << std::endl;
|
||||
show_size("\tnum links: ", links_.size(), os);
|
||||
show_size("\tbytes per node:", double(total_size) / num_nodes(), os);
|
||||
os << "member size statistics of xcdat::FastDacBc" << std::endl;
|
||||
show_size_ratio("\tvalues_L1:", values_L1_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tvalues_L2:", values_L2_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tvalues_L3:", values_L3_.size_in_bytes(), total_size, os);
|
||||
#ifdef XCDAT_X64
|
||||
show_size_ratio("\tvalues_L4:", values_L4_.size_in_bytes(), total_size, os);
|
||||
#endif
|
||||
show_size_ratio("\tranks_L1: ", ranks_[0].size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tranks_L2: ", ranks_[1].size_in_bytes(), total_size, os);
|
||||
#ifdef XCDAT_X64
|
||||
show_size_ratio("\tranks_L3: ", ranks_[2].size_in_bytes(), total_size, os);
|
||||
#endif
|
||||
show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os);
|
||||
show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os);
|
||||
}
|
||||
|
||||
void FastDacBc::write(std::ostream& os) const {
|
||||
values_L1_.write(os);
|
||||
values_L2_.write(os);
|
||||
values_L3_.write(os);
|
||||
#ifdef XCDAT_X64
|
||||
values_L4_.write(os);
|
||||
#endif
|
||||
for (auto& ranks : ranks_) {
|
||||
ranks.write(os);
|
||||
}
|
||||
leaf_flags_.write(os);
|
||||
links_.write(os);
|
||||
write_value(num_free_nodes_, os);
|
||||
}
|
||||
|
||||
// Decodes the value stored at layer-1 position `i`.
// On each layer the low bit of the entry tells whether the upper bits are the
// value itself (bit clear) or a block-relative position in the next layer
// (bit set); the last layer stores the value verbatim.
id_type FastDacBc::access_(id_type i) const {
  const auto entry_L1 = values_L1_[i];
  uint32_t value = entry_L1 >> 1;
  if ((entry_L1 & 1U) == 0) {
    return value;
  }

  i = ranks_[0][i / BLOCK_SIZE_L1] + value;
  const auto entry_L2 = values_L2_[i];
  value = entry_L2 >> 1;
  if ((entry_L2 & 1U) == 0) {
    return value;
  }

  i = ranks_[1][i / BLOCK_SIZE_L2] + value;
#ifdef XCDAT_X64
  const auto entry_L3 = values_L3_[i];
  value = entry_L3 >> 1;
  if ((entry_L3 & 1U) == 0) {
    return value;
  }

  i = ranks_[2][i / BLOCK_SIZE_L3] + value;
  return values_L4_[i];
#else
  return values_L3_[i];
#endif
}
|
||||
|
||||
} //namespace - xcdat
|
|
@ -1,60 +0,0 @@
|
|||
#include "xcdat/FitVector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Deserializes a FitVector from `is`, reading the members in the order in
// which FitVector::write() emits them: chunks, size, width, mask.
FitVector::FitVector(std::istream& is) {
  chunks_ = Vector<id_type>(is);

  size_ = read_value<size_t>(is);
  width_ = read_value<id_type>(is);
  mask_ = read_value<id_type>(is);
}
|
||||
|
||||
// Packs `values` into fixed-width fields inside id_type chunks.
// width_ is the number of bits needed for the largest value (at least 1);
// element i occupies bits [i * width_, (i + 1) * width_) of the chunk stream
// and may straddle two neighbouring chunks.
// An empty `values` leaves the object untouched.
//
// Fixes:
//  - the mask was computed as (1U << width_) - 1, which is undefined when
//    width_ equals the bit width of unsigned int and wrong for wider id_type;
//  - the bit position i * width_ was computed in id_type and could wrap for
//    large inputs; it is now computed in size_t.
FitVector::FitVector(const std::vector<id_type>& values) {
  if (values.empty()) {
    return;
  }

  // Number of bits required by the largest element.
  width_ = 0;
  auto max_value = *std::max_element(std::begin(values), std::end(values));
  do {
    ++width_;
    max_value >>= 1;
  } while (max_value);

  size_ = values.size();

  // A shift by the full bit width is undefined, so handle it explicitly.
  constexpr id_type kIdBits = sizeof(id_type) * 8;
  mask_ = (width_ < kIdBits)
              ? static_cast<id_type>((static_cast<id_type>(1) << width_) - 1)
              : static_cast<id_type>(~static_cast<id_type>(0));

  // Chunks start zeroed, so fields only need to be OR-ed in.
  std::vector<id_type> chunks(size_ * width_ / CHUNK_WIDTH + 1, 0);

  for (size_t i = 0; i < size_; ++i) {
    const size_t bit_pos = i * width_;
    const auto chunk_pos = static_cast<size_t>(bit_pos / CHUNK_WIDTH);
    const auto offset = static_cast<id_type>(bit_pos % CHUNK_WIDTH);
    const id_type field = values[i] & mask_;

    chunks[chunk_pos] |= field << offset;

    // The field spills over into the next chunk.
    if (CHUNK_WIDTH < offset + width_) {
      chunks[chunk_pos + 1] |= field >> (CHUNK_WIDTH - offset);
    }
  }

  chunks_ = Vector<id_type>(chunks);
}
|
||||
|
||||
// Returns the size reported by the chunk container plus the sizes of the
// three scalar members.
size_t FitVector::size_in_bytes() const {
  const size_t scalars = sizeof(size_) + sizeof(width_) + sizeof(mask_);
  return scalars + chunks_.size_in_bytes();
}
|
||||
|
||||
// Serializes the vector to `os`: chunks first, then size, width and mask,
// which is the order the std::istream constructor reads back.
void FitVector::write(std::ostream& os) const {
  chunks_.write(os);

  write_value(size_, os);
  write_value(width_, os);
  write_value(mask_, os);
}
|
||||
|
||||
} //namespace - xcdat
|
|
@ -1,317 +0,0 @@
|
|||
#include <iostream>
|
||||
|
||||
#include "xcdat/TrieBuilder.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Builds the double-array trie for `keys`.
//   width_L1 - log2 of the block size used when searching for free slots.
//   bin_mode - request binary mode; build_table_() also switches it on when a
//              key contains '\0'.
// Throws TrieBuilder::Exception if `keys` is empty or holds more than ID_MAX
// entries; the build steps called here may throw as well.
TrieBuilder::TrieBuilder(const std::vector<std::string_view>& keys,
                         id_type width_L1, bool bin_mode)
    : keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
      bin_mode_(bin_mode) {
  if (keys_.empty()) {
    throw TrieBuilder::Exception("The input data is empty.");
  }
  if (ID_MAX < keys_.size()) {
    throw TrieBuilder::Exception("Key ID range error.");
  }

  // Reserve the smallest power of two that is >= the number of keys.
  {
    size_t capacity = 1;
    for (; capacity < keys_.size(); capacity <<= 1) {
    }

    bc_.reserve(capacity);
    leaf_flags_.reserve(capacity);
    term_flags_.reserve(capacity);
    used_flags_.reserve(capacity);
    heads_.reserve(capacity >> width_L1_);
  }

  alphabet_.reserve(256);
  edges_.reserve(256);
  suffixes_.reserve(keys_.size());

  // Start with 256 unused nodes chained into a circular doubly linked list:
  // BASE holds the next free node, CHECK the previous one.
  for (id_type node = 0; node < 256; ++node) {
    bc_.push_back({node + 1, node - 1});
    leaf_flags_.push_back(false);
    term_flags_.push_back(false);
    used_flags_.push_back(false);
  }
  bc_[255].base = 0;
  bc_[0].check = 255;

  // Initially every block's head is its first node.
  for (id_type node = 0; node < 256; node += block_size_) {
    heads_.push_back(node);
  }

  // Take node 0 for the root and TABOO_ID out of circulation.
  use_(0);
  bc_[0].check = TABOO_ID;
  used_flags_[TABOO_ID] = true;
  heads_[TABOO_ID >> width_L1_] = bc_[TABOO_ID].base;

  build_table_();
  build_bc_(0, keys_.size(), 0, 0);
  build_tail_();
}
|
||||
|
||||
void TrieBuilder::build_table_() {
|
||||
using tb_type = std::pair<uint8_t, size_t>;
|
||||
tb_type table_builder[256];
|
||||
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
table_builder[i] = {static_cast<uint8_t>(i), 0};
|
||||
}
|
||||
|
||||
max_length_ = 0;
|
||||
for (auto& key : keys_) {
|
||||
for (char c : key) {
|
||||
++table_builder[static_cast<uint8_t>(c)].second;
|
||||
}
|
||||
max_length_ = std::max(max_length_, key.length());
|
||||
}
|
||||
|
||||
if (table_builder[0].second != 0) { // including '\0'
|
||||
bin_mode_ = true;
|
||||
}
|
||||
|
||||
for (const auto& item : table_builder) {
|
||||
if (item.second != 0) {
|
||||
alphabet_.push_back(item.first);
|
||||
}
|
||||
}
|
||||
alphabet_.shrink_to_fit();
|
||||
|
||||
std::sort(std::begin(table_builder), std::end(table_builder),
|
||||
[](const tb_type& lhs, const tb_type& rhs) {
|
||||
return lhs.second > rhs.second;
|
||||
});
|
||||
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
table_[table_builder[i].first] = static_cast<uint8_t>(i);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
table_[table_[i] + 256] = static_cast<uint8_t>(i);
|
||||
}
|
||||
}
|
||||
|
||||
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
|
||||
id_type node_id) {
|
||||
if (keys_[begin].length() == depth) {
|
||||
term_flags_.set_bit(node_id, true);
|
||||
if (++begin == end) { // without link?
|
||||
bc_[node_id].base = 0; // with an empty suffix
|
||||
leaf_flags_.set_bit(node_id, true);
|
||||
return;
|
||||
}
|
||||
} else if (begin + 1 == end) { // leaf?
|
||||
term_flags_.set_bit(node_id, true);
|
||||
leaf_flags_.set_bit(node_id, true);
|
||||
auto& key = keys_[begin];
|
||||
suffixes_.push_back(
|
||||
{{key.data() + depth, key.length() - depth}, node_id}
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
{ // fetching edges
|
||||
edges_.clear();
|
||||
auto label = static_cast<uint8_t>(keys_[begin][depth]);
|
||||
for (auto str_id = begin + 1; str_id < end; ++str_id) {
|
||||
const auto _label = static_cast<uint8_t>(keys_[str_id][depth]);
|
||||
if (label != _label) {
|
||||
if (_label < label) {
|
||||
throw TrieBuilder::Exception(
|
||||
"The input data is not in lexicographical order."
|
||||
);
|
||||
}
|
||||
edges_.push_back(label);
|
||||
label = _label;
|
||||
}
|
||||
}
|
||||
edges_.push_back(label);
|
||||
}
|
||||
|
||||
const auto base = find_base_(node_id >> width_L1_);
|
||||
if (bc_.size() <= base) {
|
||||
expand_();
|
||||
}
|
||||
|
||||
// defining new edges
|
||||
bc_[node_id].base = base;
|
||||
for (const auto label : edges_) {
|
||||
const auto child_id = base ^ table_[label];
|
||||
use_(child_id);
|
||||
bc_[child_id].check = node_id;
|
||||
}
|
||||
|
||||
// following the children
|
||||
auto _begin = begin;
|
||||
auto label = static_cast<uint8_t>(keys_[begin][depth]);
|
||||
for (auto _end = begin + 1; _end < end; ++_end) {
|
||||
const auto _label = static_cast<uint8_t>(keys_[_end][depth]);
|
||||
if (label != _label) {
|
||||
build_bc_(_begin, _end, depth + 1, base ^ table_[label]);
|
||||
label = _label;
|
||||
_begin = _end;
|
||||
}
|
||||
}
|
||||
build_bc_(_begin, end, depth + 1, base ^ table_[label]);
|
||||
}
|
||||
|
||||
// The algorithm is inspired by marisa-trie
|
||||
void TrieBuilder::build_tail_() {
|
||||
std::sort(std::begin(suffixes_), std::end(suffixes_),
|
||||
[](const Suffix& lhs, const Suffix& rhs) {
|
||||
return std::lexicographical_compare(
|
||||
std::rbegin(lhs), std::rend(lhs),
|
||||
std::rbegin(rhs), std::rend(rhs));
|
||||
});
|
||||
|
||||
// For empty suffixes
|
||||
tail_.emplace_back('\0');
|
||||
if (bin_mode_) {
|
||||
boundary_flags_.push_back(false);
|
||||
}
|
||||
|
||||
const Suffix dummy = {{nullptr, 0}, 0};
|
||||
const Suffix* prev_suf = &dummy;
|
||||
|
||||
for (size_t i = suffixes_.size(); i > 0; --i) {
|
||||
const auto& cur_suf = suffixes_[i - 1];
|
||||
if (cur_suf.length() == 0) {
|
||||
throw TrieBuilder::Exception("A suffix is empty.");
|
||||
}
|
||||
|
||||
size_t match = 0;
|
||||
while ((match < cur_suf.length()) && (match < prev_suf->length())
|
||||
&& ((*prev_suf)[match] == cur_suf[match])) {
|
||||
++match;
|
||||
}
|
||||
|
||||
if ((match == cur_suf.length()) && (prev_suf->length() != 0)) { // sharing
|
||||
bc_[cur_suf.node_id].base = static_cast<id_type>(
|
||||
bc_[prev_suf->node_id].base + (prev_suf->length() - match)
|
||||
);
|
||||
} else { // append
|
||||
bc_[cur_suf.node_id].base = static_cast<id_type>(tail_.size());
|
||||
std::copy(std::begin(cur_suf.str), std::end(cur_suf.str),
|
||||
std::back_inserter(tail_));
|
||||
if (bin_mode_) {
|
||||
for (size_t j = 1; j < cur_suf.length(); ++j) {
|
||||
boundary_flags_.push_back(false);
|
||||
}
|
||||
boundary_flags_.push_back(true);
|
||||
} else {
|
||||
tail_.emplace_back('\0');
|
||||
}
|
||||
if (ID_MAX < tail_.size()) {
|
||||
throw TrieBuilder::Exception("TAIL address range error.");
|
||||
}
|
||||
}
|
||||
prev_suf = &cur_suf;
|
||||
}
|
||||
}
|
||||
|
||||
void TrieBuilder::expand_() {
|
||||
if (ID_MAX < bc_.size() + 256) {
|
||||
throw TrieBuilder::Exception("Node ID range error.");
|
||||
}
|
||||
|
||||
const auto old_size = static_cast<id_type>(bc_.size());
|
||||
const auto new_size = old_size + 256;
|
||||
|
||||
for (auto i = old_size; i < new_size; ++i) {
|
||||
bc_.push_back({i + 1, i - 1});
|
||||
leaf_flags_.push_back(false);
|
||||
term_flags_.push_back(false);
|
||||
used_flags_.push_back(false);
|
||||
}
|
||||
|
||||
{
|
||||
const auto last = bc_[TABOO_ID].check;
|
||||
bc_[old_size].check = last;
|
||||
bc_[last].base = old_size;
|
||||
bc_[new_size - 1].base = TABOO_ID;
|
||||
bc_[TABOO_ID].check = new_size - 1;
|
||||
}
|
||||
|
||||
for (auto i = old_size; i < new_size; i += block_size_) {
|
||||
heads_.push_back(i);
|
||||
}
|
||||
|
||||
const auto block_id = old_size / 256;
|
||||
if (FREE_BLOCKS <= block_id) {
|
||||
close_block_(block_id - FREE_BLOCKS);
|
||||
}
|
||||
}
|
||||
|
||||
// Marks `node_id` as used, unlinks it from the free list and, if it was the
// head of its block, moves the head to the next free node in that block (or
// to TABOO_ID when the next free node lies in another block).
void TrieBuilder::use_(id_type node_id) {
  used_flags_[node_id] = true;

  // Unlink: BASE is the next free node, CHECK the previous one.
  const auto next = bc_[node_id].base;
  const auto prev = bc_[node_id].check;
  bc_[prev].base = next;
  bc_[next].check = prev;

  const auto block_id = node_id >> width_L1_;
  if (heads_[block_id] != node_id) {
    return;
  }

  if ((next >> width_L1_) == block_id) {
    heads_[block_id] = next;
  } else {
    heads_[block_id] = TABOO_ID;
  }
}
|
||||
|
||||
// Retires the 256-node block `block_id`: every still-unused node is removed
// from the free list and made self-referential (BASE == CHECK == its id)
// while remaining flagged as unused, and all block heads inside the range
// are set to TABOO_ID.
void TrieBuilder::close_block_(id_type block_id) {
  const auto first = block_id * 256;
  const auto past = first + 256;

  for (auto node = first; node < past; ++node) {
    if (used_flags_[node]) {
      continue;
    }
    use_(node);  // unlinks the node and updates the block head
    bc_[node].base = node;
    bc_[node].check = node;
    used_flags_[node] = false;  // use_() set the flag; undo that
  }

  for (auto node = first; node < past; node += block_size_) {
    heads_[node >> width_L1_] = TABOO_ID;
  }
}
|
||||
|
||||
// Finds a BASE value under which every label in edges_ maps to an unused
// node. Free nodes of block `block_id` are tried first, then the whole free
// list; if nothing fits (or the free list is empty) a BASE just past the end
// of the current array is returned, which makes the caller expand it.
id_type TrieBuilder::find_base_(id_type block_id) const {
  const auto first_code = table_[edges_[0]];

  // BASE that places the first edge on the first node beyond the array.
  const auto base_past_end = [&] {
    return static_cast<id_type>(bc_.size()) ^ first_code;
  };

  if (bc_[TABOO_ID].base == TABOO_ID) {  // free list is empty
    return base_past_end();
  }

  // Candidates inside the requested block.
  for (auto node = heads_[block_id];
       node != TABOO_ID && (node >> width_L1_) == block_id;
       node = bc_[node].base) {
    const auto candidate = node ^ first_code;
    if (is_target_(candidate)) {
      return candidate;
    }
  }

  // Candidates anywhere on the free list.
  for (auto node = bc_[TABOO_ID].base; node != TABOO_ID; node = bc_[node].base) {
    const auto candidate = node ^ first_code;
    if (is_target_(candidate)) {
      return candidate;
    }
  }

  return base_past_end();
}
|
||||
|
||||
// Returns true when, for every label in edges_, the child slot
// `base ^ table_[label]` is not marked as used.
bool TrieBuilder::is_target_(id_type base) const {
  return std::none_of(std::begin(edges_), std::end(edges_),
                      [this, base](const auto label) {
                        return static_cast<bool>(used_flags_[base ^ table_[label]]);
                      });
}
|
||||
|
||||
} //namespace - xcdat
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
file(GLOB TEST_SOURCES *_test.cpp)
|
||||
file(GLOB TEST_SOURCES test_*.cpp)
|
||||
foreach(TEST_SOURCE ${TEST_SOURCES})
|
||||
get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE)
|
||||
add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE})
|
||||
target_link_libraries(${TEST_SOURCE_NAME} xcdat)
|
||||
add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME})
|
||||
endforeach()
|
||||
|
|
6260
test/doctest/doctest.h
Normal file
6260
test/doctest/doctest.h
Normal file
File diff suppressed because it is too large
Load diff
110
test/test_bit_vector.cpp
Normal file
110
test/test_bit_vector.cpp
Normal file
|
@ -0,0 +1,110 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include <xcdat/bit_vector.hpp>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
|
||||
// Returns `n` pseudo-random bits. The generator is seeded with a fixed
// constant, so the sequence is identical on every call.
std::vector<bool> generate_random_bits(std::uint64_t n) {
    static constexpr std::uint64_t seed = 13;
    std::mt19937 engine(seed);

    std::vector<bool> bits;
    for (std::uint64_t produced = 0; produced != n; ++produced) {
        // Keep only the lowest bit of each draw.
        const bool bit = (engine() & 1) != 0;
        bits.push_back(bit);
    }
    return bits;
}
|
||||
|
||||
// Returns the number of set bits in `bits`.
// Uses std::count from <algorithm>, which this file includes; the previous
// std::accumulate call relied on <numeric> being pulled in transitively.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
|
||||
|
||||
// Reference implementation of rank: the number of set bits in positions
// [0, i). The caller must ensure i <= bits.size().
// Uses std::count from <algorithm>, which this file includes; the previous
// std::accumulate call relied on <numeric> being pulled in transitively.
std::uint64_t rank_naive(const std::vector<bool>& bits, std::uint64_t i) {
    const auto last = bits.begin() + static_cast<std::vector<bool>::difference_type>(i);
    return static_cast<std::uint64_t>(std::count(bits.begin(), last, true));
}
|
||||
|
||||
// Reference implementation of select: the position of the (n+1)-th set bit
// (n is zero-based), or bits.size() if there are not that many set bits.
std::uint64_t select_naive(const std::vector<bool>& bits, std::uint64_t n) {
    std::uint64_t pos = 0;
    while (pos < bits.size()) {
        if (bits[pos]) {
            if (n == 0) {
                return pos;
            }
            --n;  // one fewer set bit left to skip
        }
        ++pos;
    }
    return pos;
}
|
||||
|
||||
// Sizes the builder with resize(), writes every bit with set_bit() and
// checks that each one reads back unchanged.
TEST_CASE("Test bit_vector::builder with resize") {
    const auto bits = generate_random_bits(10000);

    xcdat::bit_vector::builder b;
    b.resize(bits.size());

    REQUIRE_EQ(b.size(), bits.size());

    // Write pass.
    for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
        b.set_bit(pos, bits[pos]);
    }
    // Read-back pass.
    for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
        REQUIRE_EQ(b[pos], bits[pos]);
    }
}
|
||||
|
||||
// Fills the builder with push_back() after reserve() and checks both the
// resulting size and every stored bit.
TEST_CASE("Test bit_vector::builder with push_back") {
    const auto bits = generate_random_bits(10000);

    xcdat::bit_vector::builder b;
    b.reserve(bits.size());

    // Append pass.
    for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
        b.push_back(bits[pos]);
    }

    REQUIRE_EQ(b.size(), bits.size());

    // Read-back pass.
    for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
        REQUIRE_EQ(b[pos], bits[pos]);
    }
}
|
||||
|
||||
// Builds a bit_vector with rank and select support from random bits and
// compares size, number of ones, bit access, rank and select against the
// naive reference helpers.
TEST_CASE("Test bit_vector") {
    const auto bits = generate_random_bits(10000);

    xcdat::bit_vector bv;
    {
        // The builder is only needed until build() has run.
        xcdat::bit_vector::builder b(bits.size());
        for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
            b.set_bit(pos, bits[pos]);
        }
        bv.build(b, true, true);
    }

    REQUIRE_EQ(bv.size(), bits.size());
    REQUIRE_EQ(bv.num_ones(), get_num_ones(bits));

    for (std::uint64_t pos = 0; pos < bits.size(); pos++) {
        REQUIRE_EQ(bv[pos], bits[pos]);
    }

    static constexpr std::uint64_t seed = 17;
    std::mt19937_64 engine(seed);

    // rank: 100 random positions drawn from [0, size].
    {
        std::uniform_int_distribution<std::uint64_t> dist(0, bv.size());
        for (std::uint64_t trial = 0; trial < 100; trial++) {
            const std::uint64_t pos = dist(engine);
            REQUIRE_EQ(bv.rank(pos), rank_naive(bits, pos));
        }
    }
    // select: 100 random ordinals drawn from [0, num_ones - 1].
    {
        std::uniform_int_distribution<std::uint64_t> dist(0, bv.num_ones() - 1);
        for (std::uint64_t trial = 0; trial < 100; trial++) {
            const std::uint64_t ordinal = dist(engine);
            REQUIRE_EQ(bv.select(ordinal), select_naive(bits, ordinal));
        }
    }
}
|
|
@ -1,265 +0,0 @@
|
|||
#undef NDEBUG
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <cstring>
|
||||
|
||||
#include "xcdat.hpp"
|
||||
|
||||
using namespace xcdat;
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr size_t NUM_KEYS = 1U << 10;
|
||||
constexpr size_t MAX_LENGTH = 20;
|
||||
|
||||
// Sorts `keys` in place and removes duplicates.
void to_set(std::vector<std::string>& keys) {
  std::sort(keys.begin(), keys.end());
  const auto unique_end = std::unique(keys.begin(), keys.end());
  keys.erase(unique_end, keys.end());
}
|
||||
|
||||
std::string make_key() {
|
||||
std::random_device rnd;
|
||||
|
||||
std::string key;
|
||||
size_t length = (rnd() % MAX_LENGTH) + 1;
|
||||
for (size_t j = 0; j < length; ++j) {
|
||||
key += 'A' + (rnd() % 26);
|
||||
}
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
std::vector<std::string> make_keys() {
|
||||
std::vector<std::string> keys;
|
||||
keys.reserve(NUM_KEYS);
|
||||
|
||||
for (size_t i = 0; i < NUM_KEYS; ++i) {
|
||||
keys.push_back(make_key());
|
||||
}
|
||||
|
||||
to_set(keys);
|
||||
return keys;
|
||||
}
|
||||
|
||||
std::vector<std::string> make_other_keys(const std::vector<std::string>& keys) {
|
||||
std::vector<std::string> others;
|
||||
|
||||
for (size_t i = 0; i < NUM_KEYS; ++i) {
|
||||
auto string = make_key();
|
||||
if (std::find(std::begin(keys), std::end(keys), string) == std::end(keys)) {
|
||||
others.push_back(string);
|
||||
}
|
||||
}
|
||||
|
||||
to_set(others);
|
||||
return others;
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
Trie<Fast> test_build(const std::vector<std::string_view>& keys,
|
||||
bool bin_mode) {
|
||||
std::cerr << "Construction -> build()\n";
|
||||
|
||||
auto trie = TrieBuilder::build<Fast>(keys, bin_mode);
|
||||
assert(trie.num_keys() == keys.size());
|
||||
|
||||
return trie;
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
void test_basic_operations(const Trie<Fast>& trie,
|
||||
const std::vector<std::string_view>& keys,
|
||||
const std::vector<std::string_view>& others) {
|
||||
std::cerr << "Basic operations -> lookup() and access()\n";
|
||||
|
||||
for (auto& key : keys) {
|
||||
auto id = trie.lookup(key);
|
||||
assert(id != Trie<Fast>::NOT_FOUND);
|
||||
|
||||
auto dec = trie.access(id);
|
||||
assert(dec == key);
|
||||
}
|
||||
|
||||
for (auto& other : others) {
|
||||
const auto id = trie.lookup(other);
|
||||
assert(id == Trie<Fast>::NOT_FOUND);
|
||||
}
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
void test_prefix_operations(const Trie<Fast>& trie,
|
||||
const std::vector<std::string_view>& keys,
|
||||
const std::vector<std::string_view>& others) {
|
||||
std::cerr << "Prefix operations -> PrefixIterator\n";
|
||||
|
||||
for (auto& key : keys) {
|
||||
size_t num_results = 0;
|
||||
|
||||
auto it = trie.make_prefix_iterator(key);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(dec.length() <= key.length());
|
||||
|
||||
auto dec2 = trie.access(id);
|
||||
assert(dec == dec2);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
assert(1 <= num_results);
|
||||
assert(num_results <= key.length());
|
||||
}
|
||||
|
||||
for (auto& other : others) {
|
||||
size_t num_results = 0;
|
||||
|
||||
auto it = trie.make_prefix_iterator(other);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(dec.length() < other.length());
|
||||
|
||||
auto dec2 = trie.access(id);
|
||||
assert(dec == dec2);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
assert(num_results < other.length());
|
||||
}
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
void test_predictive_operations(const Trie<Fast>& trie,
|
||||
const std::vector<std::string_view>& keys,
|
||||
const std::vector<std::string_view>& others) {
|
||||
std::cerr << "Predictive operations -> PredictiveIterator\n";
|
||||
|
||||
for (auto& key : keys) {
|
||||
size_t num_results = 0;
|
||||
|
||||
auto it = trie.make_predictive_iterator(key);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(key.length() <= dec.length());
|
||||
|
||||
auto dec2 = trie.access(id);
|
||||
assert(dec == dec2);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
assert(1 <= num_results);
|
||||
}
|
||||
|
||||
for (auto& other : others) {
|
||||
auto it = trie.make_predictive_iterator(other);
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(other.length() < dec.length());
|
||||
|
||||
auto dec2 = trie.access(id);
|
||||
|
||||
assert(dec == dec2);
|
||||
}
|
||||
}
|
||||
|
||||
{ // all enumeration
|
||||
size_t num_results = 0;
|
||||
|
||||
auto it = trie.make_predictive_iterator(std::string_view{});
|
||||
while (it.next()) {
|
||||
auto id = it.id();
|
||||
auto dec = it.key();
|
||||
|
||||
assert(0 <= dec.length());
|
||||
|
||||
auto dec2 = trie.access(id);
|
||||
assert(dec == dec2);
|
||||
|
||||
++num_results;
|
||||
}
|
||||
|
||||
assert(num_results == trie.num_keys());
|
||||
}
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
void test_io(const Trie<Fast>& trie) {
|
||||
std::cerr << "File I/O -> write() and read()\n";
|
||||
|
||||
const char* file_name = "index";
|
||||
{
|
||||
std::ofstream ofs{file_name};
|
||||
trie.write(ofs);
|
||||
}
|
||||
{
|
||||
std::ifstream ifs{file_name};
|
||||
auto size = static_cast<size_t>(ifs.seekg(0, std::ios::end).tellg());
|
||||
assert(size == trie.size_in_bytes());
|
||||
}
|
||||
|
||||
Trie<Fast> _trie;
|
||||
{
|
||||
std::ifstream ifs{file_name};
|
||||
_trie = Trie<Fast>(ifs);
|
||||
}
|
||||
|
||||
assert(trie.num_keys() == _trie.num_keys());
|
||||
assert(trie.bin_mode() == _trie.bin_mode());
|
||||
assert(trie.alphabet_size() == _trie.alphabet_size());
|
||||
assert(trie.num_nodes() == _trie.num_nodes());
|
||||
assert(trie.num_used_nodes() == _trie.num_used_nodes());
|
||||
assert(trie.num_free_nodes() == _trie.num_free_nodes());
|
||||
assert(trie.size_in_bytes() == _trie.size_in_bytes());
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
void test_trie(const std::vector<std::string_view>& strings,
|
||||
const std::vector<std::string_view>& others) {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
std::cerr << "** " << (i % 2 ? "Binary" : "Text") << " Mode **\n";
|
||||
std::cerr << "Testing xcdat::Trie<" << (Fast ? "true" : "false") << ">\n";
|
||||
|
||||
auto trie = test_build<Fast>(strings, i % 2 != 0);
|
||||
|
||||
test_basic_operations(trie, strings, others);
|
||||
test_prefix_operations(trie, strings, others);
|
||||
test_predictive_operations(trie, strings, others);
|
||||
test_io(trie);
|
||||
|
||||
std::cerr << "--> No problem (☝ ՞ਊ ՞)☝" << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int main() {
|
||||
auto keys_buffer = make_keys();
|
||||
auto others_buffer = make_other_keys(keys_buffer);
|
||||
|
||||
std::vector<std::string_view> keys(keys_buffer.size());
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
keys[i] = std::string_view{keys_buffer[i]};
|
||||
}
|
||||
|
||||
std::vector<std::string_view> others(others_buffer.size());
|
||||
for (size_t i = 0; i < others.size(); ++i) {
|
||||
others[i] = std::string_view{others_buffer[i]};
|
||||
}
|
||||
|
||||
test_trie<false>(keys, others);
|
||||
test_trie<true>(keys, others);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,102 +0,0 @@
|
|||
#undef NDEBUG
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
#include "xcdat/BitVector.hpp"
|
||||
#include "xcdat/FitVector.hpp"
|
||||
|
||||
using namespace xcdat;
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr size_t SIZE = 1U << 10;
|
||||
|
||||
void test_vector() {
|
||||
std::vector<int> orig_vec(SIZE);
|
||||
{
|
||||
std::random_device rnd;
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
orig_vec[i] = rnd();
|
||||
}
|
||||
}
|
||||
|
||||
auto copied_vec = orig_vec; // copy
|
||||
Vector<int> vec(copied_vec);
|
||||
|
||||
assert(copied_vec.empty());
|
||||
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
assert(orig_vec[i] == vec[i]);
|
||||
}
|
||||
|
||||
Vector<int> swapped_vec;
|
||||
vec.swap(swapped_vec);
|
||||
|
||||
assert(vec.is_empty());
|
||||
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
assert(orig_vec[i] == swapped_vec[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void test_bit_vector() {
|
||||
std::vector<bool> orig_bit_vector;
|
||||
{
|
||||
std::random_device rnd;
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
orig_bit_vector.push_back(rnd() % 2 == 0);
|
||||
}
|
||||
}
|
||||
|
||||
BitVector bit_vector;
|
||||
{
|
||||
BitVectorBuilder builder;
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
builder.push_back(orig_bit_vector[i]);
|
||||
}
|
||||
bit_vector = BitVector(builder, true, true);
|
||||
}
|
||||
|
||||
assert(bit_vector.size() == SIZE);
|
||||
|
||||
id_type sum = 0;
|
||||
for (id_type i = 0; i < SIZE; ++i) {
|
||||
assert(bit_vector[i] == orig_bit_vector[i]);
|
||||
if (bit_vector[i]) {
|
||||
assert(sum == bit_vector.rank(i));
|
||||
assert(i == bit_vector.select(sum));
|
||||
++sum;
|
||||
}
|
||||
}
|
||||
|
||||
assert(bit_vector.num_1s() == sum);
|
||||
assert(bit_vector.num_0s() == SIZE - sum);
|
||||
}
|
||||
|
||||
void test_small_vector() {
|
||||
std::vector<id_type> orig_vector;
|
||||
{
|
||||
std::random_device rnd;
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
orig_vector.push_back(rnd() & UINT16_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
FitVector small_vector(orig_vector);
|
||||
assert(orig_vector.size() == small_vector.size());
|
||||
|
||||
for (size_t i = 0; i < SIZE; ++i) {
|
||||
assert(orig_vector[i] == small_vector[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Test driver: runs the container tests one after another. A failing check
// aborts through assert(), so reaching the end means every test passed.
int main() {
  test_vector();
  test_bit_vector();
  test_small_vector();

  return 0;
}
|
|
@ -1,6 +0,0 @@
|
|||
|
||||
add_executable(xcdat-exe xcdat.cpp)
|
||||
set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat)
|
||||
target_link_libraries(xcdat-exe xcdat)
|
||||
|
||||
install(TARGETS xcdat-exe RUNTIME DESTINATION bin)
|
322
tool/xcdat.cpp
322
tool/xcdat.cpp
|
@ -1,322 +0,0 @@
|
|||
#include <chrono>
#include <exception>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <string_view>
#include <vector>

#include "xcdat.hpp"
|
||||
|
||||
using namespace xcdat;
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int RUNS = 10;
|
||||
|
||||
// Simple stopwatch: records the time of construction and reports how much time
// has passed since then, as a floating-point count in the requested unit.
class StopWatch {
 public:
  using hrc = std::chrono::high_resolution_clock;

  // Starts measuring immediately.
  StopWatch() : tp_{hrc::now()} {}

  // Elapsed time in seconds.
  double sec() const {
    return elapsed_as<std::chrono::duration<double>>();
  }
  // Elapsed time in milliseconds.
  double milli_sec() const {
    return elapsed_as<std::chrono::duration<double, std::milli>>();
  }
  // Elapsed time in microseconds.
  double micro_sec() const {
    return elapsed_as<std::chrono::duration<double, std::micro>>();
  }

 private:
  // Converts the time elapsed since construction into `Duration` and returns
  // its tick count.
  template <class Duration>
  double elapsed_as() const {
    return Duration(hrc::now() - tp_).count();
  }

  hrc::time_point tp_;  // moment the stopwatch was created
};
|
||||
|
||||
// Reads `file_name` line by line and appends every line to `keys`.
//
// Returns the total size of the lines read, counting one extra byte per line
// for its terminator. Returns 0 when the file cannot be opened -- and also
// when it contains no lines, so callers cannot tell the two cases apart.
size_t read_keys(const char* file_name, std::vector<std::string>& keys) {
  std::ifstream input{file_name};

  size_t total_bytes = 0;
  if (input) {
    std::string line;
    while (std::getline(input, line)) {
      total_bytes += line.length() + 1;  // with terminator
      keys.push_back(line);
    }
  }
  return total_bytes;
}
|
||||
|
||||
// Returns one std::string_view per element of `keys`, in the same order.
//
// The views only borrow the characters: `keys` must outlive the returned
// vector and must not be modified while the views are in use.
std::vector<std::string_view> extract_views(const std::vector<std::string>& keys) {
  // The range constructor converts each std::string to a view in one pass.
  return std::vector<std::string_view>(keys.begin(), keys.end());
}
|
||||
|
||||
// Writes the command-line help for the build / query / bench sub-commands to
// `os` and flushes the stream.
//
// The default output suffixes named here (.dac / .fdac) are the ones the build
// command actually appends when <dict> is omitted.
void show_usage(std::ostream& os) {
  os << "xcdat build <type> <key> <dict>\n"
        "\t<type>\t1: DACs, 2: FDACs\n"
        "\t<key> \tInput file name of a set of keys (must be sorted)\n"
        "\t<dict>\tOutput file name of the dictionary (optional)\n"
        "\t \tIf omitted, <key>.dac or <key>.fdac is output\n";

  os << "xcdat query <type> <dict> <limit>\n"
        "\t<type> \t1: DACs, 2: FDACs\n"
        "\t<dict> \tInput file name of the dictionary\n"
        "\t<limit>\tLimit of #results (optional, default=10)\n";

  os << "xcdat bench <type> <dict> <key>\n"
        "\t<type>\t1: DACs, 2: FDACs\n"
        "\t<dict>\tInput file name of the dictionary\n"
        "\t<key> \tInput file name of keys for benchmark\n";

  os.flush();
}
|
||||
|
||||
template<bool Fast>
|
||||
int build(std::vector<std::string>& args) {
|
||||
if (args.size() != 3 && args.size() != 4) {
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<std::string> keys_buffer;
|
||||
auto raw_size = read_keys(args[2].c_str(), keys_buffer);
|
||||
|
||||
if (raw_size == 0) {
|
||||
std::cerr << "open error : " << args[2] << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto keys = extract_views(keys_buffer);
|
||||
|
||||
Trie<Fast> trie;
|
||||
try {
|
||||
StopWatch sw;
|
||||
trie = TrieBuilder::build<Fast>(keys);
|
||||
std::cout << "constr. time:\t" << sw.sec() << " sec" << std::endl;
|
||||
} catch (const xcdat::TrieBuilder::Exception& ex) {
|
||||
std::cerr << ex.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "cmpr. ratio:\t"
|
||||
<< static_cast<double>(trie.size_in_bytes()) / raw_size
|
||||
<< " over the raw size" << std::endl;
|
||||
|
||||
std::cout << std::endl;
|
||||
trie.show_stat(std::cout);
|
||||
std::cout << std::endl;
|
||||
|
||||
std::string out_name;
|
||||
if (args.size() == 4) {
|
||||
out_name = args[3];
|
||||
} else {
|
||||
out_name = args[2] + (Fast ? ".fdac" : ".dac");
|
||||
}
|
||||
|
||||
std::ofstream ofs{out_name};
|
||||
if (!ofs) {
|
||||
std::cerr << "open error : " << out_name << std::endl;
|
||||
return 1;
|
||||
}
|
||||
trie.write(ofs);
|
||||
|
||||
std::cout << "output -> " << out_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
int query(std::vector<std::string>& args) {
|
||||
if (args.size() != 3 && args.size() != 4) {
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Trie<Fast> trie;
|
||||
{
|
||||
std::ifstream ifs(args[2]);
|
||||
if (!ifs) {
|
||||
std::cerr << "open error : " << args[2] << std::endl;
|
||||
return 1;
|
||||
}
|
||||
trie = Trie<Fast>(ifs);
|
||||
}
|
||||
|
||||
size_t limit = 10;
|
||||
if (args.size() == 4) {
|
||||
limit = std::stoull(args.back());
|
||||
}
|
||||
|
||||
std::string query;
|
||||
|
||||
while (true){
|
||||
std::cout << "> " << std::flush;
|
||||
std::getline(std::cin, query);
|
||||
if (query.empty()){
|
||||
break;
|
||||
}
|
||||
|
||||
std::cout << "Lookup" << std::endl;
|
||||
auto id = trie.lookup(query);
|
||||
if (id == Trie<Fast>::NOT_FOUND) {
|
||||
std::cout << "not found" << std::endl;
|
||||
} else {
|
||||
std::cout << id << '\t' << query << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "Common Prefix Lookup" << std::endl;
|
||||
{
|
||||
size_t N = 0;
|
||||
auto it = trie.make_prefix_iterator(query);
|
||||
while (N < limit && it.next()) {
|
||||
std::cout << it.id() << '\t' << it.key() << std::endl;
|
||||
++N;
|
||||
}
|
||||
|
||||
size_t M = 0;
|
||||
while (it.next()) {
|
||||
++M;
|
||||
}
|
||||
|
||||
if (M != 0) {
|
||||
std::cout << "and more..." << std::endl;
|
||||
}
|
||||
std::cout << N + M << " found" << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "Predictive Lookup" << std::endl;
|
||||
{
|
||||
size_t N = 0;
|
||||
auto it = trie.make_predictive_iterator(query);
|
||||
while (N < limit && it.next()) {
|
||||
std::cout << it.id() << '\t' << it.key() << std::endl;
|
||||
++N;
|
||||
}
|
||||
|
||||
size_t M = 0;
|
||||
while (it.next()) {
|
||||
++M;
|
||||
}
|
||||
|
||||
if (M != 0) {
|
||||
std::cout << "and more..." << std::endl;
|
||||
}
|
||||
std::cout << N + M << " found" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<bool Fast>
|
||||
int bench(std::vector<std::string>& args) {
|
||||
if (args.size() != 4) {
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Trie<Fast> trie;
|
||||
{
|
||||
std::ifstream ifs(args[2]);
|
||||
if (!ifs) {
|
||||
std::cerr << "open error : " << args[2] << std::endl;
|
||||
return 1;
|
||||
}
|
||||
trie = Trie<Fast>(ifs);
|
||||
}
|
||||
|
||||
std::vector<std::string> keys_buffer;
|
||||
if (read_keys(args[3].c_str(), keys_buffer) == 0) {
|
||||
std::cerr << "open error : " << args[3] << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto keys = extract_views(keys_buffer);
|
||||
std::vector<id_type> ids(keys.size());
|
||||
|
||||
std::cout << "Warm up" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
ids[i] = trie.lookup(keys[i]);
|
||||
if (ids[i] == Trie<Fast>::NOT_FOUND) {
|
||||
std::cerr << "A non-registered key is included, "
|
||||
<< keys_buffer[i] << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::cout << "Lookup benchmark on " << RUNS << " runs" << std::endl;
|
||||
|
||||
StopWatch sw;
|
||||
for (uint32_t r = 0; r < RUNS; ++r) {
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
if (trie.lookup(keys[i]) != ids[i]) {
|
||||
std::cerr << "Critical lookup error ʅ( ՞ਊ՞)ʃ" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << sw.micro_sec() / RUNS / keys.size()
|
||||
<< " us per str" << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::cout << "Access benchmark on " << RUNS << " runs" << std::endl;
|
||||
|
||||
StopWatch sw;
|
||||
for (uint32_t r = 0; r < RUNS; ++r) {
|
||||
for (auto id : ids) {
|
||||
auto dec = trie.access(id);
|
||||
if (dec.empty()) {
|
||||
std::cerr << "Critical access error ʅ( ՞ਊ՞)ʃ" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << sw.micro_sec() / RUNS / ids.size()
|
||||
<< " us per ID" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int main(int argc, const char* argv[]) {
|
||||
if (argc < 3) {
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<std::string> args;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
args.emplace_back(std::string{argv[i]});
|
||||
}
|
||||
|
||||
bool is_fast;
|
||||
if (args[1][0] == '1') {
|
||||
is_fast = false;
|
||||
} else if (args[1][0] == '2') {
|
||||
is_fast = true;
|
||||
} else {
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args[0] == "build") {
|
||||
return is_fast ? build<true>(args) : build<false>(args);
|
||||
} else if (args[0] == "query") {
|
||||
return is_fast ? query<true>(args) : query<false>(args);
|
||||
} else if (args[0] == "bench") {
|
||||
return is_fast ? bench<true>(args) : bench<false>(args);
|
||||
}
|
||||
|
||||
show_usage(std::cerr);
|
||||
return 1;
|
||||
}
|
|
@ -1,7 +0,0 @@
|
|||
#ifndef XCDAT_CONFIG_HPP
|
||||
#define XCDAT_CONFIG_HPP
|
||||
|
||||
#cmakedefine XCDAT_X64
|
||||
#cmakedefine XCDAT_USE_POPCNT
|
||||
|
||||
#endif // XCDAT_CONFIG_HPP
|
Loading…
Reference in a new issue