rm old files and add bit_vector

This commit is contained in:
Shunsuke Kanda 2021-06-26 02:18:57 +09:00
parent 9de3d4348a
commit 96e039bda7
36 changed files with 6949 additions and 3900 deletions

113
.clang-format Normal file
View file

@ -0,0 +1,113 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Latest
TabWidth: 8
UseTab: Never
...

View file

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.1) cmake_minimum_required(VERSION 3.0)
project(XCDAT) project(xcdat VERSION 1.0.0 LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
@ -7,55 +7,30 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release) set(CMAKE_BUILD_TYPE Release)
endif () endif ()
configure_file( if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
${XCDAT_SOURCE_DIR}/xcdat_config.hpp.in set(CMAKE_COMPILER_IS_CLANGXX 1)
${XCDAT_SOURCE_DIR}/include/xcdat/xcdat_config.hpp
)
message(STATUS "XCDAT_SOURCE_DIR is ${XCDAT_SOURCE_DIR}")
option(XCDAT_X64
"Use 64-bit integers for node representation."
OFF)
option(XCDAT_USE_POPCNT
"Use popcount intrinsic. Available on x86-64 since SSE4.2."
OFF)
if (XCDAT_USE_POPCNT)
if (UNIX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
endif ()
endif () endif ()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_COMPILER_IS_GNUCXX 1)
endif ()
# C++17 compiler check
if ((CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 7.0) OR (CMAKE_COMPILER_IS_CLANGXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.0))
message(FATAL_ERROR "Your C++ compiler does not support C++17. Please install g++ 7.0 (or greater) or clang 4.0 (or greater)")
else ()
message(STATUS "Compiler is recent enough to support C++17.")
endif ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1z -pthread -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -O3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer -O0 -g -DDEBUG")
message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}") message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}") message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}")
message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}") message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}") message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")
message(STATUS "XCDAT_X64 is ${XCDAT_X64}")
message(STATUS "XCDAT_USE_POPCNT is ${XCDAT_USE_POPCNT}")
file(GLOB HEADER_FILES include/xcdat/*.hpp)
file(GLOB SOURCE_FILES src/*.cpp)
include_directories(include) include_directories(include)
add_library(xcdat STATIC ${HEADER_FILES} ${SOURCE_FILES})
add_subdirectory(tool)
add_subdirectory(sample)
enable_testing() enable_testing()
add_subdirectory(test) add_subdirectory(test)
install(FILES include/xcdat.hpp DESTINATION include)
install(FILES ${HEADER_FILES} DESTINATION include/xcdat)
install(TARGETS xcdat
EXPORT xcdat-targets
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
RUNTIME DESTINATION bin)
install(EXPORT xcdat-targets
FILE xcdat-config.cmake
DESTINATION lib/cmake/xcdat)

View file

@ -1,308 +0,0 @@
% Xcdat: XOR-compressed double-array trie
% Shunsuke Kanda
% 2017
## What is Xcdat?
Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie.
The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article.
> Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [[doi](https://doi.org/10.1007/s10115-016-0999-8)] [[pdf](https://sites.google.com/site/shnskknd/KAIS2016.pdf)]
Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.
Xcdat is available at the [GitHub repository](https://github.com/kampersanda/xcdat).
## Features
- **Compressed Data Structure**: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (depending on the dataset). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space.
- **Two Compression Approaches**: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions.
- **64-bit Version**: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining `XCDAT_X64`; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal.
- **NULL Character**: The dictionary can be constructed from keys including the NULL character by setting the second parameter of `xcdat::TrieBuilder::build()` to `true`.
- **Dictionary Encoding**: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing, such as described in (Martínez-Prieto et al., 2016).
- **Fast Operations**: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.
- **Prefix-based Lookup Operations**: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.
## Build Instructions
You can download and compile Xcdat using the following commands.
```
$ git clone https://github.com/kampersanda/xcdat.git
$ cd xcdat
$ mkdir build
$ cd build
$ cmake ..
$ make
$ make install
```
If you want to use a 64-bit setting, please add `-DXCDAT_X64=ON` to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding `-DXCDAT_USE_POPCNT=ON` for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS.
## Command Line Tools
`xcdat` is a general-purpose command line tool to provide three modes as follows.
```
$ xcdat
xcdat build <type> <key> <dict>
<type> 1: DACs, 2: FDACs
<key> Input file name of a set of keys (must be sorted)
<dict> Output file name of the dictionary (optional)
If omitted, <key>.dacs or <key>.fdacs is output
xcdat query <type> <dict> <limit>
<type> 1: DACs, 2: FDACs
<dict> Input file name of the dictionary
<limit> Limit of #results (optional, default=10)
xcdat bench <type> <dict> <key>
<type> 1: DACs, 2: FDACs
<dict> Input file name of the dictionary
<key> Input file name of keys for benchmark
```
### Example 1: Construction
Command `xcdat build [params...]` builds Xcdat dictionaries from a given dataset and saves it to a file, as follows.
```
$ xcdat build 1 jawiki-all-titles
constr. time: 1.58574 sec
cmpr. ratio: 0.524287 over the raw size
basic statistics of xcdat::Trie<false>
num keys: 1738995
alphabet size: 189
num nodes: 4042496
num used nodes: 4034357
num free nodes: 8139
size in bytes: 20546967
member size statistics of xcdat::Trie<false>
bc: 13879098 0.675482
terminal_flags: 708448 0.0344794
tail: 5958655 0.290002
boundary_flags: 40 1.94676e-06
basic statistics of xcdat::DacBc
num links: 1499605
bytes per node: 3.4333
member size statistics of xcdat::DacBc
values_L0: 8085000 0.582531
values_L1: 746760 0.0538046
values_L2: 22581 0.00162698
flags_L0: 1389660 0.100126
flags_L1: 128400 0.00925132
leaves: 694856 0.0500649
links: 2811784 0.202591
output -> jawiki-all-titles.dac
```
### Example 2: Query Processing
Command `xcdat query [params...]` loads a dictionary file and tests lookup operations, as follows.
```
$ xcdat query 1 jawiki-all-titles.dac
> NEW_GAME!
Lookup
125989 NEW_GAME!
Common Prefix Lookup
28 N
124185 NE
125428 NEW
125988 NEW_GAME
125989 NEW_GAME!
5 found
Predictive Lookup
125989 NEW_GAME!
126003 NEW_GAME!!
126059 NEW_GAME!_-THE_CHALLENGE_STAGE!-
3 found
```
### Example 3: Benchmark Test
Command `xcdat bench [params...]` tests time performances of a given dictionary, as follows.
```
$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd
Warm up
Lookup benchmark on 10 runs
1.5065 us per str
Access benchmark on 10 runs
1.81289 us per ID
```
## Sample Usage
The following code shows an easy routine sample.
```cpp
#include <iostream>
#include <xcdat.hpp>
int main() {
std::vector<std::string> keys_buf = {
"Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin",
"Hazuki", "Umiko", "Nene", "Nenecchi"
};
// Convert to the input format
std::vector<std::string_view> keys(keys_buf.size());
for (size_t i = 0; i < keys.size(); ++i) {
keys[i] = std::string_view{keys_buf[i]};
}
// Input data must be sorted.
std::sort(std::begin(keys), std::end(keys));
// Dictionary class
using Trie = xcdat::Trie<true>;
try {
// Builds a dictionary from the keys
Trie trie = xcdat::TrieBuilder::build<true>(keys); // move
// Writes the dictionary to a file.
std::ofstream ofs{"sample.bin"};
trie.write(ofs);
} catch (const xcdat::TrieBuilder::Exception& ex) {
// Abort if something went wrong...
std::cerr << ex.what() << std::endl;
return 1;
}
// Creates an empty dictionary
Trie trie;
{
// Reads the dictionary from the file.
std::ifstream ifs{"sample.bin"};
trie = Trie{ifs}; // move
}
std::cout << "Performing basic operations..." << std::endl;
{
// lookup() obtains the unique ID for a given key
xcdat::id_type key_id = trie.lookup("Rin");
// access() decodes the key from a given ID
std::cout << key_id << " : " << trie.access(key_id) << std::endl;
// Given an unregistered key, lookup() returns NOT_FOUND.
if (trie.lookup("Hotaru") == Trie::NOT_FOUND) {
std::cout << "? : " << "Hotaru" << std::endl;
}
}
std::cout << "Performing a common prefix operation..." << std::endl;
{
// Common prefix operation is implemented using PrefixIterator, created by
// make_prefix_iterator().
Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi");
// next() continues to obtain the next key until false is returned.
while (it.next()) {
std::cout << it.id() << " : " << it.key() << std::endl;
}
}
std::cout << "Performing a predictive operation..." << std::endl;
{
// Predictive operation is implemented using PredictiveIterator, created by
// make_predictive_iterator().
Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha");
// next() continues to obtain the next key until false is returned in
// lexicographical order.
while (it.next()) {
std::cout << it.id() << " : " << it.key() << std::endl;
}
}
std::cout << "Enumerating all registered keys..." << std::endl;
{
// PredictiveIterator for an empty string provides enumeration of all
// registered keys in lexicographical order.
Trie::PredictiveIterator it = trie.make_predictive_iterator("");
while (it.next()) {
std::cout << it.id() << " : " << it.key() << std::endl;
}
}
return 0;
}
```
The standard output is as follows.
```
Performing basic operations...
7 : Rin
? : Hotaru
Performing a common prefix operation...
4 : Nene
6 : Nenecchi
Performing a predictive operation...
3 : Hajime
5 : Hazuki
Enumerating all registered keys...
0 : Aoba
3 : Hajime
5 : Hazuki
1 : Hihumi
2 : Kou
4 : Nene
6 : Nenecchi
7 : Rin
8 : Umiko
9 : Yun
```
As shown in the output, `xcdat::Trie` assigns unique integer IDs to each registered key. The ID order is random, depending on node arrangement.
## API
You can build a dictionary using static member function `xcdat::TrieBuilder::build()`.
This function receives a set of keywords and returns the resulting class object of `xcdat::Trie`.
For the usage, refer to the header comments of [`xcdat::TrieBuilder.hpp`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/TrieBuilder.hpp).
Also for the usage of `xcdat::Trie`, refer to the header comments of [`xcdat::Trie`](https://github.com/kampersanda/xcdat/blob/master/include/xcdat/Trie.hpp).
The detailed descriptions of the API are under construction...
## Benchmark
Work in progress...
## To Do
- Show benchmarks
- Create API descriptions
## Licensing
This library is free software provided under the MIT License.
## Citation
If you use the library in academic settings, please cite the following paper.
```bibtex
@article{kanda2017compressed,
title={Compressed double-array tries for string dictionaries supporting fast lookup},
author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},
journal={Knowledge and Information Systems},
volume={51},
number={3},
pages={1023--1042},
year={2017},
publisher={Springer}
}
```
## References
- J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.
- N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.
- S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3):1023–1042, 2017.
- M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016.
- S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.

View file

@ -1,348 +0,0 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<meta name="author" content="Shunsuke Kanda" />
<meta name="dcterms.date" content="2017-01-01" />
<title>Xcdat: XOR-compressed double-array trie</title>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.line-block{white-space: pre-line;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
</style>
<style type="text/css">
div.sourceLine, a.sourceLine { display: inline-block; min-height: 1.25em; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; }
@media print {
code.sourceCode { white-space: pre-wrap; }
div.sourceLine, a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource div.sourceLine, .numberSource a.sourceLine
{ position: relative; }
pre.numberSource div.sourceLine::before, .numberSource a.sourceLine::before
{ content: attr(data-line-number);
position: absolute; left: -5em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em; }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; color: #aaaaaa; padding-left: 4px; }
@media screen {
a.sourceLine::before { text-decoration: underline; color: initial; }
}
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.bn { color: #40a070; } /* BaseN */
code span.fl { color: #40a070; } /* Float */
code span.ch { color: #4070a0; } /* Char */
code span.st { color: #4070a0; } /* String */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.ot { color: #007020; } /* Other */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.fu { color: #06287e; } /* Function */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code span.cn { color: #880000; } /* Constant */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.ss { color: #bb6688; } /* SpecialString */
code span.im { } /* Import */
code span.va { color: #19177c; } /* Variable */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.op { color: #666666; } /* Operator */
code span.bu { } /* BuiltIn */
code span.ex { } /* Extension */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.at { color: #7d9029; } /* Attribute */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
<link rel="stylesheet" href="style.css">
<!--[if lt IE 9]>
<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
<![endif]-->
</head>
<body>
<header>
<h1 class="title">Xcdat: XOR-compressed double-array trie</h1>
<p align="center">Created by <a href="https://github.com/kampersanda">Shunsuke Kanda</a></p>
</header>
<h2>Contents</h2>
<nav id="TOC">
<ul>
<li><a href="#what-is-xcdat">What is Xcdat?</a></li>
<li><a href="#features">Features</a></li>
<li><a href="#build-instructions">Build Instructions</a></li>
<li><a href="#command-line-tools">Command Line Tools</a></li>
<li><a href="#sample-usage">Sample Usage</a></li>
<li><a href="#api">API</a></li>
<li><a href="#benchmark">Benchmark</a></li>
<li><a href="#to-do">To Do</a></li>
<li><a href="#licensing">Licensing</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#references">References</a></li>
</ul>
</nav>
<h2 id="what-is-xcdat">What is Xcdat?</h2>
<p>Xcdat is a C++ library that implements static compressed string dictionaries based on an improved double-array trie.</p>
<p>The double array (Aoe, 1989) is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a space efficiency problem because of a pointer-based data structure. Xcdat solves the problem using the XOR-compressed double-array methods described in the following article.</p>
<blockquote>
<p>Shunsuke Kanda, Kazuhiro Morita, and Masao Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017. [<a href="https://doi.org/10.1007/s10115-016-0999-8">doi</a>] [<a href="https://sites.google.com/site/shnskknd/KAIS2016.pdf">pdf</a>]</p>
</blockquote>
<p>Xcdat can implement trie dictionaries in smaller space compared to the other double-array libraries. In addition, the lookup speed is relatively fast in compressed data structures from the double-array advantage.</p>
<p>Xcdat is available at the <a href="https://github.com/kampersanda/xcdat">GitHub repository</a>.</p>
<h2 id="features">Features</h2>
<ul>
<li><strong>Compressed Data Structure</strong>: Xcdat practically compresses double-array elements for representing node pointers by using the XCDA methods. While the original double array uses 8 bytes (or 16 bytes) per node, it uses about 3–4 bytes (depending on the dataset). In addition, the dictionary is implemented using a minimal-prefix trie (Yata et al., 2007) that is effective for long strings in time and space.</li>
<li><strong>Two Compression Approaches</strong>: There are two approaches of compressing elements: using byte-oriented DACs (Brisaboa et al., 2013) and using pointer-based ones (Kanda et al., 2017). For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using a static polymorphism with C++ template to avoid an overhead of virtual functions.</li>
<li><strong>64-bit Version</strong>: Although Xcdat represents node addresses using 32-bit integers in default configuration, we can allow for 64-bit integers by defining <code>XCDAT_X64</code>; therefore, the dictionary can be constructed from a very large dataset. The construction space becomes large, but the output dictionary size is nearly equal.</li>
<li><strong>NULL Character</strong>: The dictionary can be constructed from keys including the NULL character by setting the second parameter of <code>xcdat::TrieBuilder::build()</code> to <code>true</code>.</li>
<li><strong>Dictionary Encoding</strong>: Xcdat supports mapping N different strings to unique IDs in [0,N-1]. That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing, such as described in (Martínez-Prieto et al., 2016).</li>
<li><strong>Fast Operations</strong>: Xcdat can provide lookup operations faster than other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed will be slower due to the compression.</li>
<li><strong>Prefix-based Lookup Operations</strong>: As with other trie libraries, Xcdat also provides prefix-based lookup operations required for natural language processing and so on.</li>
</ul>
<h2 id="build-instructions">Build Instructions</h2>
<p>You can download and compile Xcdat as the following commands.</p>
<pre><code>$ git clone https://github.com/kampersanda/xcdat.git
$ cd xcdat
$ mkdir build
$ cd build
$ cmake ..
$ make
$ make install</code></pre>
<p>If you want to use a 64-bit setting, please add <code>-DXCDAT_X64=ON</code> to the CMake option. In addition, you can use the SSE4.2 POPCNT instruction by adding <code>-DXCDAT_USE_POPCNT=ON</code> for Rank/Select operations. The code has been tested only on Mac OS X and Linux. That is, this library considers only UNIX-compatible OS.</p>
<h2 id="command-line-tools">Command Line Tools</h2>
<p><code>xcdat</code> is a general-purpose command line tool to provide three modes as follows.</p>
<pre><code>$ xcdat
xcdat build &lt;type&gt; &lt;key&gt; &lt;dict&gt;
&lt;type&gt; 1: DACs, 2: FDACs
&lt;key&gt; Input file name of a set of keys (must be sorted)
&lt;dict&gt; Output file name of the dictionary (optional)
If omitted, &lt;key&gt;.dacs or &lt;key&gt;.fdacs is output
xcdat query &lt;type&gt; &lt;dict&gt; &lt;limit&gt;
&lt;type&gt; 1: DACs, 2: FDACs
&lt;dict&gt; Input file name of the dictionary
&lt;limit&gt; Limit of #results (optional, default=10)
xcdat bench &lt;type&gt; &lt;dict&gt; &lt;key&gt;
&lt;type&gt; 1: DACs, 2: FDACs
&lt;dict&gt; Input file name of the dictionary
&lt;key&gt; Input file name of keys for benchmark</code></pre>
<h3 id="example-1-construction">Example 1: Construction</h3>
<p>Command <code>xcdat build [params...]</code> builds Xcdat dictionaries from a given dataset and saves it to a file, as follows.</p>
<pre><code>$ xcdat build 1 jawiki-all-titles
constr. time: 1.58574 sec
cmpr. ratio: 0.524287 over the raw size
basic statistics of xcdat::Trie&lt;false&gt;
num keys: 1738995
alphabet size: 189
num nodes: 4042496
num used nodes: 4034357
num free nodes: 8139
size in bytes: 20546967
member size statistics of xcdat::Trie&lt;false&gt;
bc: 13879098 0.675482
terminal_flags: 708448 0.0344794
tail: 5958655 0.290002
boundary_flags: 40 1.94676e-06
basic statistics of xcdat::DacBc
num links: 1499605
bytes per node: 3.4333
member size statistics of xcdat::DacBc
values_L0: 8085000 0.582531
values_L1: 746760 0.0538046
values_L2: 22581 0.00162698
flags_L0: 1389660 0.100126
flags_L1: 128400 0.00925132
leaves: 694856 0.0500649
links: 2811784 0.202591
output -&gt; jawiki-all-titles.dac</code></pre>
<h3 id="example-2-query-processing">Example 2: Query Processing</h3>
<p>Command <code>xcdat query [params...]</code> loads a dictionary file and tests lookup operations, as follows.</p>
<pre><code>$ xcdat query 1 jawiki-all-titles.dac
&gt; NEW_GAME!
Lookup
125989 NEW_GAME!
Common Prefix Lookup
28 N
124185 NE
125428 NEW
125988 NEW_GAME
125989 NEW_GAME!
5 found
Predictive Lookup
125989 NEW_GAME!
126003 NEW_GAME!!
126059 NEW_GAME!_-THE_CHALLENGE_STAGE!-
3 found</code></pre>
<h3 id="example-3-benchmark-test">Example 3: Benchmark Test</h3>
<p>Command <code>xcdat bench [params...]</code> tests time performances of a given dictionary, as follows.</p>
<pre><code>$ xcdat bench 1 jawiki-all-titles.dac jawiki-all-titles.rnd
Warm up
Lookup benchmark on 10 runs
1.5065 us per str
Access benchmark on 10 runs
1.81289 us per ID</code></pre>
<h2 id="sample-usage">Sample Usage</h2>
<p>The following code shows an easy routine sample.</p>
<pre class="sourceCode cpp" id="cb6"><code class="sourceCode cpp"><div class="sourceLine" id="cb6-1" data-line-number="1"><span class="pp">#include </span><span class="im">&lt;iostream&gt;</span></div>
<div class="sourceLine" id="cb6-2" data-line-number="2"><span class="pp">#include </span><span class="im">&lt;xcdat.hpp&gt;</span></div>
<div class="sourceLine" id="cb6-3" data-line-number="3"></div>
<div class="sourceLine" id="cb6-4" data-line-number="4"><span class="dt">int</span> main() {</div>
<div class="sourceLine" id="cb6-5" data-line-number="5"> <span class="bu">std::</span>vector&lt;<span class="bu">std::</span>string&gt; keys_buf = {</div>
<div class="sourceLine" id="cb6-6" data-line-number="6"> <span class="st">&quot;Aoba&quot;</span>, <span class="st">&quot;Yun&quot;</span>, <span class="st">&quot;Hajime&quot;</span>, <span class="st">&quot;Hihumi&quot;</span>, <span class="st">&quot;Kou&quot;</span>, <span class="st">&quot;Rin&quot;</span>,</div>
<div class="sourceLine" id="cb6-7" data-line-number="7"> <span class="st">&quot;Hazuki&quot;</span>, <span class="st">&quot;Umiko&quot;</span>, <span class="st">&quot;Nene&quot;</span>, <span class="st">&quot;Nenecchi&quot;</span></div>
<div class="sourceLine" id="cb6-8" data-line-number="8"> };</div>
<div class="sourceLine" id="cb6-9" data-line-number="9"></div>
<div class="sourceLine" id="cb6-10" data-line-number="10"> <span class="co">// Convert to the input format</span></div>
<div class="sourceLine" id="cb6-11" data-line-number="11"> <span class="bu">std::</span>vector&lt;<span class="bu">std::</span>string_view&gt; keys(keys_buf.size());</div>
<div class="sourceLine" id="cb6-12" data-line-number="12"> <span class="cf">for</span> (<span class="dt">size_t</span> i = <span class="dv">0</span>; i &lt; keys.size(); ++i) {</div>
<div class="sourceLine" id="cb6-13" data-line-number="13"> keys[i] = <span class="bu">std::</span>string_view{keys_buf[i]};</div>
<div class="sourceLine" id="cb6-14" data-line-number="14"> }</div>
<div class="sourceLine" id="cb6-15" data-line-number="15"></div>
<div class="sourceLine" id="cb6-16" data-line-number="16"> <span class="co">// Input data must be sorted.</span></div>
<div class="sourceLine" id="cb6-17" data-line-number="17"> <span class="bu">std::</span>sort(<span class="bu">std::</span>begin(keys), <span class="bu">std::</span>end(keys));</div>
<div class="sourceLine" id="cb6-18" data-line-number="18"></div>
<div class="sourceLine" id="cb6-19" data-line-number="19"> <span class="co">// Dictionary class</span></div>
<div class="sourceLine" id="cb6-20" data-line-number="20"> <span class="kw">using</span> Trie = xcdat::Trie&lt;<span class="kw">true</span>&gt;;</div>
<div class="sourceLine" id="cb6-21" data-line-number="21"></div>
<div class="sourceLine" id="cb6-22" data-line-number="22"> <span class="cf">try</span> {</div>
<div class="sourceLine" id="cb6-23" data-line-number="23"> <span class="co">// Builds a dictionary from the keys</span></div>
<div class="sourceLine" id="cb6-24" data-line-number="24"> Trie trie = xcdat::TrieBuilder::build&lt;<span class="kw">true</span>&gt;(keys); <span class="co">// move</span></div>
<div class="sourceLine" id="cb6-25" data-line-number="25"></div>
<div class="sourceLine" id="cb6-26" data-line-number="26"> <span class="co">// Writes the dictionary to a file.</span></div>
<div class="sourceLine" id="cb6-27" data-line-number="27"> <span class="bu">std::</span>ofstream ofs{<span class="st">&quot;sample.bin&quot;</span>};</div>
<div class="sourceLine" id="cb6-28" data-line-number="28"> trie.write(ofs);</div>
<div class="sourceLine" id="cb6-29" data-line-number="29"> } <span class="cf">catch</span> (<span class="at">const</span> xcdat::TrieBuilder::Exception&amp; ex) {</div>
<div class="sourceLine" id="cb6-30" data-line-number="30"> <span class="co">// Abort if something went wrong...</span></div>
<div class="sourceLine" id="cb6-31" data-line-number="31"> <span class="bu">std::</span>cerr &lt;&lt; ex.what() &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-32" data-line-number="32"> <span class="cf">return</span> <span class="dv">1</span>;</div>
<div class="sourceLine" id="cb6-33" data-line-number="33"> }</div>
<div class="sourceLine" id="cb6-34" data-line-number="34"></div>
<div class="sourceLine" id="cb6-35" data-line-number="35"> <span class="co">// Creates an empty dictionary</span></div>
<div class="sourceLine" id="cb6-36" data-line-number="36"> Trie trie;</div>
<div class="sourceLine" id="cb6-37" data-line-number="37"> {</div>
<div class="sourceLine" id="cb6-38" data-line-number="38"> <span class="co">// Reads the dictionary from the file.</span></div>
<div class="sourceLine" id="cb6-39" data-line-number="39"> <span class="bu">std::</span>ifstream ifs{<span class="st">&quot;sample.bin&quot;</span>};</div>
<div class="sourceLine" id="cb6-40" data-line-number="40"> trie = Trie{ifs}; <span class="co">// move</span></div>
<div class="sourceLine" id="cb6-41" data-line-number="41"> }</div>
<div class="sourceLine" id="cb6-42" data-line-number="42"></div>
<div class="sourceLine" id="cb6-43" data-line-number="43"> <span class="bu">std::</span>cout &lt;&lt; <span class="st">&quot;Performing basic operations...&quot;</span> &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-44" data-line-number="44"> {</div>
<div class="sourceLine" id="cb6-45" data-line-number="45"> <span class="co">// lookup() obtains the unique ID for a given key</span></div>
<div class="sourceLine" id="cb6-46" data-line-number="46"> xcdat::<span class="dt">id_type</span> key_id = trie.lookup(<span class="st">&quot;Rin&quot;</span>);</div>
<div class="sourceLine" id="cb6-47" data-line-number="47"> <span class="co">// access() decodes the key from a given ID</span></div>
<div class="sourceLine" id="cb6-48" data-line-number="48"> <span class="bu">std::</span>cout &lt;&lt; key_id &lt;&lt; <span class="st">&quot; : &quot;</span> &lt;&lt; trie.access(key_id) &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-49" data-line-number="49"></div>
<div class="sourceLine" id="cb6-50" data-line-number="50"> <span class="co">// Given an unregistered key, lookup() returns NOT_FOUND.</span></div>
<div class="sourceLine" id="cb6-51" data-line-number="51"> <span class="cf">if</span> (trie.lookup(<span class="st">&quot;Hotaru&quot;</span>) == Trie::NOT_FOUND) {</div>
<div class="sourceLine" id="cb6-52" data-line-number="52"> <span class="bu">std::</span>cout &lt;&lt; <span class="st">&quot;? : &quot;</span> &lt;&lt; <span class="st">&quot;Hotaru&quot;</span> &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-53" data-line-number="53"> }</div>
<div class="sourceLine" id="cb6-54" data-line-number="54"> }</div>
<div class="sourceLine" id="cb6-55" data-line-number="55"></div>
<div class="sourceLine" id="cb6-56" data-line-number="56"> <span class="bu">std::</span>cout &lt;&lt; <span class="st">&quot;Performing a common prefix operation...&quot;</span> &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-57" data-line-number="57"> {</div>
<div class="sourceLine" id="cb6-58" data-line-number="58"> <span class="co">// Common prefix operation is implemented using PrefixIterator, created by</span></div>
<div class="sourceLine" id="cb6-59" data-line-number="59"> <span class="co">// make_prefix_iterator().</span></div>
<div class="sourceLine" id="cb6-60" data-line-number="60"> Trie::PrefixIterator it = trie.make_prefix_iterator(<span class="st">&quot;Nenecchi&quot;</span>);</div>
<div class="sourceLine" id="cb6-61" data-line-number="61"></div>
<div class="sourceLine" id="cb6-62" data-line-number="62"> <span class="co">// next() continues to obtain the next key until false is returned.</span></div>
<div class="sourceLine" id="cb6-63" data-line-number="63"> <span class="cf">while</span> (it.next()) {</div>
<div class="sourceLine" id="cb6-64" data-line-number="64"> <span class="bu">std::</span>cout &lt;&lt; it.id() &lt;&lt; <span class="st">&quot; : &quot;</span> &lt;&lt; it.key() &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-65" data-line-number="65"> }</div>
<div class="sourceLine" id="cb6-66" data-line-number="66"> }</div>
<div class="sourceLine" id="cb6-67" data-line-number="67"></div>
<div class="sourceLine" id="cb6-68" data-line-number="68"> <span class="bu">std::</span>cout &lt;&lt; <span class="st">&quot;Performing a predictive operation...&quot;</span> &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-69" data-line-number="69"> {</div>
<div class="sourceLine" id="cb6-70" data-line-number="70"> <span class="co">// Predictive operation is implemented using PredictiveIterator, created by</span></div>
<div class="sourceLine" id="cb6-71" data-line-number="71"> <span class="co">// make_predictive_iterator().</span></div>
<div class="sourceLine" id="cb6-72" data-line-number="72"> Trie::PredictiveIterator it = trie.make_predictive_iterator(<span class="st">&quot;Ha&quot;</span>);</div>
<div class="sourceLine" id="cb6-73" data-line-number="73"></div>
<div class="sourceLine" id="cb6-74" data-line-number="74"> <span class="co">// next() continues to obtain the next key until false is returned in</span></div>
<div class="sourceLine" id="cb6-75" data-line-number="75"> <span class="co">// lexicographical order.</span></div>
<div class="sourceLine" id="cb6-76" data-line-number="76"> <span class="cf">while</span> (it.next()) {</div>
<div class="sourceLine" id="cb6-77" data-line-number="77"> <span class="bu">std::</span>cout &lt;&lt; it.id() &lt;&lt; <span class="st">&quot; : &quot;</span> &lt;&lt; it.key() &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-78" data-line-number="78"> }</div>
<div class="sourceLine" id="cb6-79" data-line-number="79"> }</div>
<div class="sourceLine" id="cb6-80" data-line-number="80"></div>
<div class="sourceLine" id="cb6-81" data-line-number="81"> <span class="bu">std::</span>cout &lt;&lt; <span class="st">&quot;Enumerating all registered keys...&quot;</span> &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-82" data-line-number="82"> {</div>
<div class="sourceLine" id="cb6-83" data-line-number="83"> <span class="co">// PredictiveIterator for an empty string provides enumeration of all</span></div>
<div class="sourceLine" id="cb6-84" data-line-number="84"> <span class="co">// registered keys in lexicographical order.</span></div>
<div class="sourceLine" id="cb6-85" data-line-number="85"> Trie::PredictiveIterator it = trie.make_predictive_iterator(<span class="st">&quot;&quot;</span>);</div>
<div class="sourceLine" id="cb6-86" data-line-number="86"> <span class="cf">while</span> (it.next()) {</div>
<div class="sourceLine" id="cb6-87" data-line-number="87"> <span class="bu">std::</span>cout &lt;&lt; it.id() &lt;&lt; <span class="st">&quot; : &quot;</span> &lt;&lt; it.key() &lt;&lt; <span class="bu">std::</span>endl;</div>
<div class="sourceLine" id="cb6-88" data-line-number="88"> }</div>
<div class="sourceLine" id="cb6-89" data-line-number="89"> }</div>
<div class="sourceLine" id="cb6-90" data-line-number="90"></div>
<div class="sourceLine" id="cb6-91" data-line-number="91"> <span class="cf">return</span> <span class="dv">0</span>;</div>
<div class="sourceLine" id="cb6-92" data-line-number="92">}</div></code></pre>
<p>The standard output is as follows.</p>
<pre><code>Performing basic operations...
7 : Rin
? : Hotaru
Performing a common prefix operation...
4 : Nene
6 : Nenecchi
Performing a predictive operation...
3 : Hajime
5 : Hazuki
Enumerating all registered keys...
0 : Aoba
3 : Hajime
5 : Hazuki
1 : Hihumi
2 : Kou
4 : Nene
6 : Nenecchi
7 : Rin
8 : Umiko
9 : Yun</code></pre>
<p>As shown in the output, <code>xcdat::Trie</code> assigns unique integer IDs to each registered key. The ID order is random, depending on node arrangement.</p>
<h2 id="api">API</h2>
<p>You can build a dictionary using static member function <code>xcdat::TrieBuilder::build()</code>. This function receives a set of keywords and returns the resulting class object of <code>xcdat::Trie</code>. For the usage, refer to the header comments of <a href="https://github.com/kampersanda/xcdat/blob/master/include/xcdat/TrieBuilder.hpp"><code>xcdat::TrieBuilder.hpp</code></a>. Also for the usage of <code>xcdat::Trie</code>, refer to the header comments of <a href="https://github.com/kampersanda/xcdat/blob/master/include/xcdat/Trie.hpp"><code>xcdat::Trie</code></a>.</p>
<p>The detailed descriptions of the API are under construction…</p>
<h2 id="benchmark">Benchmark</h2>
<p>Work in progress…</p>
<h2 id="to-do">To Do</h2>
<ul>
<li>Show benchmarks</li>
<li>Create API descriptions</li>
</ul>
<h2 id="licensing">Licensing</h2>
<p>This library is free software provided under the MIT License.</p>
<h2 id="citation">Citation</h2>
<p>If you use the library in academic settings, please cite the following paper.</p>
<pre class="sourceCode bibtex" id="cb8"><code class="sourceCode bibtex"><div class="sourceLine" id="cb8-1" data-line-number="1"><span class="va">@article</span>{<span class="ot">kanda2017compressed</span>,</div>
<div class="sourceLine" id="cb8-2" data-line-number="2"> <span class="dt">title</span>={Compressed double-array tries for string dictionaries supporting fast lookup},</div>
<div class="sourceLine" id="cb8-3" data-line-number="3"> <span class="dt">author</span>={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},</div>
<div class="sourceLine" id="cb8-4" data-line-number="4"> <span class="dt">journal</span>={Knowledge and Information Systems},</div>
<div class="sourceLine" id="cb8-5" data-line-number="5"> <span class="dt">volume</span>={51},</div>
<div class="sourceLine" id="cb8-6" data-line-number="6"> <span class="dt">number</span>={3},</div>
<div class="sourceLine" id="cb8-7" data-line-number="7"> <span class="dt">pages</span>={1023--1042},</div>
<div class="sourceLine" id="cb8-8" data-line-number="8"> <span class="dt">year</span>={2017},</div>
<div class="sourceLine" id="cb8-9" data-line-number="9"> <span class="dt">publisher</span>={Springer}</div>
<div class="sourceLine" id="cb8-10" data-line-number="10">}</div></code></pre>
<h2 id="references">References</h2>
<ul>
<li>J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.</li>
<li>N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing &amp; Management, 49(1):392–404, 2013.</li>
<li>S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3):1023–1042, 2017.</li>
<li>M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016.</li>
<li>S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing &amp; Management, 43(1):237–247, 2007.</li>
</ul>
<footer>
<p>Copyright &copy; 2017 Shunsuke Kanda, All Rights Reserved.</p>
</footer>
</body>
</html>

View file

@ -1,3 +0,0 @@
#!/bin/sh
pandoc --template=template.html -o index.html document.md -c style.css --toc --toc-depth=2

View file

@ -1,163 +0,0 @@
@import url('https://fonts.googleapis.com/css?family=Comfortaa');
@import url('https://fonts.googleapis.com/css?family=Source+Code+Pro');
@import url('https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css');
body {
background: #fff;
color: #545454;
font-family: 'Comfortaa';
font-size: 16px;
line-height: 1.5;
margin: 0 auto;
max-width: 800px;
padding: 2em 2em 2em;
}
h1,
h2,
h3,
h4,
h5,
h6 {
color: #494949;
font-weight: 600;
line-height: 1.3;
}
h1 {
line-height: 1.7;
text-align: center;
}
h2 {
margin-top: 1.3em;
padding: 0.25em 0.5em;
color: #494949;
background: transparent;
border-left: solid 5px #7db4e6;
}
h3 {
margin-top: 1.3em;
padding: 0.25em 0.0em;
}
h4 {
margin-top: 1.3em;
padding: 0.25em 0.0em;
}
a {
color: #0083e8;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
b,
strong {
font-weight: 600;
background: linear-gradient(transparent 75%, #a7d6ff 70%);
}
img {
animation: colorize 2s cubic-bezier(0, 0, .78, .36) 1;
background: transparent;
border: 10px solid rgba(0, 0, 0, 0.12);
border-radius: 4px;
display: block;
margin: 1.3em auto;
max-width: 95%;
}
blockquote {
position: relative;
padding: 10px 15px 10px 60px;
box-sizing: border-box;
background: #f5f5f5;
color: #777777;
border-left: 4px solid #9dd4ff;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.14);
}
blockquote:before {
display: inline-block;
position: absolute;
top: 15px;
left: 15px;
vertical-align: middle;
content: "\f10d";
font-family: FontAwesome;
color: #9dd4ff;
font-size: 30px;
line-height: 1;
}
blockquote p {
padding: 0;
margin: 7px 0;
}
blockquote cite {
display: block;
text-align: right;
color: #888888;
font-size: 0.9em;
}
ul {
padding: 0 0.5em;
position: relative;
}
ul li {
line-height: 1.5;
padding: 0.2em 0 0.5em 1.5em;
border-bottom: 2px solid white;
list-style-type: none!important;
}
ul li:before {
font-family: FontAwesome;
content: "\f00c";
position: absolute;
left: 0.5em;
color: #9dd4ff;
}
ul li:last-of-type {
border-bottom: none;
}
pre,
code {
background: #f5f5f5;
font-family: 'Source Code Pro', monospace;
}
p code {
padding: 0.1em 0.5em;
}
pre {
font-size: 0.95rem;
padding: 1em;
overflow: auto;
white-space: pre;
}
pre.sourceCode {
font-size: 0.95rem;
padding: 1em;
overflow: auto;
white-space: pre;
}
footer {
font-size: 14px;
color: #8f9296;
text-align: center;
margin-top: 40px;
}

View file

@ -1,68 +0,0 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="$lang$" xml:lang="$lang$"$if(dir)$ dir="$dir$"$endif$>
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
$for(author-meta)$
<meta name="author" content="$author-meta$" />
$endfor$
$if(date-meta)$
<meta name="dcterms.date" content="$date-meta$" />
$endif$
$if(keywords)$
<meta name="keywords" content="$for(keywords)$$keywords$$sep$, $endfor$" />
$endif$
<title>$if(title-prefix)$$title-prefix$ $endif$$pagetitle$</title>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.line-block{white-space: pre-line;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
$if(quotes)$
q { quotes: "“" "”" "‘" "’"; }
$endif$
</style>
$if(highlighting-css)$
<style type="text/css">
$highlighting-css$
</style>
$endif$
$for(css)$
<link rel="stylesheet" href="$css$">
$endfor$
$if(math)$
$math$
$endif$
<!--[if lt IE 9]>
<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
<![endif]-->
$for(header-includes)$
$header-includes$
$endfor$
</head>
<body>
$for(include-before)$
$include-before$
$endfor$
$if(title)$
<header>
<h1 class="title">$title$</h1>
<p align="center">Created by <a href="https://github.com/kampersanda">$author$</a></p>
</header>
$endif$
$if(toc)$
<h2>Contents</h2>
<nav id="$idprefix$TOC">
$table-of-contents$
</nav>
$endif$
$body$
$for(include-after)$
$include-after$
$endfor$
<footer>
<p>Copyright &copy; $date$ $author$, All Rights Reserved.</p>
</footer>
</body>
</html>

View file

@ -1,6 +1 @@
#ifndef XCDAT_XCDAT_HPP_ #pragma once
#define XCDAT_XCDAT_HPP_
#include "xcdat/TrieBuilder.hpp"
#endif //XCDAT_XCDAT_HPP_

View file

@ -1,73 +0,0 @@
#ifndef XCDAT_BIT_VECTOR_HPP_
#define XCDAT_BIT_VECTOR_HPP_
#include "BitVectorBuilder.hpp"
#include "Vector.hpp"
namespace xcdat {
// Bit vector with rank/select support. Bits are packed 32 per word in bits_;
// rank(), select(), size_in_bytes(), write() and the non-default constructors
// are defined out of line.
class BitVector {
 public:
  BitVector() = default;
  ~BitVector() = default;

  // Reads a bit vector from a stream.
  explicit BitVector(std::istream &is);
  // Builds from the bits collected in `builder`. The two flags presumably
  // control whether the rank / select indexes are built -- confirm against
  // the out-of-line definition.
  BitVector(BitVectorBuilder& builder, bool rank_flag, bool select_flag);

  // Move-only type.
  BitVector(const BitVector&) = delete;
  BitVector& operator=(const BitVector&) = delete;
  BitVector(BitVector&&) noexcept = default;
  BitVector& operator=(BitVector&&) noexcept = default;

  // Returns the i-th bit. No bounds check is performed.
  bool operator[](size_t i) const {
    const auto word = bits_[i / 32];
    const auto mask = 1U << (i % 32);
    return (word & mask) != 0;
  }

  // The number of 1s in B[0,i).
  id_type rank(id_type i) const;

  // The position of the (i+1)-th occurrence.
  id_type select(id_type i) const;

  // Number of set bits.
  size_t num_1s() const { return num_1s_; }

  // Number of unset bits: total size minus the number of set bits.
  size_t num_0s() const { return size_ - num_1s_; }

  // The number of bits.
  size_t size() const { return size_; }

  size_t size_in_bytes() const;

  void write(std::ostream &os) const;

  // Exchanges contents with rhs through std::swap (move construction and
  // move assignment).
  void swap(BitVector& rhs) { std::swap(*this, rhs); }

 private:
  static constexpr id_type BITS_IN_R1 {256};
  static constexpr id_type BITS_IN_R2 {32};
  static constexpr id_type R1_PER_R2 {BITS_IN_R1 / BITS_IN_R2}; // 8
  static constexpr id_type ONES_PER_TIP {512};

  // One entry of the rank index: a wide counter plus R1_PER_R2 byte counters.
  struct RankTip {
    id_type L1;
    uint8_t L2[R1_PER_R2];
  };

  Vector<uint32_t> bits_ {};
  Vector<RankTip> rank_tips_ {};
  Vector<id_type> select_tips_ {};
  size_t size_ {};
  size_t num_1s_ {};
};
} //namespace - xcdat
#endif //XCDAT_BIT_VECTOR_HPP_

View file

@ -1,64 +0,0 @@
#ifndef XCDAT_BIT_VECTOR_BUILDER_HPP_
#define XCDAT_BIT_VECTOR_BUILDER_HPP_
#include "xcdat_basics.hpp"
namespace xcdat {
// Bit pool for building BitVector. Bits are packed 32 per word; the class
// also tracks how many bits are currently set so that BitVector (a friend)
// can read the count directly.
class BitVectorBuilder {
 public:
  friend class BitVector;

  BitVectorBuilder() = default;
  ~BitVectorBuilder() = default;

  // Creates a pool of `size` bits, all zero.
  explicit BitVectorBuilder(size_t size) {
    resize(size);
  }

  // Appends one bit at position size().
  void push_back(bool bit) {
    if (size_ % 32 == 0) {
      bits_.push_back(0);
    }
    if (bit) {
      set_bit(size_, true);
    }
    ++size_;
  }

  // Sets the i-th bit to `bit`. No bounds check is performed.
  // The set-bit counter is adjusted only when the stored bit actually
  // changes; previously setting an already-set bit double-counted it and
  // clearing an already-clear bit made the unsigned counter wrap around.
  void set_bit(size_t i, bool bit) {
    const uint32_t mask = 1U << (i % 32);
    uint32_t& word = bits_[i / 32];
    const bool was_set = (word & mask) != 0;
    if (bit == was_set) {
      return;
    }
    if (bit) {
      word |= mask;
      ++num_1s_;
    } else {
      word &= ~mask;
      --num_1s_;
    }
  }

  // Resizes the pool to `size` bits; newly added words are zero.
  // NOTE(review): shrinking neither clears bits past the new size nor
  // recounts num_1s_ -- callers appear to only grow the pool; confirm.
  void resize(size_t size) {
    bits_.resize(size / 32 + 1, 0);
    size_ = size;
  }

  // Reserves storage for at least `capacity` bits.
  void reserve(size_t capacity) {
    bits_.reserve(capacity / 32 + 1);
  }

  // The number of bits.
  size_t size() const {
    return size_;
  }

  // The number of bits currently set to 1.
  size_t num_1s() const {
    return num_1s_;
  }

  BitVectorBuilder(const BitVectorBuilder&) = delete;
  BitVectorBuilder& operator=(const BitVectorBuilder&) = delete;

 private:
  std::vector<uint32_t> bits_ {};
  size_t size_ {};
  size_t num_1s_ {};
};
} //namespace - xcdat
#endif //XCDAT_BIT_VECTOR_BUILDER_HPP_

View file

@ -1,75 +0,0 @@
#ifndef XCDAT_DAC_BC_HPP_
#define XCDAT_DAC_BC_HPP_
#include "BitVector.hpp"
#include "FitVector.hpp"
namespace xcdat {
// BASE/CHECK representation using byte-oriented DACs. For node i, the BASE
// value is stored at slot 2*i and the CHECK value at slot 2*i+1; both are
// stored XOR-ed with i. Stream/bulk constructors, size_in_bytes(),
// show_stat(), write() and access_() are defined out of line.
class DacBc {
 public:
  static constexpr id_type WIDTH_L1 {8};

  DacBc() = default;
  ~DacBc() = default;

  // Reads the structure from a stream.
  explicit DacBc(std::istream &is);
  // Builds the structure from BASE/CHECK pairs and per-node leaf flags.
  explicit DacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags);

  // Move-only type.
  DacBc(const DacBc&) = delete;
  DacBc& operator=(const DacBc&) = delete;
  DacBc(DacBc&&) noexcept = default;
  DacBc& operator=(DacBc&&) noexcept = default;

  // BASE value of node i.
  id_type base(id_type i) const {
    const id_type stored = access_(i * 2);
    return stored ^ i;
  }

  // Link value of node i: the low byte comes from the first-level array, the
  // upper bits from links_, indexed by the rank of i among the leaf flags.
  id_type link(id_type i) const {
    const id_type low = values_[0][i * 2];
    const id_type high = links_[leaf_flags_.rank(i)];
    return low | (high << 8);
  }

  // CHECK value of node i.
  id_type check(id_type i) const {
    const id_type stored = access_(i * 2 + 1);
    return stored ^ i;
  }

  // Whether node i is flagged as a leaf.
  bool is_leaf(id_type i) const { return leaf_flags_[i]; }

  // A node counts as used when its CHECK differs from its own index.
  bool is_used(id_type i) const { return check(i) != i; }

  // Total number of nodes: two first-level slots per node.
  size_t num_nodes() const { return values_[0].size() / 2; }

  size_t num_used_nodes() const { return num_nodes() - num_free_nodes_; }

  size_t num_free_nodes() const { return num_free_nodes_; }

  size_t size_in_bytes() const;

  void show_stat(std::ostream &os) const;

  void write(std::ostream &os) const;

  // Exchanges contents with rhs through std::swap.
  void swap(DacBc& rhs) { std::swap(*this, rhs); }

 private:
  Vector<uint8_t> values_[sizeof(id_type)] {};
  BitVector flags_[sizeof(id_type) - 1] {};
  BitVector leaf_flags_ {};
  FitVector links_ {};
  uint8_t max_level_ {};
  size_t num_free_nodes_ {};

  // Decodes the value stored at slot i.
  id_type access_(id_type i) const;
};
} //namespace - xcdat
#endif //XCDAT_DAC_BC_HPP_

View file

@ -1,93 +0,0 @@
#ifndef XCDAT_FAST_DAC_BC_HPP_
#define XCDAT_FAST_DAC_BC_HPP_
#include <tuple>
#include "BitVector.hpp"
#include "FitVector.hpp"
#include "Vector.hpp"
namespace xcdat {
// BASE/CHECK representation using pointer-based byte-oriented DACs. For node
// i, the BASE value is stored at slot 2*i and the CHECK value at slot 2*i+1;
// both are stored XOR-ed with i. The number of layers depends on XCDAT_X64.
// Stream/bulk constructors, size_in_bytes(), show_stat(), write() and
// access_() are defined out of line.
class FastDacBc {
 public:
  static constexpr id_type WIDTH_L1 = 7;
#ifdef XCDAT_X64
  static constexpr uint8_t LAYERS = 4;
#else
  static constexpr uint8_t LAYERS = 3;
#endif

  static constexpr id_type BLOCK_SIZE_L1 = 1U << 7;
  static constexpr id_type BLOCK_SIZE_L2 = 1U << 15;
#ifdef XCDAT_X64
  static constexpr id_type BLOCK_SIZE_L3 = 1U << 31;
#endif

  FastDacBc() = default;
  ~FastDacBc() = default;

  // Reads the structure from a stream.
  explicit FastDacBc(std::istream& is);
  // Builds the structure from BASE/CHECK pairs and per-node leaf flags.
  explicit FastDacBc(const std::vector<BcPair>& bc,
                     BitVectorBuilder& leaf_flags);

  // Move-only type.
  FastDacBc(const FastDacBc&) = delete;
  FastDacBc& operator=(const FastDacBc&) = delete;
  FastDacBc(FastDacBc&&) noexcept = default;
  FastDacBc& operator=(FastDacBc&&) noexcept = default;

  // BASE value of node i.
  id_type base(id_type i) const {
    const id_type stored = access_(i * 2);
    return stored ^ i;
  }

  // Link value of node i: the low byte comes from the first-level array, the
  // upper bits from links_, indexed by the rank of i among the leaf flags.
  id_type link(id_type i) const {
    const id_type low = values_L1_[i * 2];
    const id_type high = links_[leaf_flags_.rank(i)];
    return low | (high << 8);
  }

  // CHECK value of node i.
  id_type check(id_type i) const {
    const id_type stored = access_(i * 2 + 1);
    return stored ^ i;
  }

  // Whether node i is flagged as a leaf.
  bool is_leaf(id_type i) const { return leaf_flags_[i]; }

  // A node counts as used when its CHECK differs from its own index.
  bool is_used(id_type i) const { return check(i) != i; }

  // Total number of nodes: two first-level slots per node.
  size_t num_nodes() const { return values_L1_.size() / 2; }

  size_t num_used_nodes() const { return num_nodes() - num_free_nodes_; }

  size_t num_free_nodes() const { return num_free_nodes_; }

  size_t size_in_bytes() const;

  void show_stat(std::ostream& os) const;

  void write(std::ostream& os) const;

  // Exchanges contents with rhs through std::swap.
  void swap(FastDacBc& rhs) { std::swap(*this, rhs); }

 private:
  Vector <uint8_t> values_L1_{};
  Vector <uint16_t> values_L2_{};
  Vector <uint32_t> values_L3_{};
#ifdef XCDAT_X64
  Vector<uint64_t> values_L4_ {};
#endif
  Vector <id_type> ranks_[LAYERS - 1]{};
  BitVector leaf_flags_{};
  FitVector links_{};
  size_t num_free_nodes_{};

  // Decodes the value stored at slot i.
  id_type access_(id_type i) const;
};
} //namespace - xcdat
#endif //XCDAT_FAST_DAC_BC_HPP_

View file

@ -1,56 +0,0 @@
#ifndef XCDAT_SMALL_VECTOR_HPP_
#define XCDAT_SMALL_VECTOR_HPP_
#include "Vector.hpp"
namespace xcdat {
// Compacted integer vector: each element occupies width_ bits, packed back to
// back into id_type-sized chunks. The constructors, size_in_bytes() and
// write() are defined out of line.
class FitVector {
 public:
  static constexpr id_type CHUNK_WIDTH = sizeof(id_type) * 8;

  FitVector() = default;
  ~FitVector() = default;

  // Reads the vector from a stream.
  explicit FitVector(std::istream &is);
  // Packs the given values.
  explicit FitVector(const std::vector<id_type>& values);

  // Move-only type.
  FitVector(const FitVector&) = delete;
  FitVector& operator=(const FitVector&) = delete;
  FitVector(FitVector&&) noexcept = default;
  FitVector& operator=(FitVector&&) noexcept = default;

  // Returns the i-th element. No bounds check is performed.
  id_type operator[](size_t i) const {
    const auto bit_pos = i * width_;
    const auto chunk_pos = static_cast<id_type>(bit_pos / CHUNK_WIDTH);
    const auto offset = static_cast<id_type>(bit_pos % CHUNK_WIDTH);

    id_type bits = chunks_[chunk_pos] >> offset;
    if (offset + width_ > CHUNK_WIDTH) {
      // The element straddles two chunks: pull the remaining high bits from
      // the following chunk.
      bits |= chunks_[chunk_pos + 1] << (CHUNK_WIDTH - offset);
    }
    return bits & mask_;
  }

  // The number of elements.
  size_t size() const { return size_; }

  size_t size_in_bytes() const;

  void write(std::ostream &os) const;

  // Exchanges contents with rhs through std::swap.
  void swap(FitVector& rhs) { std::swap(*this, rhs); }

 private:
  Vector<id_type> chunks_ {};
  size_t size_ {};
  id_type width_ {};
  id_type mask_ {};
};
} //namespace - xcdat
#endif //XCDAT_SMALL_VECTOR_HPP_

View file

@ -1,514 +0,0 @@
#ifndef XCDAT_TRIE_HPP_
#define XCDAT_TRIE_HPP_
#include <string_view>
#include <xcdat/Trie.hpp>
#include "Trie.hpp"
#include "DacBc.hpp"
#include "FastDacBc.hpp"
namespace xcdat {
// Compressed string dictionary using an improved double-array trie. There are
// two versions of DACs to represent BASE/CHECK arrays in small space. The
// versions can be chosen using the Fast parameter.
template<bool Fast>
class Trie {
public:
using trie_type = Trie<Fast>;
using bc_type = typename std::conditional<Fast, FastDacBc, DacBc>::type;
static constexpr auto NOT_FOUND = ID_MAX;
// Generic constructor.
Trie() = default;
// Reads the dictionary from an std::istream.
explicit Trie(std::istream& is) {
bc_ = bc_type(is);
terminal_flags_ = BitVector(is);
tail_ = Vector<char>(is);
boundary_flags_ = BitVector(is);
alphabet_ = Vector<uint8_t>(is);
is.read(reinterpret_cast<char*>(table_), 512);
num_keys_ = read_value<size_t>(is);
max_length_ = read_value<size_t>(is);
bin_mode_ = read_value<bool>(is);
}
// Generic destructor.
~Trie() = default;
// Looks up the ID of a given key. Returns NOT_FOUND if the key is not
// registered.
id_type lookup(std::string_view key) const {
  id_type node_id = 0;
  size_t pos = 0;

  // Walk down the trie one character at a time until a leaf is reached.
  for (; !bc_.is_leaf(node_id); ++pos) {
    if (pos == key.length()) {
      // The key ends at an inner node: it is registered only if that node is
      // flagged as terminal.
      if (!terminal_flags_[node_id]) {
        return NOT_FOUND;
      }
      return to_key_id_(node_id);
    }
    const auto child_id = bc_.base(node_id) ^ code_(key[pos]);
    if (bc_.check(child_id) != node_id) {
      return NOT_FOUND;
    }
    node_id = child_id;
  }

  // At a leaf the remaining characters must match the suffix kept in the tail.
  const size_t tail_pos = bc_.link(node_id);
  return match_suffix_(key, pos, tail_pos) ? to_key_id_(node_id) : NOT_FOUND;
}
// Decodes the key associated with a given ID. Returns an empty string when
// the ID is not smaller than the number of registered keys.
std::string access(id_type id) const {
  if (id >= num_keys_) {
    return {};
  }

  std::string dec;
  dec.reserve(max_length_);

  auto node_id = to_node_id_(id);
  auto tail_pos = bc_.is_leaf(node_id) ? bc_.link(node_id) : NOT_FOUND;

  // Collect edge labels from the node up to the root (node 0), then flip them
  // into reading order.
  while (node_id != 0) {
    const auto parent_id = bc_.check(node_id);
    dec += edge_(parent_id, node_id);
    node_id = parent_id;
  }
  std::reverse(std::begin(dec), std::end(dec));

  // No suffix to append for inner nodes or for a link value of zero.
  if (tail_pos == 0 || tail_pos == NOT_FOUND) {
    return dec;
  }

  if (bin_mode_) {
    // Binary mode: the suffix ends at the position whose boundary flag is set.
    for (;;) {
      dec += tail_[tail_pos];
      if (boundary_flags_[tail_pos++]) {
        break;
      }
    }
  } else {
    // Text mode: the suffix ends before the next zero byte.
    do {
      dec += tail_[tail_pos];
      ++tail_pos;
    } while (tail_[tail_pos] != '\0');
  }
  return dec;
}
// Iterator for enumerating the keys and IDs included as prefixes of a given
// key, that is, supporting so-called common prefix lookup. It is created by
// using make_prefix_iterator().
//
// The iterator keeps a std::string_view of the query key, not a copy of its
// characters, so the storage behind the key must outlive the iterator.
class PrefixIterator {
 public:
  // Creates an unbound iterator; next() on it always returns false.
  PrefixIterator() = default;

  // Scans the next key. If it does not exist, returns false.
  bool next() {
    return trie_ != nullptr && trie_->next_prefix_(this);
  }

  // Gets the key: the first pos_ characters of the query key.
  std::string_view key() const {
    return {key_.data(), pos_};
  }

  // Gets the ID.
  id_type id() const {
    return id_;
  }

 private:
  const trie_type* trie_{};
  // Deliberately not const: a const data member deletes the assignment
  // operators, which made it impossible to assign the result of
  // make_prefix_iterator() to a default-constructed iterator.
  std::string_view key_{};
  size_t pos_{0};
  id_type node_id_{0};
  id_type id_{};
  bool begin_flag_{true};
  bool end_flag_{false};

  PrefixIterator(const trie_type* trie, std::string_view key)
      : trie_{trie}, key_{key} {}

  friend class Trie;
};
// Makes PrefixIterator from a given key, bound to this dictionary.
// The iterator stores the std::string_view it is given (not a copy of the
// characters), so the key's storage must stay alive while the iterator is
// used.
PrefixIterator make_prefix_iterator(std::string_view key) const {
return PrefixIterator{this, key};
}
// Iterator class for enumerating the keys and IDs starting with prefixes of
// a given key, that is, supporting so-called predictive lookup. It is in
// lexicographical order. It is created by using make_predictive_iterator().
//
// The iterator keeps a std::string_view of the query key, not a copy of its
// characters, so the storage behind the key must outlive the iterator.
class PredictiveIterator {
 public:
  // Creates an unbound iterator; next() on it always returns false.
  PredictiveIterator() = default;

  // Scans the next key. If it does not exist, returns false.
  bool next() {
    return trie_ != nullptr && trie_->next_predictive_(this);
  }

  // Gets the key. The view points into the iterator's internal buffer.
  std::string_view key() const {
    return {buf_.data(), buf_.size()};
  }

  // Gets the ID.
  id_type id() const {
    return id_;
  }

 private:
  const trie_type* trie_{};
  // Deliberately not const: a const data member deletes the assignment
  // operators, which made it impossible to assign the result of
  // make_predictive_iterator() to a default-constructed iterator.
  std::string_view key_{};
  bool begin_flag_{true};
  bool end_flag_{false};

  // One pending entry of the traversal stack.
  struct stack_t {
    size_t depth;
    char c;
    id_type node_id;
  };

  std::vector<stack_t> stack_{};
  std::string buf_{};
  id_type id_{};

  PredictiveIterator(const trie_type* trie, std::string_view key)
      : trie_{trie}, key_{key} {
    // Pre-size the decode buffer for the longest key recorded in the trie.
    buf_.reserve(trie->max_length_);
  }

  friend class Trie;
};
// Makes PredictiveIterator from a given key, bound to this dictionary.
// The iterator stores the std::string_view it is given (not a copy of the
// characters), so the key's storage must stay alive while the iterator is
// used.
PredictiveIterator make_predictive_iterator(std::string_view key) const {
return {this, key};
}
// Gets the number of registered keys in the dictionary.
size_t num_keys() const {
return num_keys_;
}
// Gets whether the dictionary is in binary mode. In binary mode, suffix ends
// in the tail are marked by boundary flags; otherwise a zero byte ends them.
bool bin_mode() const {
return bin_mode_;
}
// Gets the size of the alphabet, i.e. the number of entries in alphabet_.
size_t alphabet_size() const {
return alphabet_.size();
}
// Gets the number of nodes including free nodes (forwarded to the BASE/CHECK
// structure).
size_t num_nodes() const {
return bc_.num_nodes();
}
// Gets the number of used nodes, i.e. nodes of the original trie (forwarded
// to the BASE/CHECK structure).
size_t num_used_nodes() const {
return bc_.num_used_nodes();
}
// Gets the number of free nodes corresponding to empty elements (forwarded
// to the BASE/CHECK structure).
size_t num_free_nodes() const {
return bc_.num_free_nodes();
}
// Computes the output dictionary size in bytes: the fixed-size members plus
// the size reported by each component.
size_t size_in_bytes() const {
  size_t total = sizeof(table_);
  total += sizeof(num_keys_);
  total += sizeof(max_length_);
  total += sizeof(bin_mode_);

  total += bc_.size_in_bytes();
  total += terminal_flags_.size_in_bytes();
  total += tail_.size_in_bytes();
  total += boundary_flags_.size_in_bytes();
  total += alphabet_.size_in_bytes();
  return total;
}
// Reports the dictionary statistics into an ostream.
void show_stat(std::ostream& os) const {
const auto total_size = size_in_bytes();
os << "basic statistics of xcdat::Trie<"
<< (Fast ? "true" : "false") << ">" << std::endl;
show_size("\tnum keys: ", num_keys(), os);
show_size("\talphabet size: ", alphabet_size(), os);
show_size("\tnum nodes: ", num_nodes(), os);
show_size("\tnum used nodes:", num_used_nodes(), os);
show_size("\tnum free nodes:", num_free_nodes(), os);
show_size("\tsize in bytes: ", size_in_bytes(), os);
os << "member size statistics of xcdat::Trie<"
<< (Fast ? "true" : "false") << ">" << std::endl;
show_size_ratio("\tbc: ", bc_.size_in_bytes(), total_size, os);
show_size_ratio("\tterminal_flags:", terminal_flags_.size_in_bytes(),
total_size, os);
show_size_ratio("\ttail: ", tail_.size_in_bytes(), total_size, os);
show_size_ratio("\tboundary_flags:", boundary_flags_.size_in_bytes(),
total_size, os);
bc_.show_stat(os);
}
// Writes the dictionary into an ostream. The members are emitted in the same
// order in which the stream constructor reads them back.
void write(std::ostream& os) const {
  bc_.write(os);
  terminal_flags_.write(os);
  tail_.write(os);
  boundary_flags_.write(os);
  alphabet_.write(os);

  // The code table is written as a raw block of sizeof(table_) == 512 bytes.
  os.write(reinterpret_cast<const char*>(table_), sizeof(table_));

  write_value(num_keys_, os);
  write_value(max_length_, os);
  write_value(bin_mode_, os);
}
// Exchanges the contents of this dictionary with rhs. Implemented with
// std::swap, which relies on move construction and move assignment (copying
// is deleted for this class).
void swap(Trie& rhs) {
std::swap(*this, rhs);
}
Trie(const Trie&) = delete;
Trie& operator=(const Trie&) = delete;
Trie(Trie&&) noexcept = default;
Trie& operator=(Trie&&) noexcept = default;
private:
bc_type bc_{};
BitVector terminal_flags_{};
Vector<char> tail_{};
BitVector boundary_flags_{}; // used if binary_mode_ == true
Vector<uint8_t> alphabet_{};
uint8_t table_[512]{}; // table[table[c] + 256] = c
size_t num_keys_{};
size_t max_length_{};
bool bin_mode_{};
// Maps a node ID to its key ID: the rank of the node among the terminal
// flags.
id_type to_key_id_(id_type node_id) const {
return terminal_flags_.rank(node_id);
};
// Maps a key ID back to its node ID via select on the terminal flags (the
// inverse direction of to_key_id_).
id_type to_node_id_(id_type string_id) const {
return terminal_flags_.select(string_id);
};
// Translates a character into its code using the first 256 entries of
// table_. The cast to uint8_t keeps the index non-negative when char is
// signed.
id_type code_(char c) const {
return table_[static_cast<uint8_t>(c)];
}
// Recovers the character labelling the edge node_id -> child_id: the code is
// BASE(node_id) XOR child_id, and the second half of table_ (offset 256)
// maps codes back to characters.
char edge_(id_type node_id, id_type child_id) const {
return static_cast<char>(table_[(bc_.base(node_id) ^ child_id) + 256]);
}
// Checks whether key[pos..] equals the suffix stored in the tail starting at
// tail_pos. Requires pos <= key.length() (asserted).
// Returns true only when the rest of the key and the stored suffix end
// together.
bool match_suffix_(std::string_view key, size_t pos, size_t tail_pos) const {
assert(pos <= key.length());
// Nothing left in the key: it matches only if the tail position is 0.
if (pos == key.length()) {
return tail_pos == 0;
}
if (bin_mode_) {
// Binary mode: the suffix ends at the position whose boundary flag is set.
do {
if (key[pos] != tail_[tail_pos]) {
return false;
}
++pos;
if (boundary_flags_[tail_pos]) {
// Last suffix character matched; succeed only if the key ends here too.
return pos == key.length();
}
++tail_pos;
} while (pos < key.length());
// The key ran out before the suffix boundary was reached.
return false;
} else {
// Text mode: a zero byte in the tail ends the stored suffix.
do {
if (!tail_[tail_pos] || key[pos] != tail_[tail_pos]) {
return false;
}
++pos;
++tail_pos;
} while (pos < key.length());
// The key is exhausted; it matches only if the suffix ends here as well.
return !tail_[tail_pos];
}
}
// Appends to dec the suffix stored in the tail starting at tail_pos.
void extract_suffix_(size_t tail_pos, std::string& dec) const {
  if (!bin_mode_) {
    // Text mode: copy characters up to (not including) the next zero byte.
    for (; tail_[tail_pos] != '\0'; ++tail_pos) {
      dec += tail_[tail_pos];
    }
    return;
  }

  // Binary mode: nothing is appended for tail position 0.
  if (tail_pos == 0) {
    return;
  }
  // Copy characters up to and including the one whose boundary flag is set.
  for (;;) {
    dec += tail_[tail_pos];
    const bool is_last = boundary_flags_[tail_pos];
    ++tail_pos;
    if (is_last) {
      break;
    }
  }
}
// Advances a common-prefix search by one result.
//
// Walks the trie along it->key_ from the iterator's current node and stops at
// the next node that terminates a registered key. On success, it->id_ holds
// the key ID and true is returned; it->pos_ is the number of query characters
// consumed so far. When no further prefix exists, it->end_flag_ is set and
// false is returned.
bool next_prefix_(PrefixIterator* it) const {
    if (it->end_flag_) {
        return false;
    }

    // First call: the starting node itself may already be a registered key.
    if (it->begin_flag_) {
        it->begin_flag_ = false;
        if (terminal_flags_[it->node_id_]) {
            it->id_ = to_key_id_(it->node_id_);
            return true;
        }
    }

    while (!bc_.is_leaf(it->node_id_)) {
        // The query is exhausted but the current node is not a leaf: there is
        // no further prefix to report. Without this guard the next character
        // would be read past the end of the key.
        if (it->pos_ >= it->key_.length()) {
            it->end_flag_ = true;
            it->id_ = NOT_FOUND;
            return false;
        }

        const id_type child_id = bc_.base(it->node_id_) ^ code_(it->key_[it->pos_++]);
        if (bc_.check(child_id) != it->node_id_) {
            // No transition for this character.
            it->end_flag_ = true;
            it->id_ = NOT_FOUND;
            return false;
        }

        it->node_id_ = child_id;
        // Leaves are handled after the loop because their suffix lives in tail_.
        if (!bc_.is_leaf(it->node_id_) && terminal_flags_[it->node_id_]) {
            it->id_ = to_key_id_(it->node_id_);
            return true;
        }
    }

    // Reached a leaf: this is the last possible result, and it only counts if
    // the rest of the query equals the suffix stored in the tail.
    it->end_flag_ = true;
    const size_t tail_pos = bc_.link(it->node_id_);
    if (!match_suffix_(it->key_, it->pos_, tail_pos)) {
        it->id_ = NOT_FOUND;
        return false;
    }

    it->pos_ = it->key_.length();
    it->id_ = to_key_id_(it->node_id_);
    return true;
}
// Advances a predictive (prefix-completion) search by one result.
//
// On the first call the query it->key_ is walked from the root (node 0); the
// node reached is pushed on it->stack_ and the consumed characters are kept in
// it->buf_. Every call then pops nodes off the stack depth-first, rebuilding
// it->buf_ for the popped node, and returns true with it->id_ set as soon as a
// registered key is found. Returns false and sets it->end_flag_ when the
// search is exhausted.
bool next_predictive_(PredictiveIterator* it) const {
if (it->end_flag_) {
return false;
}
if (it->begin_flag_) {
it->begin_flag_ = false;
id_type node_id = 0;
size_t pos = 0;
// Follow the query characters down the trie.
for (; pos < it->key_.length(); ++pos) {
if (bc_.is_leaf(node_id)) {
// A leaf was hit before the query ended: at most one key can match, and the
// remaining query characters must agree with the suffix stored in tail_.
it->end_flag_ = true;
size_t tail_pos = bc_.link(node_id);
if (tail_pos == 0) {
return false;
}
if (bin_mode_) {
// Binary mode: the stored suffix ends at the flagged boundary position.
do {
if (it->key_[pos] != tail_[tail_pos]) {
return false;
}
it->buf_ += it->key_[pos++];
if (boundary_flags_[tail_pos]) {
// End of the stored suffix: it is a hit only if the query ends here too.
if (pos == it->key_.length()) {
it->id_ = to_key_id_(node_id);
return true;
}
return false;
}
++tail_pos;
} while (pos < it->key_.length());
} else {
// Text mode: the stored suffix ends at a '\0' byte.
do {
if (it->key_[pos] != tail_[tail_pos] || !tail_[tail_pos]) {
return false;
}
it->buf_ += it->key_[pos++];
++tail_pos;
} while (pos < it->key_.length());
}
// The whole query matched a prefix of the stored suffix; append the rest.
it->id_ = to_key_id_(node_id);
extract_suffix_(tail_pos, it->buf_);
return true;
}
id_type child_id = bc_.base(node_id) ^code_(it->key_[pos]);
if (bc_.check(child_id) != node_id) {
// No transition for this character: nothing starts with the query.
it->end_flag_ = true;
return false;
}
node_id = child_id;
it->buf_ += it->key_[pos];
}
// Seed the stack with the node reached by the query. Each entry carries its
// depth, the last character of its path, and its node ID.
if (!it->buf_.empty()) {
it->stack_.push_back({pos, it->buf_.back(), node_id});
} else {
it->stack_.push_back({pos, '\0', node_id});
}
}
// Depth-first enumeration of the subtree below the query node.
while (!it->stack_.empty()) {
id_type node_id = it->stack_.back().node_id;
size_t depth = it->stack_.back().depth;
uint8_t c = it->stack_.back().c;
it->stack_.pop_back();
// Restore buf_ to the path of the popped node.
if (0 < depth) {
it->buf_.resize(depth);
it->buf_.back() = c;
}
if (bc_.is_leaf(node_id)) {
it->id_ = to_key_id_(node_id);
extract_suffix_(bc_.link(node_id), it->buf_);
return true;
}
const id_type base = bc_.base(node_id);
// For lex sort
// (children are pushed in reverse alphabet_ order so that they are popped in
// forward alphabet_ order)
for (auto rit = std::rbegin(alphabet_);
rit != std::rend(alphabet_); ++rit) {
const id_type child_id = base ^code_(*rit);
if (bc_.check(child_id) == node_id) {
it->stack_.push_back(
{depth + 1, static_cast<char>(*rit), child_id}
);
}
}
// An internal node can itself be a registered key; report it after its
// children have been queued.
if (terminal_flags_[node_id]) {
it->id_ = to_key_id_(node_id);
return true;
}
}
it->end_flag_ = true;
return false;
}
friend class TrieBuilder;
};
} //namespace - xcdat
#endif //XCDAT_TRIE_HPP_

View file

@ -1,117 +0,0 @@
#ifndef XCDAT_TRIE_BUILDER_HPP_
#define XCDAT_TRIE_BUILDER_HPP_
#include "Trie.hpp"
namespace xcdat {
// Double-array trie builder.
// Instances are only created inside build(); the constructor and destructor
// are private and the class is non-copyable.
class TrieBuilder {
public:
// Builds the dictionary from given string keys. The keys must be sorted in
// lexicographical order without duplication. Any error in construction is
// reported by TrieBuilder::Exception. If the keys include the ASCII zero
// code, pass bin_mode = true.
// The builder's intermediate arrays are handed over to the returned Trie.
template<bool Fast>
static Trie<Fast>
build(const std::vector<std::string_view>& keys, bool bin_mode = false) {
TrieBuilder builder(keys, Trie<Fast>::bc_type::WIDTH_L1, bin_mode);
Trie<Fast> trie;
trie.bc_ = typename Trie<Fast>::bc_type(builder.bc_, builder.leaf_flags_);
// Terminal flags are built with both boolean options enabled, boundary flags
// with both disabled (presumably rank/select support -- confirm against BitVector).
trie.terminal_flags_ = BitVector(builder.term_flags_, true, true);
trie.tail_ = Vector<char>(builder.tail_);
trie.boundary_flags_ = BitVector(builder.boundary_flags_, false, false);
trie.alphabet_ = Vector<uint8_t>(builder.alphabet_);
std::swap(trie.table_, builder.table_);
trie.num_keys_ = keys.size();
trie.max_length_ = builder.max_length_;
trie.bin_mode_ = builder.bin_mode_;
return trie;
}
// Exception class for xcdat::TrieBuilder
class Exception : public std::exception {
public:
explicit Exception(std::string message) : message_(std::move(message)) {}
~Exception() throw() override {};
// overrides what() of std::exception.
const char* what() const throw() override {
return message_.c_str();
}
private:
std::string message_;
};
TrieBuilder(const TrieBuilder&) = delete;
TrieBuilder& operator=(const TrieBuilder&) = delete;
private:
// A key suffix viewed back to front, together with the node it belongs to.
struct Suffix {
std::string_view str;
id_type node_id;
size_t length() const {
return str.length();
}
// Indexes from the END of the string: [0] is the last character.
char operator[](size_t i) const {
return str[length() - i - 1];
}
std::reverse_iterator<const char*> rbegin() const {
return std::make_reverse_iterator(str.data() + str.length());
}
std::reverse_iterator<const char*> rend() const {
return std::make_reverse_iterator(str.data());
}
};
// To avoid undefined traversal
static constexpr id_type TABOO_ID = 1;
// From darts-clone setting
static constexpr id_type FREE_BLOCKS = 16;
// Input keys; held by reference, so they must outlive the builder.
const std::vector<std::string_view>& keys_;
const id_type block_size_;
const id_type width_L1_;
bool bin_mode_{};
// Intermediate results that build() hands over to the Trie.
std::vector<BcPair> bc_{};
BitVectorBuilder leaf_flags_{};
BitVectorBuilder term_flags_{};
std::vector<char> tail_{};
BitVectorBuilder boundary_flags_{};
std::vector<uint8_t> alphabet_{};
uint8_t table_[512]{};
// Working state used only during construction.
std::vector<bool> used_flags_{};
std::vector<uint8_t> edges_{};
std::vector<id_type> heads_{};
std::vector<Suffix> suffixes_{};
size_t max_length_{};
TrieBuilder(const std::vector<std::string_view>& keys,
id_type width_L1, bool bin_mode);
~TrieBuilder() = default;
void build_table_();
void build_bc_(size_t begin, size_t end, size_t depth, id_type node_id);
void build_tail_();
void expand_();
void use_(id_type node_id);
void close_block_(id_type block_id);
id_type find_base_(id_type block_id) const;
bool is_target_(id_type base) const;
};
} //namespace - xcdat
#endif //XCDAT_TRIE_BUILDER_HPP_

View file

@ -1,91 +0,0 @@
#ifndef XCDAT_VECTOR_HPP
#define XCDAT_VECTOR_HPP
#include "xcdat_basics.hpp"
namespace xcdat {
// Simple read-only vector of a POD type that can be written to and read from
// a stream as raw bytes (element count followed by the element data).
template<typename T>
class Vector {
public:
    static_assert(!std::is_same<T, bool>::value, "Type bool is not supported.");
    static_assert(std::is_pod<T>::value, "T is not POD.");

    Vector() = default;

    // Reads a vector previously produced by write(): a size_t element count
    // followed by that many raw elements.
    explicit Vector(std::istream& is) {
        size_ = read_value<size_t>(is);
        vec_.resize(size_);
        // Use data() rather than &vec_[0]: indexing an empty vector is
        // undefined, and a serialized vector may legitimately be empty.
        is.read(reinterpret_cast<char*>(vec_.data()), sizeof(T) * size_);
        data_ = vec_.data();
    }

    // Takes over the contents of `vec` (which is left moved-from), shrinking
    // its capacity first if needed.
    explicit Vector(std::vector<T>& vec) {
        if (vec.size() != vec.capacity()) {
            vec.shrink_to_fit();
        }
        vec_ = std::move(vec);
        data_ = vec_.data();
        size_ = vec_.size();
    }

    ~Vector() = default;

    // Unchecked element access.
    const T& operator[](size_t i) const {
        return data_[i];
    }

    const T* data() const {
        return data_;
    }

    const T* begin() const {
        return data_;
    }

    const T* end() const {
        return data_ + size_;
    }

    std::reverse_iterator<const T*> rbegin() const {
        return std::make_reverse_iterator(end());
    }

    std::reverse_iterator<const T*> rend() const {
        return std::make_reverse_iterator(begin());
    }

    bool is_empty() const {
        return size_ == 0;
    }

    size_t size() const {
        return size_;
    }

    // Number of bytes write() emits: the elements plus the size field.
    size_t size_in_bytes() const {
        return size_ * sizeof(T) + sizeof(size_);
    }

    // Writes the element count followed by the raw element data.
    void write(std::ostream& os) const {
        write_value(size_, os);
        os.write(reinterpret_cast<const char*>(data_), sizeof(T) * size_);
    }

    void swap(Vector& rhs) {
        std::swap(*this, rhs);
    }

    Vector(const Vector&) = delete;
    Vector& operator=(const Vector&) = delete;

    // The defaulted moves copy data_/size_ and move vec_; a moved-from Vector
    // therefore still carries the old data_/size_ values and must not be read.
    Vector(Vector&&) noexcept = default;
    Vector& operator=(Vector&&) noexcept = default;

private:
    const T* data_ {};
    size_t size_ {};
    std::vector<T> vec_ {};
};
}
#endif //XCDAT_VECTOR_HPP

114
include/xcdat/bit_tools.hpp Normal file
View file

@ -0,0 +1,114 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#include <immintrin.h>
#include <nmmintrin.h>
// From https://github.com/ot/succinct
namespace xcdat::bit_tools {
// Broadword masks used by the rank/select helpers below.
// ones_step_4: lowest bit of every 4-bit nibble set.
static constexpr std::uint64_t ones_step_4 = 0x1111111111111111ULL;
// ones_step_8: lowest bit of every byte set.
static constexpr std::uint64_t ones_step_8 = 0x0101010101010101ULL;
// ones_step_9: lowest bit of each of seven consecutive 9-bit fields set.
static constexpr std::uint64_t ones_step_9 = 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 |  //
1ULL << 36 | 1ULL << 45 | 1ULL << 54;
// msbs_step_8: highest bit of every byte set.
static constexpr std::uint64_t msbs_step_8 = 0x80ULL * ones_step_8;
// msbs_step_9: highest bit (bit 8) of each 9-bit field set.
static constexpr std::uint64_t msbs_step_9 = 0x100ULL * ones_step_9;
// Returns the number of set bits in x. Uses the compiler builtin when SSE4.2
// is enabled, otherwise a portable bit-parallel count.
inline std::uint64_t popcount(std::uint64_t x) {
#ifdef __SSE4_2__
    const int ones = __builtin_popcountll(x);
    return static_cast<std::uint64_t>(ones);
#else
    constexpr std::uint64_t every_other_bit = 0x5555555555555555ULL;
    constexpr std::uint64_t low_bit_pairs = 0x3333333333333333ULL;
    constexpr std::uint64_t low_nibbles = 0x0F0F0F0F0F0F0F0FULL;
    constexpr std::uint64_t byte_ones = 0x0101010101010101ULL;

    x -= (x >> 1) & every_other_bit;                         // 2-bit counts
    x = (x & low_bit_pairs) + ((x >> 2) & low_bit_pairs);    // 4-bit counts
    x = (x + (x >> 4)) & low_nibbles;                        // per-byte counts
    return (byte_ones * x) >> 56;                            // sum of all bytes
#endif
}
// Returns the 0-based index of the highest set bit of x, or 0 when x is 0.
inline std::uint64_t msb(std::uint64_t x) {
    if (x == 0) {
        return 0;
    }
    const int leading_zeros = __builtin_clzll(x);
    return 63 - leading_zeros;
}
// Field-wise comparison of x and y viewed as packed 9-bit fields. The top bit
// of each field of the intermediate result is moved down to the field's lowest
// bit. (Going by the name, this is an unsigned "x <= y" per field -- see the
// succinct library this was taken from.)
inline std::uint64_t uleq_step_9(std::uint64_t x, std::uint64_t y) {
    const std::uint64_t diff = (y | msbs_step_9) - (x & ~msbs_step_9);
    const std::uint64_t merged = (diff | (x ^ y)) ^ (x & ~y);
    return (merged & msbs_step_9) >> 8;
}
// Returns a word whose every byte holds the number of set bits in the
// corresponding byte of x.
inline std::uint64_t byte_counts(std::uint64_t x) {
    const std::uint64_t odd_bits = 0xa * ones_step_4;
    const std::uint64_t low_bit_pairs = 3 * ones_step_4;
    const std::uint64_t low_nibbles = 0x0f * ones_step_8;

    x -= (x & odd_bits) >> 1;                              // 2-bit counts
    x = (x & low_bit_pairs) + ((x >> 2) & low_bit_pairs);  // 4-bit counts
    x = (x + (x >> 4)) & low_nibbles;                      // per-byte counts
    return x;
}
// Lookup table for select within a single byte, laid out as 8 rows of 256
// entries. It is indexed as select_in_byte[byte | (rank << 8)] (see
// select_in_word); the entry is the 0-based position of the rank-th set bit
// of `byte`, and 8 where the byte has too few set bits.
static constexpr std::uint8_t select_in_byte[2048] = {
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1,
0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,
1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,
0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1,
0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0,
1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8,
4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1,
4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2,
1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2,
2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1,
4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3,
1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8,
8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8,
6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2,
6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4,
2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3,
3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6,
5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8,
5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6,
6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8,
8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5,
8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5,
5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8,
6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8,
8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8,
7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6,
8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
inline std::uint64_t select_in_word(const std::uint64_t x, const std::uint64_t k) {
#ifdef __BMI2__
return _tzcnt_u64(_pdep_u64(1ULL << k, x));
#else
const std::uint64_t byte_sums = byte_counts(x) * ones_step_8;
const std::uint64_t k_step_8 = k * ones_step_8;
const std::uint64_t geq_k_step_8 = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8);
const std::uint64_t place = popcount(geq_k_step_8) * 8;
const std::uint64_t byte_rank = k - (((byte_sums << 8) >> place) & 0xFFULL);
return place + select_in_byte[((x >> place) & 0xFF) | (byte_rank << 8)];
#endif
}
} // namespace xcdat::bit_tools

View file

@ -0,0 +1,250 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <numeric>
#include "mm_vector.hpp"
#include "utils.hpp"
namespace xcdat {
//! Rank9 implementation: an immutable bit vector with optional rank and
//! select support, built from a bit_vector::builder.
class bit_vector {
  public:
    //! Mutable bit buffer from which a bit_vector is built.
    class builder {
      private:
        std::vector<std::uint64_t> m_bits;
        std::uint64_t m_size = 0;

      public:
        builder() = default;

        //! Creates a builder holding `size` zero bits.
        builder(std::uint64_t size) {
            resize(size);
        }

        //! Appends one bit, growing the word array when a new word starts.
        inline void push_back(bool x) {
            if (m_size % 64 == 0) {
                m_bits.push_back(0);
            }
            if (x) {
                set_bit(m_size, true);
            }
            m_size += 1;
        }

        inline bool operator[](std::uint64_t i) const {
            return m_bits[i / 64] & (1ULL << (i % 64));
        }

        //! Sets (x == true) or clears (x == false) the i-th bit. Unchecked.
        inline void set_bit(std::uint64_t i, bool x = true) {
            if (x) {
                m_bits[i / 64] |= (1ULL << (i % 64));
            } else {
                m_bits[i / 64] &= (~(1ULL << (i % 64)));
            }
        }

        //! Resizes to `size` bits; newly added words are zero-filled.
        inline void resize(std::uint64_t size) {
            m_bits.resize(utils::words_for_bits(size), 0ULL);
            m_size = size;
        }

        inline void reserve(std::uint64_t capacity) {
            m_bits.reserve(utils::words_for_bits(capacity));
        }

        inline std::uint64_t size() const {
            return m_size;
        }

        friend class bit_vector;
    };

    static constexpr std::uint64_t block_size = 8;  // i.e., 64 * 8 bits
    static constexpr std::uint64_t selects_per_hint = 64 * block_size * 2;

  private:
    mm_vector<std::uint64_t> m_bits;
    // Pairs per block: [2 * bi] is the absolute rank before block bi and
    // [2 * bi + 1] packs the 9-bit relative ranks of the words in that block.
    // One extra sentinel pair follows the last block.
    mm_vector<std::uint64_t> m_rank_hints;
    // Block indices that bound the binary search performed by select().
    mm_vector<std::uint64_t> m_select_hints;
    std::uint64_t m_size = 0;
    std::uint64_t m_num_ones = 0;

  public:
    bit_vector() = default;

    //! Builds from `b`, whose bit buffer is taken over.
    bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) {
        build(b, enable_rank, enable_select);
    }

    virtual ~bit_vector() = default;

    //! Releases all storage and returns to the empty state.
    void reset() {
        m_bits.reset();
        m_rank_hints.reset();
        m_select_hints.reset();
        m_size = 0;
        m_num_ones = 0;
    }

    //! Rebuilds this vector from `b`, whose bit buffer is taken over.
    //! rank() needs enable_rank; select() needs both enable_rank and
    //! enable_select (select hints are only built when rank hints are).
    void build(builder& b, bool enable_rank = false, bool enable_select = false) {
        reset();
        m_bits.steal(b.m_bits);
        m_size = b.m_size;
        m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL,
                                     [](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); });
        if (enable_rank) {
            build_rank_hints();
        }
        if (enable_rank && enable_select) {
            build_select_hints();
        }
    }

    inline std::uint64_t size() const {
        return m_size;
    }

    inline std::uint64_t num_ones() const {
        return m_num_ones;
    }

    inline bool operator[](std::uint64_t i) const {
        return m_bits[i / 64] & (1ULL << (i % 64));
    }

    // The number of 1s in B[0..i)
    inline std::uint64_t rank(std::uint64_t i) const {
        if (i == size()) {
            return num_ones();
        }
        const auto [wi, wj] = utils::decompose<64>(i);
        // Shifting left by (64 - wj) keeps only the wj lowest bits of the word;
        // wj == 0 is special-cased because a 64-bit shift by 64 is undefined.
        return rank_for_word(wi) + (wj != 0 ? bit_tools::popcount(m_bits[wi] << (64 - wj)) : 0);
    }

    // The position of the n-th set bit, counting from n = 0
    inline std::uint64_t select(std::uint64_t n) const {
        // 1) Locate the block that contains the n-th one.
        const std::uint64_t bi = select_for_block(n);
        assert(bi < num_blocks());

        std::uint64_t curr_rank = rank_for_block(bi);
        assert(curr_rank <= n);

        // 2) Locate the word inside the block by comparing all packed
        //    sub-ranks against the remaining rank at once.
        std::uint64_t rank_in_block_parallel = (n - curr_rank) * bit_tools::ones_step_9;
        std::uint64_t sub_ranks = ranks_in_block(bi);
        std::uint64_t sub_block_offset =
            bit_tools::uleq_step_9(sub_ranks, rank_in_block_parallel) * bit_tools::ones_step_9 >> 54 & 0x7;
        curr_rank += sub_ranks >> (7 - sub_block_offset) * 9 & 0x1FF;
        assert(curr_rank <= n);

        // 3) Locate the bit inside the word.
        std::uint64_t word_offset = (bi * block_size) + sub_block_offset;
        return word_offset * 64 + bit_tools::select_in_word(m_bits[word_offset], n - curr_rank);
    }

  private:
    // The last pair of m_rank_hints is a sentinel, not a real block.
    inline std::uint64_t num_blocks() const {
        return m_rank_hints.size() / 2 - 1;
    }

    // Absolute rank until the bi-th block
    inline std::uint64_t rank_for_block(std::uint64_t bi) const {
        return m_rank_hints[bi * 2];
    }

    // Packed ranks in the bi-th block
    inline std::uint64_t ranks_in_block(std::uint64_t bi) const {
        return m_rank_hints[bi * 2 + 1];
    }

    // Absolute rank until the wi-th word
    inline std::uint64_t rank_for_word(std::uint64_t wi) const {
        const auto [bi, bj] = utils::decompose<block_size>(wi);
        return rank_for_block(bi) + rank_in_block(bi, bj);
    }

    // Relative rank in the bi-th block
    inline std::uint64_t rank_in_block(std::uint64_t bi, std::uint64_t bj) const {
        return ranks_in_block(bi) >> ((7 - bj) * 9) & 0x1FF;
    }

    // Binary search, within the range given by the select hints, for the last
    // block whose absolute rank is <= n.
    inline std::uint64_t select_for_block(std::uint64_t n) const {
        auto [a, b] = select_with_hint(n);
        while (b - a > 1) {
            const std::uint64_t lb = a + (b - a) / 2;
            if (rank_for_block(lb) <= n) {
                a = lb;
            } else {
                b = lb;
            }
        }
        return a;
    }

    // Block range [a, b) to search for the n-th one.
    inline std::tuple<std::uint64_t, std::uint64_t> select_with_hint(std::uint64_t n) const {
        const std::uint64_t i = n / selects_per_hint;
        return {i != 0 ? m_select_hints[i - 1] : 0, m_select_hints[i] + 1};
    }

    void build_rank_hints() {
        std::uint64_t curr_num_ones = 0;
        std::uint64_t curr_num_ones_in_block = 0;
        std::uint64_t curr_ranks_in_block = 0;

        const std::uint64_t num_words = m_bits.size();
        std::vector<std::uint64_t> rank_hints = {curr_num_ones};

        for (std::uint64_t wi = 0; wi < num_words; wi++) {
            const std::uint64_t bi = wi % block_size;  // Relative position in the block
            const std::uint64_t num_ones_in_word = bit_tools::popcount(m_bits[wi]);

            if (bi != 0) {
                curr_ranks_in_block <<= 9;
                curr_ranks_in_block |= curr_num_ones_in_block;
            }

            curr_num_ones += num_ones_in_word;
            curr_num_ones_in_block += num_ones_in_word;

            if (bi == block_size - 1) {
                rank_hints.push_back(curr_ranks_in_block);
                rank_hints.push_back(curr_num_ones);
                curr_num_ones_in_block = 0;
                curr_ranks_in_block = 0;
            }
        }

        // Padding the remaining hints
        const std::uint64_t remain = block_size - (num_words % block_size);
        for (std::uint64_t wi = 0; wi < remain; wi++) {
            curr_ranks_in_block <<= 9;
            curr_ranks_in_block |= curr_num_ones_in_block;
        }
        rank_hints.push_back(curr_ranks_in_block);

        // Sentinel: the absolute rank after the final (partial) block is the
        // total number of ones. select_for_block() and build_select_hints()
        // read it through rank_for_block(num_blocks()), so it must be the
        // running total and not the packed sub-ranks.
        if (num_words % block_size != 0) {
            rank_hints.push_back(curr_num_ones);
            rank_hints.push_back(0);
        }

        // Release
        m_rank_hints.steal(rank_hints);
    }

    // Records the block in which each multiple of selects_per_hint ones is
    // exceeded, followed by num_blocks() as the final upper bound.
    void build_select_hints() {
        std::vector<std::uint64_t> select_hints;
        std::uint64_t threshold = selects_per_hint;
        for (std::uint64_t bi = 0; bi < num_blocks(); ++bi) {
            if (rank_for_block(bi + 1) > threshold) {
                select_hints.push_back(bi);
                threshold += selects_per_hint;
            }
        }
        select_hints.push_back(num_blocks());
        m_select_hints.steal(select_hints);
    }
};
} // namespace xcdat

View file

@ -0,0 +1,58 @@
#pragma once
#include <cstdlib>
#include <vector>
namespace xcdat {
//! Read-only wrapper around std::vector<T> that takes over the storage of an
//! existing vector instead of copying it.
template <class T>
class mm_vector {
  private:
    std::vector<T> m_vec;

  public:
    mm_vector() = default;

    virtual ~mm_vector() = default;

    // The user-declared destructor suppresses the implicitly generated move
    // operations, so without these defaults every "move" of an mm_vector
    // would silently deep-copy the underlying buffer.
    mm_vector(const mm_vector&) = default;
    mm_vector& operator=(const mm_vector&) = default;
    mm_vector(mm_vector&&) noexcept = default;
    mm_vector& operator=(mm_vector&&) noexcept = default;

    // NOTE: The input vector is stolen.
    mm_vector(std::vector<T>& vec) {
        steal(vec);
    }

    //! Swaps contents with `vec` and trims spare capacity. `vec` receives
    //! whatever this object held before (empty for a fresh or reset object).
    void steal(std::vector<T>& vec) {
        m_vec.swap(vec);
        m_vec.shrink_to_fit();
    }

    //! Drops the contents and releases the storage.
    void reset() {
        m_vec = std::vector<T>();
    }

    inline std::uint64_t size() const {
        return m_vec.size();
    }

    inline auto begin() const {
        return m_vec.begin();
    }

    inline auto end() const {
        return m_vec.end();
    }

    //! Unchecked element access.
    inline const T& operator[](std::uint64_t i) const {
        return m_vec[i];
    }

    inline const T* data() const {
        return m_vec.data();
    }

    //! Passes the underlying vector to visitor.visit().
    template <typename Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_vec);
    }
};
} // namespace xcdat

24
include/xcdat/utils.hpp Normal file
View file

@ -0,0 +1,24 @@
#pragma once
#include <tuple>
#include "bit_tools.hpp"
namespace xcdat::utils {
// Splits x into (x / N, x % N).
template <std::uint64_t N>
constexpr std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
    const std::uint64_t quotient = x / N;
    const std::uint64_t remainder = x % N;
    return std::tuple<std::uint64_t, std::uint64_t>(quotient, remainder);
}
// Returns the number of words of type T needed to hold nbits bits
// (nbits divided by the word width, rounded up).
template <class T = std::uint64_t>
constexpr std::uint64_t words_for_bits(std::uint64_t nbits) {
    constexpr std::uint64_t wbits = sizeof(T) * 8;
    // Quotient plus a remainder check instead of (nbits + wbits - 1) / wbits,
    // which wraps around for nbits close to the 64-bit maximum.
    return nbits / wbits + (nbits % wbits != 0 ? 1 : 0);
}
// Returns msb(x - 1) + 1 for x > 1, and 0 for x <= 1.
inline std::uint64_t bits_for_int(std::uint64_t x) {
    if (x <= 1) {
        return 0;
    }
    return bit_tools::msb(x - 1) + 1;
}
} // namespace xcdat::utils

View file

@ -1,58 +0,0 @@
#ifndef XCDAT_BASICS_HPP_
#define XCDAT_BASICS_HPP_
#include <algorithm>
#include <array>
#include <cassert>
#include <fstream>
#include <stdint.h>
#include <stddef.h>
#include <string>
#include <utility>
#include <vector>
#include <limits>
#include "xcdat_config.hpp"
namespace xcdat {
#ifdef XCDAT_X64
using id_type = uint64_t;
#else
using id_type = uint32_t;
#endif
constexpr id_type ID_MAX = std::numeric_limits<id_type>::max();
// A pair of node IDs named base and check (presumably one element of the
// double array's BASE/CHECK arrays -- confirm against TrieBuilder).
struct BcPair {
id_type base;
id_type check;
};
// Prints "<str>\t<size>" followed by a newline (and flush) to os.
inline void show_size(const char* str, double size, std::ostream& os) {
    os << str;
    os << "\t";
    os << size;
    os << std::endl;
}
// Prints "<str>\t<size>" followed by a newline (and flush) to os.
inline void show_size(const char* str, size_t size, std::ostream& os) {
    os << str;
    os << "\t";
    os << size;
    os << std::endl;
}
// Prints "<str>\t<size>\t<size / denom>" followed by a newline (and flush),
// where the ratio is computed in double precision.
inline void show_size_ratio(const char* str, size_t size, size_t denom, std::ostream& os) {
    const double ratio = static_cast<double>(size) / static_cast<double>(denom);
    os << str << "\t" << size;
    os << "\t" << ratio;
    os << std::endl;
}
// Writes the object representation of val (sizeof(T) raw bytes) to os.
template<typename T>
inline void write_value(const T val, std::ostream& os) {
    const char* raw = reinterpret_cast<const char*>(&val);
    os.write(raw, sizeof(T));
}
// Reads sizeof(T) raw bytes from is and returns them as a T.
// The value is zero-initialized first, so a failed or short read (which sets
// the stream's fail state) returns a defined value instead of an
// indeterminate one; callers should still check the stream state.
template<typename T>
inline T read_value(std::istream& is) {
    T val{};
    is.read(reinterpret_cast<char*>(&val), sizeof(val));
    return val;
}
} //namespace - xcdat
#endif //XCDAT_BASICS_HPP_

View file

@ -1,3 +0,0 @@
# Build the usage sample from sample.cpp and link it against the xcdat target.
add_executable(sample sample.cpp)
target_link_libraries(sample xcdat)

View file

@ -1,92 +0,0 @@
#include <iostream>
#include <xcdat.hpp>
// Usage sample: builds a dictionary from a few keys, saves it to and reloads
// it from "sample.bin", then demonstrates lookup/access, common-prefix
// search, predictive search, and enumeration of all keys.
int main() {
    std::vector<std::string> keys_buf = {
        "Aoba", "Yun", "Hajime", "Hihumi", "Kou", "Rin",
        "Hazuki", "Umiko", "Nene", "Nenecchi"
    };

    // Convert to the input format
    std::vector<std::string_view> keys(keys_buf.size());
    for (size_t i = 0; i < keys.size(); ++i) {
        keys[i] = std::string_view{keys_buf[i]};
    }

    // Input data must be sorted.
    std::sort(std::begin(keys), std::end(keys));

    // Dictionary class
    using Trie = xcdat::Trie<true>;

    try {
        // Builds a dictionary from the keys
        Trie trie = xcdat::TrieBuilder::build<true>(keys); // move

        // Writes the dictionary to a file. The dictionary is raw binary data,
        // so the stream is opened in binary mode to avoid newline translation
        // on platforms that distinguish text and binary streams.
        std::ofstream ofs{"sample.bin", std::ios::binary};
        trie.write(ofs);
    } catch (const xcdat::TrieBuilder::Exception& ex) {
        // Abort if something went wrong...
        std::cerr << ex.what() << std::endl;
        return 1;
    }

    // Creates an empty dictionary
    Trie trie;
    {
        // Reads the dictionary from the file (binary mode, matching the write).
        std::ifstream ifs{"sample.bin", std::ios::binary};
        trie = Trie{ifs}; // move
    }

    std::cout << "Performing basic operations..." << std::endl;
    {
        // lookup() obtains the unique ID for a given key
        xcdat::id_type key_id = trie.lookup("Rin");
        // access() decodes the key from a given ID
        std::cout << key_id << " : " << trie.access(key_id) << std::endl;

        // Given an unregistered key, lookup() returns NOT_FOUND.
        if (trie.lookup("Hotaru") == Trie::NOT_FOUND) {
            std::cout << "? : " << "Hotaru" << std::endl;
        }
    }

    std::cout << "Performing a common prefix operation..." << std::endl;
    {
        // Common prefix operation is implemented using PrefixIterator, created by
        // make_prefix_iterator().
        Trie::PrefixIterator it = trie.make_prefix_iterator("Nenecchi");

        // next() continues to obtain the next key until false is returned.
        while (it.next()) {
            std::cout << it.id() << " : " << it.key() << std::endl;
        }
    }

    std::cout << "Performing a predictive operation..." << std::endl;
    {
        // Predictive operation is implemented using PredictiveIterator, created by
        // make_predictive_iterator().
        Trie::PredictiveIterator it = trie.make_predictive_iterator("Ha");

        // next() continues to obtain the next key until false is returned in
        // lexicographical order.
        while (it.next()) {
            std::cout << it.id() << " : " << it.key() << std::endl;
        }
    }

    std::cout << "Enumerating all registered keys..." << std::endl;
    {
        // PredictiveIterator for an empty string provides enumeration of all
        // registered keys in lexicographical order.
        Trie::PredictiveIterator it = trie.make_predictive_iterator("");
        while (it.next()) {
            std::cout << it.id() << " : " << it.key() << std::endl;
        }
    }

    return 0;
}

View file

@ -1,314 +0,0 @@
#include <popcntintrin.h>
#include "xcdat/BitVector.hpp"
namespace xcdat {
// inspired by marisa-trie
// Per-byte select lookup table with 9 rows of 256 entries. From the data,
// SELECT_TABLE[k][b] looks like the 1-based position of the k-th set bit of
// byte b, with 8 also stored when b has fewer than k set bits; row 0 is all
// zeros. NOTE(review): confirm the indexing convention against
// BitVector::select, which consumes this table.
constexpr uint8_t SELECT_TABLE[9][256] = {
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
},
{
8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1,
5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1
},
{
8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2,
8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2,
7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2,
8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2,
7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2,
7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2,
6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2
},
{
8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3,
8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3,
8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3,
8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3,
8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3,
8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3,
7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3,
8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3,
8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3,
8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3,
8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3,
8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3,
8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3,
7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3
},
{
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4,
8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4,
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4,
8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4,
8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4,
8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4,
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4,
8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4,
8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4
},
{
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5
},
{
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6
},
{
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7
},
{
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
}
};
// Returns the number of set bits in the 32-bit word x. Uses the POPCNT
// intrinsic when XCDAT_USE_POPCNT is defined, otherwise a portable
// bit-parallel count.
uint32_t pop_count(uint32_t x) {
#ifdef XCDAT_USE_POPCNT
    const auto ones = _mm_popcnt_u32(x);
    return static_cast<uint32_t>(ones);
#else
    x = ((x >> 1) & 0x55555555) + (x & 0x55555555);  // 2-bit counts
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);  // 4-bit counts
    x = (x + (x >> 4)) & 0x0F0F0F0F;                 // per-byte counts
    x += x >> 8;                                     // fold bytes together
    x += x >> 16;
    return x & 0x3F;                                 // total fits in 6 bits
#endif
}
// Deserializes a BitVector from `is`.
// Members are read in the fixed order bits_, rank_tips_, select_tips_,
// size_, num_1s_, which is the order BitVector::write() emits them.
BitVector::BitVector(std::istream& is) {
bits_ = Vector<uint32_t>(is);
rank_tips_ = Vector<RankTip>(is);
select_tips_ = Vector<id_type>(is);
size_ = read_value<size_t>(is);
num_1s_ = read_value<size_t>(is);
}
// Builds a BitVector from `builder`.
//   rank_flag   - when true, the rank index (rank_tips_) is built.
//   select_flag - when true together with rank_flag, the select hints
//                 (select_tips_) are built as well; it has no effect alone.
// An empty builder returns early and leaves every member as it was.
// NOTE(review): test_vector asserts that constructing a Vector from a
// std::vector leaves the source empty, so builder.bits_ is presumably
// consumed here - confirm against Vector's constructor.
BitVector::BitVector(BitVectorBuilder& builder, bool rank_flag,
bool select_flag) {
if (!builder.size()) {
return;
}
bits_ = Vector<uint32_t>(builder.bits_);
size_ = builder.size_;
num_1s_ = builder.num_1s_;
// builds rank_tips_
if (rank_flag) {
std::vector<RankTip> rank_tips(size_ / BITS_IN_R1 + 1);
id_type count = 0;
for (id_type i = 0; i < rank_tips.size(); ++i) {
auto& tip = rank_tips[i];
// L1: number of 1s in all words that precede this tip.
tip.L1 = count;
for (id_type offset = 0; offset < R1_PER_R2; ++offset) {
// L2[offset]: number of 1s inside this tip before word `offset`.
tip.L2[offset] = static_cast<uint8_t>(count - tip.L1);
auto pos_in_bits = i * R1_PER_R2 + offset;
// The last tip may reach past the end of bits_; such words add nothing.
if (pos_in_bits < bits_.size()) {
count += pop_count(bits_[pos_in_bits]);
}
}
}
rank_tips_ = Vector<RankTip>(rank_tips);
}
// builds select_tips_
if (rank_flag && select_flag) {
// select_tips_ records, for every ONES_PER_TIP-th threshold, the index of
// the rank tip just before the one whose L1 first exceeds that threshold.
std::vector<id_type> select_tips{0};
auto count = ONES_PER_TIP;
for (id_type i = 0; i < rank_tips_.size(); ++i) {
if (count < rank_tips_[i].L1) {
select_tips.push_back(i - 1);
count += ONES_PER_TIP;
}
}
// Closing entry so that select() can always read select_tips_[id + 1].
select_tips.push_back(static_cast<id_type>(rank_tips_.size() - 1));
select_tips_ = Vector<id_type>(select_tips);
}
}
/// Returns the number of set bits located strictly below position `i`.
///
/// The result is the sum of the tip's cumulative count (L1), the partial
/// count stored for the sub-block containing `i` (L2), and a popcount of the
/// bits below `i` inside its own 32-bit word.
id_type BitVector::rank(id_type i) const {
  const auto& tip = rank_tips_[i / BITS_IN_R1];
  const auto sub_block = i / BITS_IN_R2 % R1_PER_R2;
  // Mask away bit (i % 32) and everything above it.
  const auto low_bits = bits_[i / 32] & ((1U << (i % 32)) - 1);
  return tip.L1 + tip.L2[sub_block] + pop_count(low_bits);
}
// Returns the position of the (i+1)-th set bit (i is 0-based).
// Requires rank_tips_ to be built; select_tips_, when present, only narrows
// the range of rank tips that the binary search has to inspect.
id_type BitVector::select(id_type i) const {
id_type left = 0, right = static_cast<id_type>(rank_tips_.size());
if (!select_tips_.is_empty()) {
// Restrict [left, right) to the tips bracketed by the two neighbouring hints.
auto select_tip_id = static_cast<id_type>(i / ONES_PER_TIP);
left = select_tips_[select_tip_id];
right = select_tips_[select_tip_id + 1] + 1;
}
// Binary search for the last tip whose cumulative count L1 is <= i.
while (left + 1 < right) {
const auto center = (left + right) / 2;
if (i < rank_tips_[center].L1) {
right = center;
} else {
left = center;
}
}
i += 1; // for i+1 th
// From here on `i` is the 1-based index of the wanted bit within the tip.
i -= rank_tips_[left].L1;
// Find the first sub-block whose prefix count already reaches i, then step
// back one sub-block: that one contains the wanted bit.
uint32_t offset = 1;
for (; offset < R1_PER_R2; ++offset) {
if (i <= rank_tips_[left].L2[offset]) {
break;
}
}
i -= rank_tips_[left].L2[--offset];
auto ret = (left * BITS_IN_R1) + (offset * BITS_IN_R2);
auto bits = bits_[ret / 32];
// Narrow the word down: skip the low 16 bits if they hold fewer than i ones.
{
auto _count = pop_count(bits % 65536);
if (_count < i) {
bits >>= 16;
ret += 16;
i -= _count;
}
}
// Same step at byte granularity.
{
auto _count = pop_count(bits % 256);
if (_count < i) {
bits >>= 8;
ret += 8;
i -= _count;
}
}
// The table lookup resolves the remaining position inside one byte; the
// final "- 1" suggests SELECT_TABLE yields 1-based offsets - confirm
// against the table definition.
ret += SELECT_TABLE[i][bits % 256];
return ret - 1;
}
/// Returns the serialized size: the three vectors plus the two scalar
/// members (size_ and num_1s_).
size_t BitVector::size_in_bytes() const {
  size_t total = bits_.size_in_bytes();
  total += rank_tips_.size_in_bytes();
  total += select_tips_.size_in_bytes();
  total += sizeof(size_) + sizeof(num_1s_);
  return total;
}
// Serializes the BitVector to `os`.
// The member order (bits_, rank_tips_, select_tips_, size_, num_1s_) must
// stay in sync with the order the std::istream constructor reads them.
void BitVector::write(std::ostream& os) const {
bits_.write(os);
rank_tips_.write(os);
select_tips_.write(os);
write_value(size_, os);
write_value(num_1s_, os);
}
} //namespace - xcdat

View file

@ -1,141 +0,0 @@
#include <sstream>
#include "xcdat/DacBc.hpp"
namespace xcdat {
// Deserializes a DacBc from `is`.
// Reads sizeof(id_type) value layers, sizeof(id_type) - 1 flag layers, then
// leaf_flags_, links_, max_level_ and num_free_nodes_ - the same order that
// DacBc::write() emits.
DacBc::DacBc(std::istream& is) {
for (size_t i = 0; i < sizeof(id_type); ++i) {
values_[i] = Vector<uint8_t>(is);
}
for (size_t i = 0; i < sizeof(id_type) - 1; ++i) {
flags_[i] = BitVector(is);
}
leaf_flags_ = BitVector(is);
links_ = FitVector(is);
max_level_ = read_value<uint8_t>(is);
num_free_nodes_ = read_value<size_t>(is);
}
// Builds the byte-layered encoding of the BASE/CHECK array `bc`.
// Every node contributes two entries, in this order: its base (or, for a
// leaf, the low byte of base with the rest stored in links_) and its check.
// Non-leaf values are stored XOR-ed with the node index.
// An empty `bc` returns early and leaves all members untouched.
// NOTE(review): num_free_nodes_ is only incremented here, so it presumably
// relies on an in-class initializer of 0 - confirm in the header.
DacBc::DacBc(const std::vector<BcPair>& bc, BitVectorBuilder& leaf_flags) {
if (bc.empty()) {
return;
}
std::vector<uint8_t> values[sizeof(id_type)];
BitVectorBuilder flags[sizeof(id_type)];
std::vector<id_type> links;
leaf_flags_ = BitVector(leaf_flags, true, false);
values[0].reserve(bc.size() * 2);
flags[0].reserve(bc.size() * 2);
links.reserve(bc.size());
max_level_ = 0;
// Splits `value` into bytes, one per level, lowest byte first. A set flag
// means "continues on the next level"; the flag of the last byte is cleared.
auto append = [&](id_type value) {
uint8_t level = 0;
values[level].push_back(static_cast<uint8_t>(value & 0xFF));
flags[level].push_back(true);
value >>= 8;
while (value) {
++level;
values[level].push_back(static_cast<uint8_t>(value & 0xFF));
flags[level].push_back(true);
value >>= 8;
}
flags[level].set_bit(flags[level].size() - 1, false);
max_level_ = std::max(max_level_, level);
};
// Leaves keep only the low byte on level 0 (flag cleared); the remaining
// high part goes to `links`.
auto append_leaf = [&](id_type value) {
links.push_back(value >> 8);
values[0].push_back(static_cast<uint8_t>(value & 0xFF));
flags[0].push_back(false);
};
for (id_type i = 0; i < bc.size(); ++i) {
if (leaf_flags_[i]) {
append_leaf(bc[i].base);
} else {
append(bc[i].base ^ i);
}
append(bc[i].check ^ i);
// A node whose check equals its own index is counted as free.
if (bc[i].check == i) {
++num_free_nodes_;
}
}
// release
// Flags are only needed below the top level, so the last level stores
// values without a flag vector.
for (uint8_t i = 0; i < max_level_; ++i) {
values_[i] = Vector<uint8_t>(values[i]);
flags_[i] = BitVector(flags[i], true, false);
}
values_[max_level_] = Vector<uint8_t>(values[max_level_]);
links_ = FitVector(links);
}
size_t DacBc::size_in_bytes() const {
size_t ret = 0;
for (auto& values : values_) {
ret += values.size_in_bytes();
}
for (auto& flags : flags_) {
ret += flags.size_in_bytes();
}
ret += leaf_flags_.size_in_bytes();
ret += links_.size_in_bytes();
ret += sizeof(max_level_);
ret += sizeof(num_free_nodes_);
return ret;
}
// Prints statistics about this DacBc to `os`: the number of links, the
// average bytes per node, and the share of the total size taken by each
// value layer, each flag layer, the leaf flags and the links.
void DacBc::show_stat(std::ostream& os) const {
const auto total_size = size_in_bytes();
os << "basic statistics of xcdat::DacBc" << std::endl;
show_size("\tnum links: ", links_.size(), os);
show_size("\tbytes per node:", double(total_size) / num_nodes(), os);
os << "member size statistics of xcdat::DacBc" << std::endl;
// Value layers exist for levels 0..max_level_ inclusive.
for (int i = 0; i <= max_level_; ++i) {
std::ostringstream oss;
oss << "\tvalues_L" << i << ":";
show_size_ratio(oss.str().c_str(), values_[i].size_in_bytes(), total_size, os);
}
// Flag layers exist only below the top level.
for (int i = 0; i < max_level_; ++i) {
std::ostringstream oss;
oss << "\tflags_L" << i << ": ";
show_size_ratio(oss.str().c_str(), flags_[i].size_in_bytes(), total_size, os);
}
show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os);
show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os);
}
// Serializes the DacBc to `os`: every value layer, every flag layer, then
// leaf_flags_, links_, max_level_ and num_free_nodes_. The order must match
// what the std::istream constructor reads.
void DacBc::write(std::ostream& os) const {
for (auto& values : values_) {
values.write(os);
}
for (auto& flags : flags_) {
flags.write(os);
}
leaf_flags_.write(os);
links_.write(os);
write_value(max_level_, os);
write_value(num_free_nodes_, os);
}
/// Reassembles the value stored at entry `i` of level 0.
///
/// Starting from the lowest byte, follows the continuation flags upward: a
/// set flag means another byte exists on the next level, at the index given
/// by rank() over the current level's flags.
id_type DacBc::access_(id_type i) const {
  id_type value = values_[0][i];
  for (uint8_t level = 0; level < max_level_ && flags_[level][i];) {
    i = flags_[level].rank(i);
    ++level;
    value |= static_cast<id_type>(values_[level][i]) << (level * 8);
  }
  return value;
}
} //namespace - xcdat

View file

@ -1,188 +0,0 @@
#include "xcdat/FastDacBc.hpp"
namespace xcdat {
// Deserializes a FastDacBc from `is`.
// Reads the value layers (L4 only when XCDAT_X64 is defined), LAYERS - 1
// rank tables, leaf_flags_, links_ and num_free_nodes_, in the same order
// that FastDacBc::write() emits them.
FastDacBc::FastDacBc(std::istream& is) {
values_L1_ = Vector<uint8_t>(is);
values_L2_ = Vector<uint16_t>(is);
values_L3_ = Vector<uint32_t>(is);
#ifdef XCDAT_X64
values_L4_ = Vector<uint64_t>(is);
#endif
for (size_t i = 0; i < LAYERS - 1; ++i) {
ranks_[i] = Vector<id_type>(is);
}
leaf_flags_ = BitVector(is);
links_ = FitVector(is);
num_free_nodes_ = read_value<size_t>(is);
}
/// Builds the layered (L1: 8-bit, L2: 16-bit, L3: 32-bit, and with XCDAT_X64
/// L4: 64-bit) encoding of the BASE/CHECK array `bc`.
///
/// Every node contributes two L1 entries, in this order: its base (or, for a
/// leaf, the low byte of base with the rest stored in links_) and its check.
/// Non-leaf values are stored XOR-ed with the node index. In layers below the
/// last one, the lowest bit of an entry tells whether the remaining bits are
/// the value itself (0) or an offset into the next layer (1), relative to the
/// rank entry recorded for the current block.
///
/// An empty `bc` returns early and leaves all members untouched.
FastDacBc::FastDacBc(const std::vector<BcPair>& bc,
                     BitVectorBuilder& leaf_flags) {
  if (bc.empty()) {
    return;
  }
  std::vector<uint8_t> values_L1;
  std::vector<uint16_t> values_L2;
  std::vector<uint32_t> values_L3;
#ifdef XCDAT_X64
  std::vector<uint64_t> values_L4;
#endif
  std::vector<id_type> ranks[LAYERS - 1];
  std::vector<id_type> links;
  leaf_flags_ = BitVector(leaf_flags, true, false);
  ranks[0].reserve((bc.size() * 2) / 128);

  auto append = [&](id_type value) {
    // At each block boundary remember where the next layer currently ends.
    if ((values_L1.size() % BLOCK_SIZE_L1) == 0) {
      ranks[0].push_back(static_cast<id_type>(values_L2.size()));
    }
    if ((value / BLOCK_SIZE_L1) == 0) {
      values_L1.push_back(static_cast<uint8_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L2.size() - ranks[0].back();
      values_L1.push_back(static_cast<uint8_t>(1 | (pos << 1)));
    }

    if ((values_L2.size() % BLOCK_SIZE_L2) == 0) {
      ranks[1].push_back(static_cast<id_type>(values_L3.size()));
    }
    if ((value / BLOCK_SIZE_L2) == 0) {
      values_L2.push_back(static_cast<uint16_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L3.size() - ranks[1].back();
      values_L2.push_back(static_cast<uint16_t>(1 | (pos << 1)));
    }

#ifdef XCDAT_X64
    // L3 block offsets belong to the third rank table: access_() resolves an
    // L3 entry through ranks_[2]. Recording them in ranks[1] would mix them
    // into the L2 table and leave ranks_[2] empty.
    if ((values_L3.size() % BLOCK_SIZE_L3) == 0) {
      ranks[2].push_back(static_cast<id_type>(values_L4.size()));
    }
    if ((value / BLOCK_SIZE_L3) == 0) {
      values_L3.push_back(static_cast<uint32_t>(0 | (value << 1)));
      return;
    } else {
      auto pos = values_L4.size() - ranks[2].back();
      values_L3.push_back(static_cast<uint32_t>(1 | (pos << 1)));
    }
    values_L4.push_back(value);
#else
    values_L3.push_back(value);
#endif
  };

  // Leaves store the raw low byte in L1 and the remaining high part in links.
  auto append_leaf = [&](id_type value) {
    if ((values_L1.size() % BLOCK_SIZE_L1) == 0) {
      ranks[0].push_back(static_cast<id_type>(values_L2.size()));
    }
    values_L1.push_back(static_cast<uint8_t>(value & 0xFF));
    links.push_back(value >> 8);
  };

  for (id_type i = 0; i < bc.size(); ++i) {
    if (leaf_flags_[i]) {
      append_leaf(bc[i].base);
    } else {
      append(bc[i].base ^ i);
    }
    append(bc[i].check ^ i);
    // A node whose check equals its own index is counted as free.
    if (bc[i].check == i) {
      ++num_free_nodes_;
    }
  }

  // release
  values_L1_ = Vector<uint8_t>(values_L1);
  values_L2_ = Vector<uint16_t>(values_L2);
  values_L3_ = Vector<uint32_t>(values_L3);
#ifdef XCDAT_X64
  values_L4_ = Vector<uint64_t>(values_L4);
#endif
  for (uint8_t j = 0; j < LAYERS - 1; ++j) {
    ranks_[j] = Vector<id_type>(ranks[j]);
  }
  links_ = FitVector(links);
}
size_t FastDacBc::size_in_bytes() const {
size_t ret = 0;
ret += values_L1_.size_in_bytes();
ret += values_L2_.size_in_bytes();
ret += values_L3_.size_in_bytes();
#ifdef XCDAT_X64
ret += values_L4_.size_in_bytes();
#endif
for (auto& ranks : ranks_) {
ret += ranks.size_in_bytes();
}
ret += leaf_flags_.size_in_bytes();
ret += links_.size_in_bytes();
ret += sizeof(num_free_nodes_);
return ret;
}
// Prints statistics about this FastDacBc to `os`: the number of links, the
// average bytes per node, and the share of the total size taken by each
// value layer, each rank table, the leaf flags and the links.
// The L4 values and the third rank table are reported only with XCDAT_X64.
void FastDacBc::show_stat(std::ostream& os) const {
const auto total_size = size_in_bytes();
os << "basic statistics of xcdat::FastDacBc" << std::endl;
show_size("\tnum links: ", links_.size(), os);
show_size("\tbytes per node:", double(total_size) / num_nodes(), os);
os << "member size statistics of xcdat::FastDacBc" << std::endl;
show_size_ratio("\tvalues_L1:", values_L1_.size_in_bytes(), total_size, os);
show_size_ratio("\tvalues_L2:", values_L2_.size_in_bytes(), total_size, os);
show_size_ratio("\tvalues_L3:", values_L3_.size_in_bytes(), total_size, os);
#ifdef XCDAT_X64
show_size_ratio("\tvalues_L4:", values_L4_.size_in_bytes(), total_size, os);
#endif
show_size_ratio("\tranks_L1: ", ranks_[0].size_in_bytes(), total_size, os);
show_size_ratio("\tranks_L2: ", ranks_[1].size_in_bytes(), total_size, os);
#ifdef XCDAT_X64
show_size_ratio("\tranks_L3: ", ranks_[2].size_in_bytes(), total_size, os);
#endif
show_size_ratio("\tleaves: ", leaf_flags_.size_in_bytes(), total_size, os);
show_size_ratio("\tlinks: ", links_.size_in_bytes(), total_size, os);
}
// Serializes the FastDacBc to `os`: the value layers (L4 only with
// XCDAT_X64), all rank tables, leaf_flags_, links_ and num_free_nodes_.
// The order must match what the std::istream constructor reads.
void FastDacBc::write(std::ostream& os) const {
values_L1_.write(os);
values_L2_.write(os);
values_L3_.write(os);
#ifdef XCDAT_X64
values_L4_.write(os);
#endif
for (auto& ranks : ranks_) {
ranks.write(os);
}
leaf_flags_.write(os);
links_.write(os);
write_value(num_free_nodes_, os);
}
// Decodes the value stored at L1 entry `i`.
// In each layer the lowest bit of an entry selects its meaning: 0 - the
// remaining bits are the value; 1 - the remaining bits are an offset into
// the next layer, relative to the rank entry of the current block.
// The last layer (L3, or L4 with XCDAT_X64) stores the value directly.
id_type FastDacBc::access_(id_type i) const {
uint32_t value = values_L1_[i] >> 1;
if ((values_L1_[i] & 1U) == 0) {
return value;
}
// Follow the link into L2.
i = ranks_[0][i / BLOCK_SIZE_L1] + value;
value = values_L2_[i] >> 1;
if ((values_L2_[i] & 1U) == 0) {
return value;
}
// Follow the link into L3.
i = ranks_[1][i / BLOCK_SIZE_L2] + value;
#ifdef XCDAT_X64
value = values_L3_[i] >> 1;
if ((values_L3_[i] & 1U) == 0) {
return value;
}
// Follow the link into L4, which is resolved through the third rank table.
i = ranks_[2][i / BLOCK_SIZE_L3] + value;
return values_L4_[i];
#else
return values_L3_[i];
#endif
}
} //namespace - xcdat

View file

@ -1,60 +0,0 @@
#include "xcdat/FitVector.hpp"
namespace xcdat {
// Deserializes a FitVector from `is`, reading chunks_, size_, width_ and
// mask_ in the order FitVector::write() emits them.
FitVector::FitVector(std::istream& is) {
chunks_ = Vector<id_type>(is);
size_= read_value<size_t>(is);
width_ = read_value<id_type>(is);
mask_ = read_value<id_type>(is);
}
/// Packs `values` into fixed-width fields, using the smallest width (in bits)
/// that can hold the largest element.
///
/// Fields are laid out back to back across chunks of CHUNK_WIDTH bits, so a
/// field may straddle two neighbouring chunks. An empty input returns early
/// and leaves all members untouched.
FitVector::FitVector(const std::vector<id_type>& values) {
  if (values.empty()) {
    return;
  }

  // Number of bits needed for the largest value (at least 1).
  width_ = 0;
  auto max_value = *std::max_element(std::begin(values), std::end(values));
  do {
    ++width_;
    max_value >>= 1;
  } while (max_value);

  size_ = values.size();

  // Build the mask in id_type and special-case the full width: shifting by
  // the full bit width of the operand (e.g. `1U << 32`) is undefined.
  if (width_ < sizeof(id_type) * 8) {
    mask_ = static_cast<id_type>((static_cast<id_type>(1) << width_) - 1);
  } else {
    mask_ = static_cast<id_type>(~static_cast<id_type>(0));
  }

  std::vector<id_type> chunks(size_ * width_ / CHUNK_WIDTH + 1, 0);
  for (id_type i = 0; i < size_; ++i) {
    // Compute the bit position in size_t so that i * width_ cannot wrap
    // around in a narrower id_type.
    const size_t bit_pos = static_cast<size_t>(i) * width_;
    const auto chunk_pos = static_cast<id_type>(bit_pos / CHUNK_WIDTH);
    const auto offset = static_cast<id_type>(bit_pos % CHUNK_WIDTH);

    chunks[chunk_pos] &= ~(mask_ << offset);
    chunks[chunk_pos] |= (values[i] & mask_) << offset;

    // The field spills into the next chunk: store its high part there.
    if (CHUNK_WIDTH < offset + width_) {
      chunks[chunk_pos + 1] &= ~(mask_ >> (CHUNK_WIDTH - offset));
      chunks[chunk_pos + 1] |= (values[i] & mask_) >> (CHUNK_WIDTH - offset);
    }
  }
  chunks_ = Vector<id_type>(chunks);
}
/// Returns the serialized size: the packed chunks plus the three scalar
/// members (size_, width_ and mask_).
size_t FitVector::size_in_bytes() const {
  size_t total = chunks_.size_in_bytes();
  total += sizeof(size_) + sizeof(width_) + sizeof(mask_);
  return total;
}
// Serializes the FitVector to `os` as chunks_, size_, width_, mask_; the
// order must match what the std::istream constructor reads.
void FitVector::write(std::ostream& os) const {
chunks_.write(os);
write_value(size_, os);
write_value(width_, os);
write_value(mask_, os);
}
} //namespace - xcdat

View file

@ -1,317 +0,0 @@
#include <iostream>
#include "xcdat/TrieBuilder.hpp"
namespace xcdat {
// Builds the whole trie from `keys`.
//   keys     - input keys; build_bc_ throws if they are not in
//              lexicographical order.
//   width_L1 - log2 of the block size used for the heads_ table.
//   bin_mode - initial binary-mode flag; build_table_ forces it to true when
//              any key contains a '\0' byte.
// Throws TrieBuilder::Exception when `keys` is empty or holds more than
// ID_MAX entries.
TrieBuilder::TrieBuilder(const std::vector<std::string_view>& keys,
id_type width_L1, bool bin_mode)
: keys_(keys), block_size_(1U << width_L1), width_L1_(width_L1),
bin_mode_(bin_mode) {
if (keys_.empty()) {
throw TrieBuilder::Exception("The input data is empty.");
}
if (ID_MAX < keys_.size()) {
throw TrieBuilder::Exception("Key ID range error.");
}
{
// Reserve the smallest power of two that is >= the number of keys.
size_t init_capa = 1;
while (init_capa < keys_.size()) {
init_capa <<= 1;
}
bc_.reserve(init_capa);
leaf_flags_.reserve(init_capa);
term_flags_.reserve(init_capa);
used_flags_.reserve(init_capa);
heads_.reserve(init_capa >> width_L1_);
}
alphabet_.reserve(256);
edges_.reserve(256);
suffixes_.reserve(keys_.size());
// initialize an empty list.
// Free nodes form a circular doubly linked list: base points to the next
// free node and check to the previous one. The i - 1 of node 0 wraps around
// and is repaired just below.
for (id_type i = 0; i < 256; ++i) {
bc_.push_back({i + 1, i - 1});
leaf_flags_.push_back(false);
term_flags_.push_back(false);
used_flags_.push_back(false);
}
bc_[255].base = 0;
bc_[0].check = 255;
// One list head per block of block_size_ nodes.
for (id_type i = 0; i < 256; i += block_size_) {
heads_.push_back(i);
}
// Node 0 is taken out of the free list (build_bc_ starts from node 0).
use_(0);
// TABOO_ID is flagged as used while it stays linked in the list; other
// methods compare against it as the list sentinel.
bc_[0].check = TABOO_ID;
used_flags_[TABOO_ID] = true;
heads_[TABOO_ID >> width_L1_] = bc_[TABOO_ID].base;
build_table_();
build_bc_(0, keys_.size(), 0, 0);
build_tail_();
}
// Computes the byte statistics of the keys and fills:
//   max_length_     - length of the longest key,
//   alphabet_       - every byte value that occurs in some key, ascending,
//   table_[0..255]  - byte -> code, where codes are assigned in order of
//                     descending byte frequency,
//   table_[256..511] - the inverse mapping, code -> byte.
// Also switches bin_mode_ on when a key contains a '\0' byte.
void TrieBuilder::build_table_() {
using tb_type = std::pair<uint8_t, size_t>;
tb_type table_builder[256];
for (uint32_t i = 0; i < 256; ++i) {
table_builder[i] = {static_cast<uint8_t>(i), 0};
}
max_length_ = 0;
for (auto& key : keys_) {
for (char c : key) {
++table_builder[static_cast<uint8_t>(c)].second;
}
max_length_ = std::max(max_length_, key.length());
}
if (table_builder[0].second != 0) { // including '\0'
bin_mode_ = true;
}
// table_builder is still ordered by byte value here, so alphabet_ is sorted.
for (const auto& item : table_builder) {
if (item.second != 0) {
alphabet_.push_back(item.first);
}
}
alphabet_.shrink_to_fit();
// Order by descending frequency. std::sort is not stable, so the relative
// order of bytes with equal counts is unspecified.
std::sort(std::begin(table_builder), std::end(table_builder),
[](const tb_type& lhs, const tb_type& rhs) {
return lhs.second > rhs.second;
});
for (uint32_t i = 0; i < 256; ++i) {
table_[table_builder[i].first] = static_cast<uint8_t>(i);
}
for (uint32_t i = 0; i < 256; ++i) {
table_[table_[i] + 256] = static_cast<uint8_t>(i);
}
}
// Recursively builds the BASE/CHECK entries for the subtrie rooted at
// `node_id`, which covers keys_[begin, end) sharing a prefix of length
// `depth`.
// Throws TrieBuilder::Exception when the keys in the range are not in
// lexicographical order at position `depth`.
void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth,
id_type node_id) {
if (keys_[begin].length() == depth) {
// The first key ends exactly at this node.
term_flags_.set_bit(node_id, true);
if (++begin == end) { // without link?
bc_[node_id].base = 0; // with an empty suffix
leaf_flags_.set_bit(node_id, true);
return;
}
} else if (begin + 1 == end) { // leaf?
// A single key remains: the rest of it is deferred to build_tail_().
term_flags_.set_bit(node_id, true);
leaf_flags_.set_bit(node_id, true);
auto& key = keys_[begin];
suffixes_.push_back(
{{key.data() + depth, key.length() - depth}, node_id}
);
return;
}
{ // fetching edges
// Collect the distinct bytes at position `depth`, in key order.
edges_.clear();
auto label = static_cast<uint8_t>(keys_[begin][depth]);
for (auto str_id = begin + 1; str_id < end; ++str_id) {
const auto _label = static_cast<uint8_t>(keys_[str_id][depth]);
if (label != _label) {
if (_label < label) {
throw TrieBuilder::Exception(
"The input data is not in lexicographical order."
);
}
edges_.push_back(label);
label = _label;
}
}
edges_.push_back(label);
}
const auto base = find_base_(node_id >> width_L1_);
// find_base_ may return a value at or beyond the current array end.
if (bc_.size() <= base) {
expand_();
}
// defining new edges
bc_[node_id].base = base;
for (const auto label : edges_) {
const auto child_id = base ^ table_[label];
use_(child_id);
bc_[child_id].check = node_id;
}
// following the children
// edges_ is reused by the recursive calls, so the key range is scanned
// again here instead of iterating over edges_.
auto _begin = begin;
auto label = static_cast<uint8_t>(keys_[begin][depth]);
for (auto _end = begin + 1; _end < end; ++_end) {
const auto _label = static_cast<uint8_t>(keys_[_end][depth]);
if (label != _label) {
build_bc_(_begin, _end, depth + 1, base ^ table_[label]);
label = _label;
_begin = _end;
}
}
build_bc_(_begin, end, depth + 1, base ^ table_[label]);
}
// The algorithm is inspired by marisa-trie
// Stores the suffixes collected by build_bc_() in tail_ and writes each
// suffix's start offset into the base field of its node. A suffix that is
// itself a suffix of the previously processed one shares that one's storage.
// Throws TrieBuilder::Exception when a collected suffix is empty or tail_
// grows beyond ID_MAX.
void TrieBuilder::build_tail_() {
// Sort by comparing the strings back to front, so that suffixes sharing
// an ending become neighbours.
std::sort(std::begin(suffixes_), std::end(suffixes_),
[](const Suffix& lhs, const Suffix& rhs) {
return std::lexicographical_compare(
std::rbegin(lhs), std::rend(lhs),
std::rbegin(rhs), std::rend(rhs));
});
// For empty suffixes
tail_.emplace_back('\0');
if (bin_mode_) {
boundary_flags_.push_back(false);
}
const Suffix dummy = {{nullptr, 0}, 0};
const Suffix* prev_suf = &dummy;
// Walk the sorted suffixes from last to first.
for (size_t i = suffixes_.size(); i > 0; --i) {
const auto& cur_suf = suffixes_[i - 1];
if (cur_suf.length() == 0) {
throw TrieBuilder::Exception("A suffix is empty.");
}
// NOTE(review): Suffix::operator[] presumably indexes from the end of the
// string, which is what makes the offset computed below valid - confirm
// in the Suffix definition.
size_t match = 0;
while ((match < cur_suf.length()) && (match < prev_suf->length())
&& ((*prev_suf)[match] == cur_suf[match])) {
++match;
}
if ((match == cur_suf.length()) && (prev_suf->length() != 0)) { // sharing
bc_[cur_suf.node_id].base = static_cast<id_type>(
bc_[prev_suf->node_id].base + (prev_suf->length() - match)
);
} else { // append
bc_[cur_suf.node_id].base = static_cast<id_type>(tail_.size());
std::copy(std::begin(cur_suf.str), std::end(cur_suf.str),
std::back_inserter(tail_));
// Binary mode marks the last byte of each suffix in boundary_flags_;
// text mode terminates the suffix with '\0' instead.
if (bin_mode_) {
for (size_t j = 1; j < cur_suf.length(); ++j) {
boundary_flags_.push_back(false);
}
boundary_flags_.push_back(true);
} else {
tail_.emplace_back('\0');
}
if (ID_MAX < tail_.size()) {
throw TrieBuilder::Exception("TAIL address range error.");
}
}
prev_suf = &cur_suf;
}
}
// Grows the node arrays by 256 free nodes and splices them into the free
// list. Throws TrieBuilder::Exception if the new size would exceed ID_MAX.
void TrieBuilder::expand_() {
if (ID_MAX < bc_.size() + 256) {
throw TrieBuilder::Exception("Node ID range error.");
}
const auto old_size = static_cast<id_type>(bc_.size());
const auto new_size = old_size + 256;
// New nodes are chained to each other: base = next, check = previous.
for (auto i = old_size; i < new_size; ++i) {
bc_.push_back({i + 1, i - 1});
leaf_flags_.push_back(false);
term_flags_.push_back(false);
used_flags_.push_back(false);
}
{
// Insert the new chain between the current last free node and TABOO_ID.
const auto last = bc_[TABOO_ID].check;
bc_[old_size].check = last;
bc_[last].base = old_size;
bc_[new_size - 1].base = TABOO_ID;
bc_[TABOO_ID].check = new_size - 1;
}
for (auto i = old_size; i < new_size; i += block_size_) {
heads_.push_back(i);
}
// Only the most recent FREE_BLOCKS 256-node blocks stay open for
// allocation; the block that falls out of that window is closed.
const auto block_id = old_size / 256;
if (FREE_BLOCKS <= block_id) {
close_block_(block_id - FREE_BLOCKS);
}
}
/// Marks `node_id` as used and unlinks it from the free list, in which base
/// holds the next free node and check the previous one.
///
/// If the node was the recorded head of its block, the head moves to the
/// next free node when that node lies in the same block, and to TABOO_ID
/// otherwise.
void TrieBuilder::use_(id_type node_id) {
  used_flags_[node_id] = true;

  const auto successor = bc_[node_id].base;
  const auto predecessor = bc_[node_id].check;
  bc_[predecessor].base = successor;
  bc_[successor].check = predecessor;

  const auto block_id = node_id >> width_L1_;
  if (heads_[block_id] != node_id) {
    return;
  }
  if (block_id != successor >> width_L1_) {
    heads_[block_id] = TABOO_ID;
  } else {
    heads_[block_id] = successor;
  }
}
// Closes the 256-node block `block_id`: every still-unused node in it is
// removed from the free list and made to point to itself (base == check ==
// own index), and all heads_ entries covering the block are set to TABOO_ID.
void TrieBuilder::close_block_(id_type block_id) {
const auto begin = block_id * 256;
const auto end = begin + 256;
for (auto i = begin; i < end; ++i) {
if (!used_flags_[i]) {
use_(i);
bc_[i].base = i;
bc_[i].check = i;
// use_() set the flag; clear it again because the node holds no trie node.
used_flags_[i] = false;
}
}
for (auto i = begin; i < end; i += block_size_) {
heads_[i >> width_L1_] = TABOO_ID;
}
}
// Finds a base value under which every label in edges_ maps to an unused
// node. Candidates are derived from free nodes, first those in block
// `block_id`, then the whole free list. When no candidate fits (or the free
// list is empty) a base past the current end of bc_ is returned, which the
// caller handles by expanding the arrays.
id_type TrieBuilder::find_base_(id_type block_id) const {
if (bc_[TABOO_ID].base == TABOO_ID) { // Full?
return static_cast<id_type>(bc_.size()) ^ table_[edges_[0]];
}
// search in the same block
for (auto i = heads_[block_id];
i != TABOO_ID && i >> width_L1_ == block_id;
i = bc_[i].base) {
// Choose the base that places the first edge's child exactly on node i.
const auto base = i ^ table_[edges_[0]];
if (is_target_(base)) {
return base; // base / block_size_ == block_id
}
}
// Fall back to scanning the entire free list.
for (auto i = bc_[TABOO_ID].base; i != TABOO_ID; i = bc_[i].base) {
const auto base = i ^ table_[edges_[0]];
if (is_target_(base)) {
return base; // base / block_size_ != block_id
}
}
return static_cast<id_type>(bc_.size()) ^ table_[edges_[0]];
}
/// Returns true when `base` is usable for the current edges_, i.e. none of
/// the child slots `base ^ table_[label]` is already marked as used.
bool TrieBuilder::is_target_(id_type base) const {
  auto edge = std::begin(edges_);
  // Advance while the child slot for the current label is still free.
  while (edge != std::end(edges_) && !used_flags_[base ^ table_[*edge]]) {
    ++edge;
  }
  return edge == std::end(edges_);
}
} //namespace - xcdat

View file

@ -1,8 +1,6 @@
file(GLOB TEST_SOURCES test_*.cpp)
file(GLOB TEST_SOURCES *_test.cpp)
foreach(TEST_SOURCE ${TEST_SOURCES}) foreach(TEST_SOURCE ${TEST_SOURCES})
get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE) get_filename_component(TEST_SOURCE_NAME ${TEST_SOURCE} NAME_WE)
add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE}) add_executable(${TEST_SOURCE_NAME} ${TEST_SOURCE})
target_link_libraries(${TEST_SOURCE_NAME} xcdat)
add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME}) add_test(${TEST_SOURCE_NAME} ${TEST_SOURCE_NAME})
endforeach() endforeach()

6260
test/doctest/doctest.h Normal file

File diff suppressed because it is too large Load diff

110
test/test_bit_vector.cpp Normal file
View file

@ -0,0 +1,110 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <random>
#include <xcdat/bit_vector.hpp>
#include "doctest/doctest.h"
// Produces `n` pseudo-random bits from a fixed-seed Mersenne Twister, so
// every call with the same `n` yields the same sequence.
std::vector<bool> generate_random_bits(std::uint64_t n) {
    static constexpr std::uint64_t seed = 13;
    std::mt19937 engine(seed);
    std::vector<bool> bits(n);
    for (std::uint64_t pos = 0; pos < n; ++pos) {
        // Keep only the lowest bit of each generated number.
        bits[pos] = (engine() & 1U) != 0;
    }
    return bits;
}
// Returns the number of set bits in `bits`.
// Uses std::count from <algorithm>, which this file includes; the previous
// std::accumulate lives in <numeric>, which is not included here.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
// Reference implementation of rank: the number of set bits in positions
// [0, i). The caller must pass i <= bits.size().
// Uses std::count from <algorithm>, which this file includes; the previous
// std::accumulate lives in <numeric>, which is not included here.
std::uint64_t rank_naive(const std::vector<bool>& bits, std::uint64_t i) {
    const auto last = bits.begin() + static_cast<std::vector<bool>::difference_type>(i);
    return static_cast<std::uint64_t>(std::count(bits.begin(), last, true));
}
// Reference implementation of select: the position of the (n+1)-th set bit,
// or bits.size() when fewer than n+1 bits are set.
std::uint64_t select_naive(const std::vector<bool>& bits, std::uint64_t n) {
    std::uint64_t remaining = n;
    for (std::uint64_t pos = 0; pos < bits.size(); ++pos) {
        if (!bits[pos]) {
            continue;
        }
        if (remaining == 0) {
            return pos;
        }
        --remaining;
    }
    return bits.size();
}
// Checks that a builder sized with resize() reports the requested size and
// returns every bit exactly as it was written with set_bit().
TEST_CASE("Test bit_vector::builder with resize") {
const auto bits = generate_random_bits(10000);
xcdat::bit_vector::builder b;
b.resize(bits.size());
REQUIRE_EQ(b.size(), bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
b.set_bit(i, bits[i]);
}
for (std::uint64_t i = 0; i < bits.size(); i++) {
REQUIRE_EQ(b[i], bits[i]);
}
}
// Checks that a builder filled with push_back() (after reserve()) ends up
// with the expected size and returns every bit in insertion order.
TEST_CASE("Test bit_vector::builder with push_back") {
const auto bits = generate_random_bits(10000);
xcdat::bit_vector::builder b;
b.reserve(bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
b.push_back(bits[i]);
}
REQUIRE_EQ(b.size(), bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
REQUIRE_EQ(b[i], bits[i]);
}
}
// Builds a bit_vector with both rank and select support and compares size,
// number of ones, bit access, rank and select against the naive reference
// implementations on 10000 pseudo-random bits.
TEST_CASE("Test bit_vector") {
const auto bits = generate_random_bits(10000);
xcdat::bit_vector bv;
{
xcdat::bit_vector::builder b(bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
b.set_bit(i, bits[i]);
}
bv.build(b, true, true);
}
REQUIRE_EQ(bv.size(), bits.size());
REQUIRE_EQ(bv.num_ones(), get_num_ones(bits));
for (std::uint64_t i = 0; i < bits.size(); i++) {
REQUIRE_EQ(bv[i], bits[i]);
}
static constexpr std::uint64_t seed = 17;
std::mt19937_64 engine(seed);
{
// rank is sampled over [0, size], i.e. including the one-past-the-end position.
std::uniform_int_distribution<std::uint64_t> dist(0, bv.size());
for (std::uint64_t r = 0; r < 100; r++) {
const std::uint64_t i = dist(engine);
REQUIRE_EQ(bv.rank(i), rank_naive(bits, i));
}
}
{
// select is sampled over the valid 0-based indices of set bits.
std::uniform_int_distribution<std::uint64_t> dist(0, bv.num_ones() - 1);
for (std::uint64_t r = 0; r < 100; r++) {
const std::uint64_t n = dist(engine);
REQUIRE_EQ(bv.select(n), select_naive(bits, n));
}
}
}

View file

@ -1,265 +0,0 @@
#undef NDEBUG
#include <algorithm>
#include <cassert>
#include <iostream>
#include <random>
#include <cstring>
#include "xcdat.hpp"
using namespace xcdat;
namespace {
constexpr size_t NUM_KEYS = 1U << 10;
constexpr size_t MAX_LENGTH = 20;
// Sorts `keys` and removes duplicates in place.
void to_set(std::vector<std::string>& keys) {
  std::sort(keys.begin(), keys.end());
  const auto first_duplicate = std::unique(keys.begin(), keys.end());
  keys.erase(first_duplicate, keys.end());
}
// Returns a random key of 1..MAX_LENGTH upper-case letters ('A'..'Z').
std::string make_key() {
  std::random_device rnd;
  std::string key;
  const size_t length = (rnd() % MAX_LENGTH) + 1;
  while (key.length() < length) {
    key += 'A' + (rnd() % 26);
  }
  return key;
}
// Generates NUM_KEYS random keys and returns them sorted with duplicates
// removed, so the result may hold fewer than NUM_KEYS entries.
std::vector<std::string> make_keys() {
  std::vector<std::string> keys;
  keys.reserve(NUM_KEYS);
  for (size_t remaining = NUM_KEYS; remaining > 0; --remaining) {
    keys.push_back(make_key());
  }
  to_set(keys);
  return keys;
}
// Generates up to NUM_KEYS random keys that do not occur in `keys` and
// returns them sorted with duplicates removed.
std::vector<std::string> make_other_keys(const std::vector<std::string>& keys) {
  std::vector<std::string> others;
  for (size_t attempt = 0; attempt < NUM_KEYS; ++attempt) {
    auto candidate = make_key();
    const bool already_known =
        std::find(std::begin(keys), std::end(keys), candidate) != std::end(keys);
    if (already_known) {
      continue;
    }
    others.push_back(candidate);
  }
  to_set(others);
  return others;
}
// Builds a Trie<Fast> from `keys` in the requested mode and asserts that it
// reports as many keys as were supplied. Returns the built trie.
template<bool Fast>
Trie<Fast> test_build(const std::vector<std::string_view>& keys,
bool bin_mode) {
std::cerr << "Construction -> build()\n";
auto trie = TrieBuilder::build<Fast>(keys, bin_mode);
assert(trie.num_keys() == keys.size());
return trie;
}
// Asserts that every key in `keys` is found by lookup() and restored
// unchanged by access(), and that no key in `others` is found.
template<bool Fast>
void test_basic_operations(const Trie<Fast>& trie,
const std::vector<std::string_view>& keys,
const std::vector<std::string_view>& others) {
std::cerr << "Basic operations -> lookup() and access()\n";
for (auto& key : keys) {
auto id = trie.lookup(key);
assert(id != Trie<Fast>::NOT_FOUND);
auto dec = trie.access(id);
assert(dec == key);
}
for (auto& other : others) {
const auto id = trie.lookup(other);
assert(id == Trie<Fast>::NOT_FOUND);
}
}
// Exercises the prefix iterator: every reported key must be no longer than
// the query and must agree with access(id).
// A registered key yields between 1 and key.length() results; an
// unregistered key yields only results strictly shorter than itself.
template<bool Fast>
void test_prefix_operations(const Trie<Fast>& trie,
const std::vector<std::string_view>& keys,
const std::vector<std::string_view>& others) {
std::cerr << "Prefix operations -> PrefixIterator\n";
for (auto& key : keys) {
size_t num_results = 0;
auto it = trie.make_prefix_iterator(key);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(dec.length() <= key.length());
auto dec2 = trie.access(id);
assert(dec == dec2);
++num_results;
}
assert(1 <= num_results);
assert(num_results <= key.length());
}
for (auto& other : others) {
size_t num_results = 0;
auto it = trie.make_prefix_iterator(other);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(dec.length() < other.length());
auto dec2 = trie.access(id);
assert(dec == dec2);
++num_results;
}
assert(num_results < other.length());
}
}
// Exercises the predictive iterator: every reported key must be at least as
// long as the query and must agree with access(id).
// A registered key yields at least one result; an unregistered key yields
// only strictly longer keys; an empty query enumerates all keys.
template<bool Fast>
void test_predictive_operations(const Trie<Fast>& trie,
const std::vector<std::string_view>& keys,
const std::vector<std::string_view>& others) {
std::cerr << "Predictive operations -> PredictiveIterator\n";
for (auto& key : keys) {
size_t num_results = 0;
auto it = trie.make_predictive_iterator(key);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(key.length() <= dec.length());
auto dec2 = trie.access(id);
assert(dec == dec2);
++num_results;
}
assert(1 <= num_results);
}
for (auto& other : others) {
auto it = trie.make_predictive_iterator(other);
while (it.next()) {
auto id = it.id();
auto dec = it.key();
assert(other.length() < dec.length());
auto dec2 = trie.access(id);
assert(dec == dec2);
}
}
{ // all enumeration
size_t num_results = 0;
auto it = trie.make_predictive_iterator(std::string_view{});
while (it.next()) {
auto id = it.id();
auto dec = it.key();
// NOTE(review): length() is unsigned, so this assertion always holds.
assert(0 <= dec.length());
auto dec2 = trie.access(id);
assert(dec == dec2);
++num_results;
}
assert(num_results == trie.num_keys());
}
}
// Round-trips `trie` through the file "index": writes it, checks that the
// file size equals size_in_bytes(), reads it back and compares the basic
// properties of the original and the restored trie.
//
// All streams are opened in binary mode. The index is raw binary data and
// the test compares exact byte counts, so text-mode newline translation (as
// done on some platforms) must not be applied.
template<bool Fast>
void test_io(const Trie<Fast>& trie) {
  std::cerr << "File I/O -> write() and read()\n";

  const char* file_name = "index";
  {
    std::ofstream ofs{file_name, std::ios::binary};
    trie.write(ofs);
  }
  {
    std::ifstream ifs{file_name, std::ios::binary};
    auto size = static_cast<size_t>(ifs.seekg(0, std::ios::end).tellg());
    assert(size == trie.size_in_bytes());
  }

  Trie<Fast> _trie;
  {
    std::ifstream ifs{file_name, std::ios::binary};
    _trie = Trie<Fast>(ifs);
  }

  assert(trie.num_keys() == _trie.num_keys());
  assert(trie.bin_mode() == _trie.bin_mode());
  assert(trie.alphabet_size() == _trie.alphabet_size());
  assert(trie.num_nodes() == _trie.num_nodes());
  assert(trie.num_used_nodes() == _trie.num_used_nodes());
  assert(trie.num_free_nodes() == _trie.num_free_nodes());
  assert(trie.size_in_bytes() == _trie.size_in_bytes());
}
// Runs the full test sequence (build, basic, prefix, predictive, I/O) for
// Trie<Fast>, once in text mode (i == 0) and once in binary mode (i == 1).
template<bool Fast>
void test_trie(const std::vector<std::string_view>& strings,
const std::vector<std::string_view>& others) {
for (int i = 0; i < 2; ++i) {
std::cerr << "** " << (i % 2 ? "Binary" : "Text") << " Mode **\n";
std::cerr << "Testing xcdat::Trie<" << (Fast ? "true" : "false") << ">\n";
auto trie = test_build<Fast>(strings, i % 2 != 0);
test_basic_operations(trie, strings, others);
test_prefix_operations(trie, strings, others);
test_predictive_operations(trie, strings, others);
test_io(trie);
std::cerr << "--> No problem (☝ ՞ਊ ՞)☝" << std::endl << std::endl;
}
}
} // namespace
// Generates a random key set plus a disjoint set of non-keys and runs the
// trie tests for both Trie<false> and Trie<true>.
int main() {
  // The views refer to the strings in the buffers, which stay alive until
  // the end of main.
  const auto to_views = [](const std::vector<std::string>& buffer) {
    std::vector<std::string_view> views(buffer.size());
    for (size_t i = 0; i < views.size(); ++i) {
      views[i] = std::string_view{buffer[i]};
    }
    return views;
  };

  auto keys_buffer = make_keys();
  auto others_buffer = make_other_keys(keys_buffer);

  const auto keys = to_views(keys_buffer);
  const auto others = to_views(others_buffer);

  test_trie<false>(keys, others);
  test_trie<true>(keys, others);
  return 0;
}

View file

@ -1,102 +0,0 @@
#undef NDEBUG
#include <cassert>
#include <iostream>
#include <random>
#include "xcdat/BitVector.hpp"
#include "xcdat/FitVector.hpp"
using namespace xcdat;
namespace {
constexpr size_t SIZE = 1U << 10;
// Checks Vector<int>: constructing it from a std::vector must leave the
// source empty while preserving the elements, and swap() must move the
// contents into the other Vector and leave this one empty.
void test_vector() {
std::vector<int> orig_vec(SIZE);
{
std::random_device rnd;
for (size_t i = 0; i < SIZE; ++i) {
orig_vec[i] = rnd();
}
}
auto copied_vec = orig_vec; // copy
Vector<int> vec(copied_vec);
assert(copied_vec.empty());
for (size_t i = 0; i < SIZE; ++i) {
assert(orig_vec[i] == vec[i]);
}
Vector<int> swapped_vec;
vec.swap(swapped_vec);
assert(vec.is_empty());
for (size_t i = 0; i < SIZE; ++i) {
assert(orig_vec[i] == swapped_vec[i]);
}
}
// Checks BitVector built with rank and select support against a random
// std::vector<bool>: bit access, rank/select at every set bit, and the
// totals reported by num_1s() and num_0s().
void test_bit_vector() {
std::vector<bool> orig_bit_vector;
{
std::random_device rnd;
for (size_t i = 0; i < SIZE; ++i) {
orig_bit_vector.push_back(rnd() % 2 == 0);
}
}
BitVector bit_vector;
{
BitVectorBuilder builder;
for (size_t i = 0; i < SIZE; ++i) {
builder.push_back(orig_bit_vector[i]);
}
bit_vector = BitVector(builder, true, true);
}
assert(bit_vector.size() == SIZE);
// `sum` counts the set bits seen so far, so at a set bit i it equals
// rank(i) and select(sum) must return i.
id_type sum = 0;
for (id_type i = 0; i < SIZE; ++i) {
assert(bit_vector[i] == orig_bit_vector[i]);
if (bit_vector[i]) {
assert(sum == bit_vector.rank(i));
assert(i == bit_vector.select(sum));
++sum;
}
}
assert(bit_vector.num_1s() == sum);
assert(bit_vector.num_0s() == SIZE - sum);
}
// Checks FitVector: packing random values limited to 16 bits must preserve
// both the element count and every element.
void test_small_vector() {
std::vector<id_type> orig_vector;
{
std::random_device rnd;
for (size_t i = 0; i < SIZE; ++i) {
orig_vector.push_back(rnd() & UINT16_MAX);
}
}
FitVector small_vector(orig_vector);
assert(orig_vector.size() == small_vector.size());
for (size_t i = 0; i < SIZE; ++i) {
assert(orig_vector[i] == small_vector[i]);
}
}
} // namespace
// Runs the Vector, BitVector and FitVector tests; a failed assert aborts.
int main() {
test_vector();
test_bit_vector();
test_small_vector();
return 0;
}

View file

@ -1,6 +0,0 @@
# Command-line tool. The target is named xcdat-exe, but the produced binary
# is called "xcdat".
add_executable(xcdat-exe xcdat.cpp)
set_target_properties(xcdat-exe PROPERTIES OUTPUT_NAME xcdat)
target_link_libraries(xcdat-exe xcdat)
# Install the binary into <prefix>/bin.
install(TARGETS xcdat-exe RUNTIME DESTINATION bin)

View file

@ -1,322 +0,0 @@
#include <chrono>
#include <iostream>
#include <random>
#include "xcdat.hpp"
using namespace xcdat;
namespace {
constexpr int RUNS = 10;
// Measures the time elapsed since construction, using
// std::chrono::high_resolution_clock.
class StopWatch {
public:
  using hrc = std::chrono::high_resolution_clock;

  StopWatch() : tp_{hrc::now()} {}

  // Elapsed time in seconds.
  double sec() const {
    return elapsed_<std::ratio<1>>();
  }
  // Elapsed time in milliseconds.
  double milli_sec() const {
    return elapsed_<std::milli>();
  }
  // Elapsed time in microseconds.
  double micro_sec() const {
    return elapsed_<std::micro>();
  }

private:
  // Time since construction, expressed in units of `Period`.
  template <class Period>
  double elapsed_() const {
    const auto delta = hrc::now() - tp_;
    return std::chrono::duration<double, Period>(delta).count();
  }

  hrc::time_point tp_;
};
/**
 * Appends every line of a text file to `keys`.
 *
 * @param file_name Path of the file to read.
 * @param keys      Receives one entry per line, appended in file order.
 * @return Total size of the lines read, counting one extra byte per line for
 *         its terminator. Returns 0 when the file cannot be opened (and also
 *         when it contains no lines, so callers cannot tell the two apart).
 */
size_t read_keys(const char* file_name, std::vector<std::string>& keys) {
    std::ifstream input{file_name};
    size_t total_bytes = 0;

    if (input) {
        std::string line;
        while (std::getline(input, line)) {
            total_bytes += line.length() + 1;  // +1 for the terminator
            keys.push_back(line);
        }
    }
    return total_bytes;
}
/**
 * Builds non-owning views over a list of strings.
 *
 * @param keys Strings to view; they must outlive the returned views, which
 *             point directly into each string's storage.
 * @return One std::string_view per element of `keys`, in the same order.
 */
std::vector<std::string_view> extract_views(const std::vector<std::string>& keys) {
    // Range construction converts each std::string to a view in place, so no
    // default-constructed views have to be created and then overwritten.
    return std::vector<std::string_view>(keys.begin(), keys.end());
}
/**
 * Writes the command-line help for the build, query and bench sub-commands.
 *
 * @param os Stream that receives the help text; it is flushed before return.
 */
void show_usage(std::ostream& os) {
    os << "xcdat build <type> <key> <dict>\n";
    os << "\t<type>\t1: DACs, 2: FDACs\n";
    os << "\t<key> \tInput file name of a set of keys (must be sorted)\n";
    os << "\t<dict>\tOutput file name of the dictionary (optional)\n";
    // The suffixes named here are the ones build() appends to <key> when
    // <dict> is omitted: ".dac" and ".fdac".
    os << "\t \tIf omitted, <key>.dac or <key>.fdac is output\n";
    os << "xcdat query <type> <dict> <limit>\n";
    os << "\t<type> \t1: DACs, 2: FDACs\n";
    os << "\t<dict> \tInput file name of the dictionary\n";
    os << "\t<limit>\tLimit of #results (optional, default=10)\n";
    os << "xcdat bench <type> <dict> <key>\n";
    os << "\t<type>\t1: DACs, 2: FDACs\n";
    os << "\t<dict>\tInput file name of the dictionary\n";
    os << "\t<key> \tInput file name of keys for benchmark\n";
    os.flush();
}
/**
 * Implements `xcdat build`: reads a key file, builds a Trie<Fast> from it,
 * prints build statistics and writes the dictionary to a file.
 *
 * @tparam Fast Selects the trie variant (command-line type 2 when true,
 *              type 1 when false).
 * @param args  Command-line arguments without the program name:
 *              args[0] = "build", args[1] = type, args[2] = key file,
 *              args[3] = output file (optional).
 * @return 0 on success, 1 on a usage, input, build or output error.
 */
template <bool Fast>
int build(std::vector<std::string>& args) {
    if (args.size() != 3 && args.size() != 4) {
        show_usage(std::cerr);
        return 1;
    }

    std::vector<std::string> keys_buffer;
    const auto raw_size = read_keys(args[2].c_str(), keys_buffer);
    if (raw_size == 0) {
        // read_keys() also returns 0 for a file without any line, so an empty
        // key file is reported the same way as an unreadable one.
        std::cerr << "open error : " << args[2] << std::endl;
        return 1;
    }
    // The views point into keys_buffer, which stays alive for the whole build.
    auto keys = extract_views(keys_buffer);

    Trie<Fast> trie;
    try {
        StopWatch sw;
        trie = TrieBuilder::build<Fast>(keys);
        std::cout << "constr. time:\t" << sw.sec() << " sec" << std::endl;
    } catch (const xcdat::TrieBuilder::Exception& ex) {
        std::cerr << ex.what() << std::endl;
        return 1;
    }

    std::cout << "cmpr. ratio:\t"
              << static_cast<double>(trie.size_in_bytes()) / raw_size
              << " over the raw size" << std::endl;
    std::cout << std::endl;
    trie.show_stat(std::cout);
    std::cout << std::endl;

    // Default output name: the key file name plus a variant-specific suffix.
    std::string out_name;
    if (args.size() == 4) {
        out_name = args[3];
    } else {
        out_name = args[2] + (Fast ? ".fdac" : ".dac");
    }

    // Binary mode keeps the serialized bytes intact on platforms whose
    // text-mode streams translate line endings.
    // NOTE(review): assumes Trie::write emits raw binary data -- confirm.
    std::ofstream ofs{out_name, std::ios::binary};
    if (!ofs) {
        std::cerr << "open error : " << out_name << std::endl;
        return 1;
    }
    trie.write(ofs);
    // Flush before checking so a failed write (e.g. a full disk) is reported
    // instead of being announced as a successful output.
    ofs.flush();
    if (!ofs) {
        std::cerr << "write error : " << out_name << std::endl;
        return 1;
    }

    std::cout << "output -> " << out_name << std::endl;
    return 0;
}
/**
 * Implements `xcdat query`: loads a dictionary and answers queries read from
 * standard input, one per line, until an empty line or end of input.
 *
 * For every query it prints the exact lookup result, then the common-prefix
 * results and the predictive results, each listing limited to `limit` lines.
 *
 * @tparam Fast Selects the trie variant (command-line type 2 when true,
 *              type 1 when false).
 * @param args  Command-line arguments without the program name:
 *              args[0] = "query", args[1] = type, args[2] = dictionary file,
 *              args[3] = result limit (optional, default 10).
 * @return 0 on normal termination, 1 on a usage or input error.
 */
template <bool Fast>
int query(std::vector<std::string>& args) {
    if (args.size() != 3 && args.size() != 4) {
        show_usage(std::cerr);
        return 1;
    }

    Trie<Fast> trie;
    {
        // Binary mode keeps the serialized bytes intact on platforms whose
        // text-mode streams translate line endings.
        // NOTE(review): assumes the dictionary is raw binary data -- confirm.
        std::ifstream ifs(args[2], std::ios::binary);
        if (!ifs) {
            std::cerr << "open error : " << args[2] << std::endl;
            return 1;
        }
        trie = Trie<Fast>(ifs);
    }

    size_t limit = 10;
    if (args.size() == 4) {
        // std::stoull throws when the argument is not a number.
        limit = std::stoull(args.back());
    }

    // Prints up to `limit` results of an iterator, then counts the remaining
    // ones so that the total number of matches can be reported.
    const auto report = [limit](auto&& it) {
        size_t shown = 0;
        while (shown < limit && it.next()) {
            std::cout << it.id() << '\t' << it.key() << std::endl;
            ++shown;
        }
        size_t hidden = 0;
        while (it.next()) {
            ++hidden;
        }
        if (hidden != 0) {
            std::cout << "and more..." << std::endl;
        }
        std::cout << shown + hidden << " found" << std::endl;
    };

    std::string query;
    while (true) {
        std::cout << "> " << std::flush;
        // A failed read must end the loop as well: once the stream is at end
        // of input, getline leaves `query` untouched, so testing only for an
        // empty string would reprocess the last query forever when the input
        // does not end with a newline.
        if (!std::getline(std::cin, query) || query.empty()) {
            break;
        }

        std::cout << "Lookup" << std::endl;
        const auto id = trie.lookup(query);
        if (id == Trie<Fast>::NOT_FOUND) {
            std::cout << "not found" << std::endl;
        } else {
            std::cout << id << '\t' << query << std::endl;
        }

        std::cout << "Common Prefix Lookup" << std::endl;
        report(trie.make_prefix_iterator(query));

        std::cout << "Predictive Lookup" << std::endl;
        report(trie.make_predictive_iterator(query));
    }
    return 0;
}
/**
 * Implements `xcdat bench`: loads a dictionary and a key file, then times
 * lookup (key -> ID) and access (ID -> key) over RUNS repetitions.
 *
 * @tparam Fast Selects the trie variant (command-line type 2 when true,
 *              type 1 when false).
 * @param args  Command-line arguments without the program name:
 *              args[0] = "bench", args[1] = type, args[2] = dictionary file,
 *              args[3] = key file.
 * @return 0 on success, 1 on a usage or input error, on a key missing from
 *         the dictionary, or on an inconsistent lookup/access result.
 */
template <bool Fast>
int bench(std::vector<std::string>& args) {
    if (args.size() != 4) {
        show_usage(std::cerr);
        return 1;
    }

    Trie<Fast> trie;
    {
        // Binary mode keeps the serialized bytes intact on platforms whose
        // text-mode streams translate line endings.
        // NOTE(review): assumes the dictionary is raw binary data -- confirm.
        std::ifstream ifs(args[2], std::ios::binary);
        if (!ifs) {
            std::cerr << "open error : " << args[2] << std::endl;
            return 1;
        }
        trie = Trie<Fast>(ifs);
    }

    std::vector<std::string> keys_buffer;
    if (read_keys(args[3].c_str(), keys_buffer) == 0) {
        std::cerr << "open error : " << args[3] << std::endl;
        return 1;
    }
    // read_keys() returned non-zero, so at least one key was read and the
    // per-key averages below never divide by zero.
    auto keys = extract_views(keys_buffer);
    std::vector<id_type> ids(keys.size());

    // The warm-up pass also records the expected ID of every key.
    std::cout << "Warm up" << std::endl;
    for (size_t i = 0; i < keys.size(); ++i) {
        ids[i] = trie.lookup(keys[i]);
        if (ids[i] == Trie<Fast>::NOT_FOUND) {
            std::cerr << "A non-registered key is included, "
                      << keys_buffer[i] << std::endl;
            return 1;
        }
    }

    {
        std::cout << "Lookup benchmark on " << RUNS << " runs" << std::endl;
        StopWatch sw;
        // The counter has the same type as RUNS to avoid a signed/unsigned
        // comparison.
        for (int r = 0; r < RUNS; ++r) {
            for (size_t i = 0; i < keys.size(); ++i) {
                if (trie.lookup(keys[i]) != ids[i]) {
                    std::cerr << "Critical lookup error ʅ( ՞ਊ՞)ʃ" << std::endl;
                    return 1;
                }
            }
        }
        std::cout << sw.micro_sec() / RUNS / keys.size()
                  << " us per str" << std::endl;
    }

    {
        std::cout << "Access benchmark on " << RUNS << " runs" << std::endl;
        StopWatch sw;
        for (int r = 0; r < RUNS; ++r) {
            for (auto id : ids) {
                auto dec = trie.access(id);
                if (dec.empty()) {
                    std::cerr << "Critical access error ʅ( ՞ਊ՞)ʃ" << std::endl;
                    return 1;
                }
            }
        }
        std::cout << sw.micro_sec() / RUNS / ids.size()
                  << " us per ID" << std::endl;
    }
    return 0;
}
} // namespace
// Entry point of the command-line tool: `xcdat <command> <type> ...`.
// Picks the trie variant from <type> and dispatches to build/query/bench.
// Returns the sub-command's exit code, or 1 after printing the usage text
// when the arguments are not understood.
int main(int argc, const char* argv[]) {
    // A command and a type are always required.
    if (argc < 3) {
        show_usage(std::cerr);
        return 1;
    }

    // Drop the program name: args[0] is the command, args[1] the type.
    std::vector<std::string> args(argv + 1, argv + argc);

    // Only the first character of the type argument is examined.
    const char type = args[1][0];
    if (type != '1' && type != '2') {
        show_usage(std::cerr);
        return 1;
    }
    const bool is_fast = (type == '2');

    const std::string& command = args[0];
    if (command == "build") {
        return is_fast ? build<true>(args) : build<false>(args);
    }
    if (command == "query") {
        return is_fast ? query<true>(args) : query<false>(args);
    }
    if (command == "bench") {
        return is_fast ? bench<true>(args) : bench<false>(args);
    }

    show_usage(std::cerr);
    return 1;
}

View file

@ -1,7 +0,0 @@
// Build-configuration header template.
// NOTE(review): presumably expanded by CMake's configure_file(), which turns
// each cmakedefine line below into a define when the variable of that name is
// set and into a commented-out undef otherwise -- confirm in the build scripts.
#ifndef XCDAT_CONFIG_HPP
#define XCDAT_CONFIG_HPP
// NOTE(review): presumably set when building for a 64-bit target -- confirm.
#cmakedefine XCDAT_X64
// NOTE(review): presumably enables use of a hardware popcount instruction --
// confirm.
#cmakedefine XCDAT_USE_POPCNT
#endif // XCDAT_CONFIG_HPP