Merge branch 'development'
# Conflicts: # CMakeLists.txt # LICENSE # README.md # include/xcdat.hpp # sample/CMakeLists.txt # sample/sample.cpp
This commit is contained in:
commit
0522882198
113
.clang-format
Normal file
113
.clang-format
Normal file
|
@ -0,0 +1,113 @@
|
|||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -2
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
||||
AllowShortIfStatementsOnASingleLine: true
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 120
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '^<ext/.*\.h>'
|
||||
Priority: 2
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IndentCaseLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: false
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Latest
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
...
|
||||
|
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -29,8 +29,9 @@
|
|||
*.app
|
||||
|
||||
# My Definition
|
||||
build/
|
||||
build*/
|
||||
cmake-build-debug/
|
||||
.idea/
|
||||
.DS_Store
|
||||
include/xcdat/xcdat_config.hpp
|
||||
.vscode/
|
48
CMakeLists.txt
Normal file
48
CMakeLists.txt
Normal file
|
@ -0,0 +1,48 @@
|
|||
# Top-level build script for Xcdat: configures compiler flags, builds the
# sample, tools and tests, and installs the header-only library.
cmake_minimum_required(VERSION 3.0)
project(xcdat VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)

# Fall back to an optimized build when no build type was requested.
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif ()

# Normalize compiler detection into two flags used by the version check below.
if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
  set(CMAKE_COMPILER_IS_CLANGXX 1)
endif ()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  set(CMAKE_COMPILER_IS_GNUCXX 1)
endif ()

# C++17 compiler check
# The version variable is passed by name (not as ${...}) so that if() performs
# the dereference itself; an empty value then cannot produce a malformed
# condition.
if ((CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0) OR (CMAKE_COMPILER_IS_CLANGXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0))
  message(FATAL_ERROR "Your C++ compiler does not support C++17. Please install g++ 7.0 (or greater) or clang 4.0 (or greater)")
else ()
  message(STATUS "Compiler is recent enough to support C++17.")
endif ()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1z -pthread -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -O3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer -O0 -g -DDEBUG")

message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}")
message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")

include_directories(include)

add_subdirectory(sample)
add_subdirectory(tools)

enable_testing()
add_subdirectory(tests)

# Copy the key list next to the test binaries. CMAKE_CURRENT_SOURCE_DIR keeps
# the path correct when this project is pulled in via add_subdirectory(), where
# CMAKE_SOURCE_DIR would refer to the parent project's root instead.
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tests)

# Install the library
file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
install(FILES include/xcdat.hpp DESTINATION include)
install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)
|
21
LICENSE
Normal file
21
LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2021 Shunsuke Kanda
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
520
README.md
Normal file
520
README.md
Normal file
|
@ -0,0 +1,520 @@
|
|||
# Xcdat: Fast compressed trie dictionary library
|
||||
|
||||
**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on an improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available [here](https://kampersanda.github.io/pdf/KAIS2017.pdf).
|
||||
|
||||
## Table of contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Build instructions](#build-instructions)
|
||||
- [Command line tools](#command-line-tools)
|
||||
- [Sample usage](#sample-usage)
|
||||
- [API](#api)
|
||||
- [Performance](#performance)
|
||||
- [Licensing](#licensing)
|
||||
- [Todo](#todo)
|
||||
- [References](#references)
|
||||
|
||||
## Features
|
||||
|
||||
- **Compressed string dictionary.** Xcdat implements a (static) *compressed string dictionary* that stores a set of strings (or keywords) in a compressed space while supporting several search operations [1,2]. For example, Xcdat can store an entire set of English Wikipedia titles at half the size of the raw data.
|
||||
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches.
|
||||
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes with strings to reduce random access and to improve locality of reference.
|
||||
- **Dictionary encoding.** Xcdat maps `N` distinct keywords into unique IDs from `[0,N-1]`, and supports the two symmetric operations: `lookup` returns the ID corresponding to a given keyword; `decode` returns the keyword associated with a given ID. The mapping is so-called *dictionary encoding* (or *domain encoding*) and is fundamental in many DB applications as described by Martínez-Prieto et al. [1] or Müller et al. [5].
|
||||
- **Prefix search operations.** Xcdat supports prefix search operations realized by trie search algorithms: `prefix_search` returns all the keywords contained as prefixes of a given string; `predictive_search` returns all the keywords starting with a given string. These will be useful in many NLP applications such as auto completions [6], stemmed searches [7], or input method editors [8].
|
||||
- **64-bit support.** As mentioned before, since the double array is a pointer-based data structure, most double-array libraries use 32-bit pointers to reduce memory consumption, resulting in limiting the scale of the input dataset. On the other hand, the XCDA method allows Xcdat to represent 64-bit pointers without sacrificing memory efficiency.
|
||||
- **Binary key support.** In normal mode, Xcdat will use the `\0` character as an end marker for each keyword. However, if the dataset includes `\0` characters, it will use bit flags instead of end markers, allowing the dataset to consist of binary keywords.
|
||||
- **Memory mapping.** Xcdat supports *memory mapping*, allowing data to be deserialized quickly without loading it into memory. Of course, deserialization by the loading is also supported.
|
||||
- **Header only.** The library consists only of header files, and you can easily install it.
|
||||
|
||||
## Build instructions
|
||||
|
||||
You can download, compile, and install Xcdat with the following commands.
|
||||
|
||||
```
|
||||
$ git clone https://github.com/kampersanda/xcdat.git
|
||||
$ cd xcdat
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ..
|
||||
$ make -j
|
||||
$ make install
|
||||
```
|
||||
|
||||
Or, since this library consists only of header files, you can simply add the directory `include` to your compiler's include path.
|
||||
|
||||
### Requirements
|
||||
|
||||
You need to install a modern C++17 ready compiler such as `g++ >= 7.0` or `clang >= 4.0`. For the build system, you need to install `CMake >= 3.0` to compile the library.
|
||||
|
||||
The library assumes a 64-bit operating system. The code has been tested only on Mac OS X and Linux. That is, this library supports only UNIX-compatible operating systems.
|
||||
|
||||
## Command line tools
|
||||
|
||||
Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`.
|
||||
|
||||
### `xcdat_build`
|
||||
|
||||
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
|
||||
|
||||
```
|
||||
$ xcdat_build enwiki-titles.txt idx.bin
|
||||
Number of keys: 15955763
|
||||
Number of trie nodes: 36441058
|
||||
Number of DA units: 36520704
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
```
|
||||
|
||||
### `xcdat_lookup`
|
||||
|
||||
It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
|
||||
|
||||
```
|
||||
$ xcdat_lookup idx.bin
|
||||
Algorithm
|
||||
1255938 Algorithm
|
||||
Double_Array
|
||||
-1 Double_Array
|
||||
```
|
||||
|
||||
### `xcdat_decode`
|
||||
|
||||
It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
|
||||
|
||||
```
|
||||
$ xcdat_decode idx.bin
|
||||
1255938
|
||||
1255938 Algorithm
|
||||
```
|
||||
|
||||
### `xcdat_prefix_search`
|
||||
|
||||
It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
|
||||
|
||||
```
|
||||
$ xcdat_prefix_search idx.bin
|
||||
Algorithmic
|
||||
6 found
|
||||
57 A
|
||||
798460 Al
|
||||
1138004 Alg
|
||||
1253024 Algo
|
||||
1255938 Algorithm
|
||||
1255931 Algorithmic
|
||||
```
|
||||
|
||||
### `xcdat_predictive_search`
|
||||
|
||||
It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
|
||||
|
||||
```
|
||||
$ xcdat_predictive_search idx.bin -n 3
|
||||
Algorithm
|
||||
263 found
|
||||
1255938 Algorithm
|
||||
1255944 Algorithm's_optimality
|
||||
1255972 Algorithm_(C++)
|
||||
```
|
||||
|
||||
### `xcdat_enumerate`
|
||||
|
||||
It prints all the keywords stored in a given dictionary.
|
||||
|
||||
```
|
||||
$ xcdat_enumerate idx.bin | head -3
|
||||
0 !
|
||||
107 !!
|
||||
138 !!!
|
||||
```
|
||||
|
||||
### `xcdat_benchmark`
|
||||
|
||||
It measures the performance of the available trie types for a given dataset. To perform search operations, it randomly samples `n` queries from the dataset, where `n` is one of the parameters.
|
||||
|
||||
```
|
||||
$ xcdat_benchmark enwiki-titles.txt
|
||||
** xcdat::trie_7_type **
|
||||
Number of keys: 15955763
|
||||
Memory usage in bytes: 1.70618e+08
|
||||
Memory usage in MiB: 162.714
|
||||
Construction time in seconds: 12.907
|
||||
Lookup time in microsec/query: 0.4674
|
||||
Decode time in microsec/query: 0.8722
|
||||
** xcdat::trie_8_type **
|
||||
Number of keys: 15955763
|
||||
Memory usage in bytes: 1.64104e+08
|
||||
Memory usage in MiB: 156.502
|
||||
Construction time in seconds: 13.442
|
||||
Lookup time in microsec/query: 0.7593
|
||||
Decode time in microsec/query: 1.2341
|
||||
```
|
||||
|
||||
## Sample usage
|
||||
|
||||
`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file, which is installed together with Xcdat by `make install`.
|
||||
|
||||
```c++
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
// Dataset
|
||||
std::vector<std::string> keys = {
|
||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
};
|
||||
|
||||
// The input keys must be sorted and unique (although they already are in this case).
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
// The trie dictionary type
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
|
||||
// The dictionary filename
|
||||
const char* tmp_filename = "dic.bin";
|
||||
|
||||
// Build and save the trie dictionary.
|
||||
{
|
||||
const trie_type trie(keys);
|
||||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
|
||||
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
|
||||
|
||||
// Lookup the ID for a query key.
|
||||
{
|
||||
const auto id = trie.lookup("Mac_Pro");
|
||||
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
{
|
||||
const auto id = trie.lookup("Google_Pixel");
|
||||
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
|
||||
// Decode the key for a query ID.
|
||||
{
|
||||
const auto dec = trie.decode(4);
|
||||
std::cout << "Decode(4) = " << dec << std::endl;
|
||||
}
|
||||
|
||||
// Common prefix search
|
||||
{
|
||||
std::cout << "CommonPrefixSearch(MacBook_Air) = {" << std::endl;
|
||||
auto itr = trie.make_prefix_iterator("MacBook_Air");
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
// Predictive search
|
||||
{
|
||||
std::cout << "PredictiveSearch(Mac) = {" << std::endl;
|
||||
auto itr = trie.make_predictive_iterator("Mac");
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
// Enumerate all the keys (in lex order).
|
||||
{
|
||||
std::cout << "Enumerate() = {" << std::endl;
|
||||
auto itr = trie.make_enumerative_iterator();
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
std::remove(tmp_filename);
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
The output will be
|
||||
|
||||
```
|
||||
Number of keys: 12
|
||||
Number of trie nodes: 28
|
||||
Number of DA units: 256
|
||||
Memory usage in bytes: 1766
|
||||
Lookup(Mac_Pro) = 7
|
||||
Lookup(Google_Pixel) = 18446744073709551615
|
||||
Decode(4) = MacBook_Air
|
||||
CommonPrefixSearch(MacBook_Air) = {
|
||||
(Mac, 1),
|
||||
(MacBook, 2),
|
||||
(MacBook_Air, 4),
|
||||
}
|
||||
PredictiveSearch(Mac) = {
|
||||
(Mac, 1),
|
||||
(MacBook, 2),
|
||||
(MacBook_Air, 4),
|
||||
(MacBook_Pro, 11),
|
||||
(Mac_Mini, 5),
|
||||
(Mac_Pro, 7),
|
||||
}
|
||||
Enumerate() = {
|
||||
(AirPods, 0),
|
||||
(AirTag, 3),
|
||||
(Mac, 1),
|
||||
(MacBook, 2),
|
||||
(MacBook_Air, 4),
|
||||
(MacBook_Pro, 11),
|
||||
(Mac_Mini, 5),
|
||||
(Mac_Pro, 7),
|
||||
(iMac, 10),
|
||||
(iPad, 6),
|
||||
(iPhone, 8),
|
||||
(iPhone_SE, 9),
|
||||
}
|
||||
```
|
||||
|
||||
## API
|
||||
|
||||
Xcdat can be used by including `xcdat.hpp`.
|
||||
|
||||
### Trie dictionary types
|
||||
|
||||
Two dictionary types are defined.
|
||||
|
||||
- `xcdat::trie_8_type` is the trie dictionary using standard DACs [9] using 8-bit integers for elements.
|
||||
- `xcdat::trie_7_type` is the trie dictionary using pointer-based DACs [2] using 7-bit integers for elements.
|
||||
|
||||
### Trie dictionary class
|
||||
|
||||
The trie dictionary has the following members.
|
||||
|
||||
```c++
|
||||
//! A compressed string dictionary based on an improved double-array trie.
|
||||
//! 'BcVector' is the data type of Base and Check vectors.
|
||||
template <class BcVector>
|
||||
class trie {
|
||||
public:
|
||||
//! Default constructor
|
||||
trie() = default;
|
||||
|
||||
//! Default destructor
|
||||
virtual ~trie() = default;
|
||||
|
||||
//! Copy constructor (deleted)
|
||||
trie(const trie&) = delete;
|
||||
|
||||
//! Copy assignment operator (deleted)
|
||||
trie& operator=(const trie&) = delete;
|
||||
|
||||
//! Move constructor
|
||||
trie(trie&&) noexcept = default;
|
||||
|
||||
//! Move assignment operator
|
||||
trie& operator=(trie&&) noexcept = default;
|
||||
|
||||
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||
//!
|
||||
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
|
||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||
//!
|
||||
//! The type 'Strings' and 'Strings::value_type' should be a random iterable container such as std::vector.
|
||||
//! Precisely, they should support the following operations:
|
||||
//! - size() returns the container size.
|
||||
//! - operator[](i) accesses the i-th element.
|
||||
//! - begin() returns the iterator to the beginning.
|
||||
//! - end() returns the iterator to the end.
|
||||
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
|
||||
template <class Strings>
|
||||
trie(const Strings& keys, bool bin_mode = false);
|
||||
|
||||
//! Check if the binary mode is enabled.
|
||||
bool bin_mode() const;
|
||||
|
||||
//! Get the number of stored keywords.
|
||||
std::uint64_t num_keys() const;
|
||||
|
||||
//! Get the alphabet size.
|
||||
std::uint64_t alphabet_size() const;
|
||||
|
||||
//! Get the maximum length of keywords.
|
||||
std::uint64_t max_length() const;
|
||||
|
||||
//! Get the number of trie nodes.
|
||||
std::uint64_t num_nodes() const;
|
||||
|
||||
//! Get the number of DA units.
|
||||
std::uint64_t num_units() const;
|
||||
|
||||
//! Get the number of unused DA units.
|
||||
std::uint64_t num_free_units() const;
|
||||
|
||||
//! Get the length of TAIL vector.
|
||||
std::uint64_t tail_length() const;
|
||||
|
||||
//! Lookup the ID of the keyword.
|
||||
std::optional<std::uint64_t> lookup(std::string_view key) const;
|
||||
|
||||
//! Decode the keyword associated with the ID.
|
||||
std::string decode(std::uint64_t id) const;
|
||||
|
||||
//! Decode the keyword associated with the ID and store it in 'decoded'.
|
||||
//! It can avoid reallocation of memory to store the result.
|
||||
void decode(std::uint64_t id, std::string& decoded) const;
|
||||
|
||||
//! An iterator class for common prefix search.
|
||||
//! It enumerates all the keywords contained as prefixes of a given string.
|
||||
//! It should be instantiated via the function 'make_prefix_iterator'.
|
||||
class prefix_iterator {
|
||||
public:
|
||||
prefix_iterator() = default;
|
||||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
bool next();
|
||||
|
||||
//! Get the result ID.
|
||||
std::uint64_t id() const;
|
||||
|
||||
//! Get the result keyword.
|
||||
std::string decoded() const;
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
//! Make the common prefix searcher for the given keyword.
|
||||
prefix_iterator make_prefix_iterator(std::string_view key) const;
|
||||
|
||||
//! Perform common prefix search for the keyword.
|
||||
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! An iterator class for predictive search.
|
||||
//! It enumerates all the keywords starting with a given string.
|
||||
//! It should be instantiated via the function 'make_predictive_iterator'.
|
||||
class predictive_iterator {
|
||||
public:
|
||||
predictive_iterator() = default;
|
||||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
bool next();
|
||||
|
||||
//! Get the result ID.
|
||||
std::uint64_t id() const;
|
||||
|
||||
//! Get the result keyword.
|
||||
std::string decoded() const;
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
std::string_view decoded_view() const;
|
||||
};
|
||||
|
||||
//! Make the predictive searcher for the keyword.
|
||||
predictive_iterator make_predictive_iterator(std::string_view key) const;
|
||||
|
||||
//! Perform predictive search for the keyword.
|
||||
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! An iterator class for enumeration.
|
||||
//! It enumerates all the keywords stored in the trie.
|
||||
//! It should be instantiated via the function 'make_enumerative_iterator'.
|
||||
using enumerative_iterator = predictive_iterator;
|
||||
|
||||
//! Make the enumerator for all the stored keywords.
|
||||
enumerative_iterator make_enumerative_iterator() const;
|
||||
|
||||
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
|
||||
|
||||
//! Visit the members (commonly used for I/O).
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor);
|
||||
};
|
||||
```
|
||||
|
||||
### I/O handlers
|
||||
|
||||
`xcdat.hpp` provides some functions for handling I/O operations.
|
||||
|
||||
```c++
|
||||
//! Set the continuous memory block to a new trie instance.
|
||||
template <class Trie>
|
||||
Trie mmap(const char* address);
|
||||
|
||||
//! Load the trie dictionary from the file.
|
||||
template <class Trie>
|
||||
Trie load(std::string_view filepath);
|
||||
|
||||
//! Save the trie dictionary to the file and return the file size in bytes.
|
||||
template <class Trie>
|
||||
std::uint64_t save(const Trie& idx, std::string_view filepath);
|
||||
|
||||
//! Get the dictionary size in bytes.
|
||||
template <class Trie>
|
||||
std::uint64_t memory_in_bytes(const Trie& idx);
|
||||
|
||||
//! Get the flag indicating the trie type, embedded by the function 'save'.
|
||||
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||
std::uint32_t get_flag(std::string_view filepath);
|
||||
|
||||
//! Load the keywords from the file.
|
||||
std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n');
|
||||
```
|
||||
|
||||
## Performance
|
||||
|
||||
To be added...
|
||||
|
||||
## Licensing
|
||||
|
||||
This library is free software provided under the MIT License.
|
||||
|
||||
If you use the library in academic settings, please cite the following paper.
|
||||
|
||||
```
|
||||
@article{kanda2017compressed,
|
||||
title={Compressed double-array tries for string dictionaries supporting fast lookup},
|
||||
author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},
|
||||
journal={Knowledge and Information Systems (KAIS)},
|
||||
volume={51},
|
||||
number={3},
|
||||
pages={1023--1042},
|
||||
year={2017},
|
||||
publisher={Springer}
|
||||
}
|
||||
```
|
||||
|
||||
## Todo
|
||||
|
||||
- Support other language bindings.
|
||||
- Add SIMD-ization.
|
||||
|
||||
## References
|
||||
|
||||
1. M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016
|
||||
2. S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3): 1023–1042, 2017.
|
||||
3. J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.
|
||||
4. S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.
|
||||
5. Müller, Ingo, Cornelius Ratsch, and Franz Faerber. Adaptive string dictionary compression in in-memory column-store database systems. In EDBT, pp. 283–294, 2014.
|
||||
6. Gog, Simon, Giulio Ermanno Pibiri, and Rossano Venturini. Efficient and effective query auto-completion. In SIGIR, pp. 2271–2280, 2020.
|
||||
7. Ricardo Baeza-Yates, and Berthier Ribeiro-Neto. Modern Information Retrieval. 2nd ed. Addison Wesley, Boston, MA, USA, 2011.
|
||||
8. Kudo, Taku, et al. Efficient dictionary and language model compression for input method editors. In WTIM, pp. 19–25, 2011.
|
||||
9. N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.
|
||||
|
177
include/mm_file/mm_file.hpp
Normal file
177
include/mm_file/mm_file.hpp
Normal file
|
@ -0,0 +1,177 @@
|
|||
#pragma once
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <type_traits>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h> // close(fd)
|
||||
#include <string>
|
||||
|
||||
namespace mm {
|
||||
|
||||
// Access-pattern hints; file_source::open() hands the chosen value to
// posix_madvise() for the mapped region.
namespace advice {

constexpr int normal = POSIX_MADV_NORMAL;
constexpr int random = POSIX_MADV_RANDOM;
constexpr int sequential = POSIX_MADV_SEQUENTIAL;

}  // namespace advice
|
||||
|
||||
// Common base of the memory-mapped file wrappers. It stores a file descriptor
// (m_fd), the mapped length in bytes (m_size) and the mapped address (m_data),
// and releases them in close() / the destructor. Derived classes are expected
// to fill in the three members when opening a file.
template <typename T>
struct file {
    file() {
        init();
    }

    // Releases the mapping and the descriptor. Destructors are implicitly
    // noexcept, so an exception escaping from close() here would terminate the
    // program; any such failure is therefore swallowed. Call close() explicitly
    // to observe unmapping errors.
    ~file() {
        try {
            close();
        } catch (...) {
        }
    }

    file(file const&) = delete;             // non construction-copyable
    file& operator=(file const&) = delete;  // non copyable

    // True while a descriptor is held.
    bool is_open() const {
        return m_fd != -1;
    }

    // Unmaps the region (if one was mapped), closes the descriptor and resets
    // the object to the unopened state.
    // Throws std::runtime_error if munmap fails; the descriptor is still closed
    // and the state still reset in that case, so nothing is leaked.
    void close() {
        if (!is_open()) {
            return;
        }
        bool unmap_failed = false;
        // m_data stays null when a derived open() failed before mapping;
        // calling munmap on such a region would itself fail.
        if (m_data != nullptr) {
            void* const addr = const_cast<void*>(static_cast<void const*>(m_data));
            unmap_failed = (munmap(addr, m_size) == -1);
        }
        ::close(m_fd);
        init();
        if (unmap_failed) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    // Mapped length in bytes.
    size_t bytes() const {
        return m_size;
    }

    // Number of whole T elements that fit in the mapped length.
    size_t size() const {
        return m_size / sizeof(T);
    }

    // Address of the first mapped element (nullptr when nothing is mapped).
    T* data() const {
        return m_data;
    }

    // Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        T operator*() {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

      private:
        T* m_ptr;
    };

    iterator begin() const {
        return iterator(m_data);
    }

    iterator end() const {
        return iterator(m_data, size());
    }

  protected:
    int m_fd;
    size_t m_size;
    T* m_data;

    // Puts the object into the unopened state without releasing anything.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    // Throws std::runtime_error if no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }
};
|
||||
|
||||
// Creates a shared mapping of the first `size` bytes of `fd` (file offset 0)
// with protection `prot` and returns its address converted to Pointer.
// Throws std::runtime_error when ::mmap reports MAP_FAILED.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    void* const region = ::mmap(nullptr, size, prot, MAP_SHARED, fd, 0);
    if (region == MAP_FAILED) {
        throw std::runtime_error("mmap failed");
    }
    return static_cast<Pointer>(region);
}
|
||||
|
||||
template <typename T>
|
||||
struct file_source : public file<T const> {
|
||||
typedef file<T const> base;
|
||||
|
||||
file_source() {}
|
||||
|
||||
file_source(std::string const& path, int adv = advice::normal) {
|
||||
open(path, adv);
|
||||
}
|
||||
|
||||
void open(std::string const& path, int adv = advice::normal) {
|
||||
base::m_fd = ::open(path.c_str(), O_RDONLY);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data = mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
|
||||
if (posix_madvise((void*)base::m_data, base::m_size, adv)) {
|
||||
throw std::runtime_error("madvise failed");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct file_sink : public file<T> {
|
||||
typedef file<T> base;
|
||||
|
||||
file_sink() {}
|
||||
|
||||
file_sink(std::string const& path) {
|
||||
open(path);
|
||||
}
|
||||
|
||||
file_sink(std::string const& path, size_t n) {
|
||||
open(path, n);
|
||||
}
|
||||
|
||||
void open(std::string const& path) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR, mode);
|
||||
base::check_fd();
|
||||
struct stat fs;
|
||||
if (fstat(base::m_fd, &fs) == -1) {
|
||||
throw std::runtime_error("cannot stat file");
|
||||
}
|
||||
base::m_size = fs.st_size;
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
|
||||
void open(std::string const& path, size_t n) {
|
||||
static const mode_t mode = 0600; // read/write
|
||||
base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
|
||||
base::check_fd();
|
||||
base::m_size = n * sizeof(T);
|
||||
ftruncate(base::m_fd,
|
||||
base::m_size); // truncate the file at the new size
|
||||
base::m_data =
|
||||
mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace mm
|
85
include/xcdat.hpp
Normal file
85
include/xcdat.hpp
Normal file
|
@ -0,0 +1,85 @@
|
|||
#pragma once
|
||||
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
#include "xcdat/load_visitor.hpp"
|
||||
#include "xcdat/mmap_visitor.hpp"
|
||||
#include "xcdat/save_visitor.hpp"
|
||||
#include "xcdat/size_visitor.hpp"
|
||||
#include "xcdat/trie.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
using trie_8_type = trie<bc_vector_8>;
|
||||
using trie_7_type = trie<bc_vector_7>;
|
||||
|
||||
//! Set the continuous memory block to a new trie instance.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] Trie mmap(const char* address) {
|
||||
mmap_visitor visitor(address);
|
||||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
return idx;
|
||||
}
|
||||
|
||||
//! Load the trie dictionary from the file.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] Trie load(std::string_view filepath) {
|
||||
load_visitor visitor(filepath);
|
||||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
return idx;
|
||||
}
|
||||
|
||||
//! Save the trie dictionary to the file and returns the file size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
|
||||
save_visitor visitor(filepath);
|
||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
||||
visitor.visit(const_cast<Trie&>(idx));
|
||||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the dictionary size in bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||
size_visitor visitor;
|
||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
||||
visitor.visit(const_cast<Trie&>(idx));
|
||||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
|
||||
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||
[[maybe_unused]] std::uint32_t get_flag(std::string_view filepath) {
|
||||
std::ifstream ifs(filepath);
|
||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||
|
||||
std::uint32_t flag;
|
||||
ifs.read(reinterpret_cast<char*>(&flag), sizeof(flag));
|
||||
return flag;
|
||||
}
|
||||
|
||||
//! Load the keywords from the file.
|
||||
[[maybe_unused]] std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n') {
|
||||
std::ifstream ifs(filepath);
|
||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||
|
||||
std::vector<std::string> strs;
|
||||
for (std::string str; std::getline(ifs, str, delim);) {
|
||||
strs.push_back(str);
|
||||
}
|
||||
return strs;
|
||||
}
|
||||
|
||||
} // namespace xcdat
|
194
include/xcdat/bc_vector_7.hpp
Normal file
194
include/xcdat/bc_vector_7.hpp
Normal file
|
@ -0,0 +1,194 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "compact_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class bc_vector_7 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = 7;
|
||||
static constexpr std::uint32_t max_levels = 4;
|
||||
|
||||
static constexpr std::uint64_t block_size_l1 = 1ULL << 7;
|
||||
static constexpr std::uint64_t block_size_l2 = 1ULL << 15;
|
||||
static constexpr std::uint64_t block_size_l3 = 1ULL << 31;
|
||||
|
||||
private:
|
||||
std::uint64_t m_num_frees = 0;
|
||||
immutable_vector<std::uint8_t> m_ints_l1;
|
||||
immutable_vector<std::uint16_t> m_ints_l2;
|
||||
immutable_vector<std::uint32_t> m_ints_l3;
|
||||
immutable_vector<std::uint64_t> m_ints_l4;
|
||||
std::array<immutable_vector<std::uint64_t>, max_levels - 1> m_ranks;
|
||||
compact_vector m_links;
|
||||
bit_vector m_leaves;
|
||||
|
||||
public:
|
||||
bc_vector_7() = default;
|
||||
virtual ~bc_vector_7() = default;
|
||||
|
||||
bc_vector_7(const bc_vector_7&) = delete;
|
||||
bc_vector_7& operator=(const bc_vector_7&) = delete;
|
||||
|
||||
bc_vector_7(bc_vector_7&&) noexcept = default;
|
||||
bc_vector_7& operator=(bc_vector_7&&) noexcept = default;
|
||||
|
||||
template <class BcUnits>
|
||||
explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
std::vector<std::uint8_t> ints_l1;
|
||||
std::vector<std::uint16_t> ints_l2;
|
||||
std::vector<std::uint32_t> ints_l3;
|
||||
std::vector<std::uint64_t> ints_l4;
|
||||
std::array<std::vector<std::uint64_t>, max_levels - 1> ranks;
|
||||
std::vector<std::uint64_t> links;
|
||||
|
||||
ints_l1.reserve(bc_units.size() * 2);
|
||||
ranks[0].reserve((bc_units.size() * 2) >> l1_bits);
|
||||
links.reserve(bc_units.size());
|
||||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
if ((x / block_size_l1) == 0) {
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l2.size() - ranks[0].back();
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
if ((ints_l2.size() % block_size_l2) == 0) {
|
||||
ranks[1].push_back(static_cast<std::uint64_t>(ints_l3.size()));
|
||||
}
|
||||
if ((x / block_size_l2) == 0) {
|
||||
ints_l2.push_back(static_cast<std::uint16_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l3.size() - ranks[1].back();
|
||||
ints_l2.push_back(static_cast<std::uint16_t>(1 | (i << 1)));
|
||||
}
|
||||
|
||||
if ((ints_l3.size() % block_size_l3) == 0) {
|
||||
ranks[2].push_back(static_cast<std::uint64_t>(ints_l4.size()));
|
||||
}
|
||||
if ((x / block_size_l3) == 0) {
|
||||
ints_l3.push_back(static_cast<std::uint32_t>(0 | (x << 1)));
|
||||
return;
|
||||
} else {
|
||||
const auto i = ints_l4.size() - ranks[2].back();
|
||||
ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
|
||||
}
|
||||
ints_l4.push_back(x);
|
||||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
if ((ints_l1.size() % block_size_l1) == 0) {
|
||||
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
|
||||
}
|
||||
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
links.push_back(x >> 8);
|
||||
};
|
||||
|
||||
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
|
||||
if (leaves[i]) {
|
||||
append_leaf(bc_units[i].base);
|
||||
} else {
|
||||
append_unit(bc_units[i].base ^ i);
|
||||
}
|
||||
append_unit(bc_units[i].check ^ i);
|
||||
if (bc_units[i].check == i) {
|
||||
m_num_frees += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// release
|
||||
m_ints_l1.build(ints_l1);
|
||||
m_ints_l2.build(ints_l2);
|
||||
m_ints_l3.build(ints_l3);
|
||||
m_ints_l4.build(ints_l4);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
|
||||
m_ranks[j].build(ranks[j]);
|
||||
}
|
||||
m_links = compact_vector(links);
|
||||
m_leaves = bit_vector(leaves, true, false);
|
||||
}
|
||||
|
||||
inline std::uint64_t base(std::uint64_t i) const {
|
||||
return access(i * 2) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t check(std::uint64_t i) const {
|
||||
return access(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t link(std::uint64_t i) const {
|
||||
return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 8);
|
||||
}
|
||||
|
||||
inline bool is_leaf(std::uint64_t i) const {
|
||||
return m_leaves[i];
|
||||
}
|
||||
|
||||
inline bool is_used(std::uint64_t i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_ints_l1.size() / 2;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_free_units() const {
|
||||
return m_num_frees;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_nodes() const {
|
||||
return num_units() - num_free_units();
|
||||
}
|
||||
|
||||
inline std::uint64_t num_leaves() const {
|
||||
return m_leaves.num_ones();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_frees);
|
||||
visitor.visit(m_ints_l1);
|
||||
visitor.visit(m_ints_l2);
|
||||
visitor.visit(m_ints_l3);
|
||||
visitor.visit(m_ints_l4);
|
||||
for (std::uint32_t j = 0; j < m_ranks.size(); j++) {
|
||||
visitor.visit(m_ranks[j]);
|
||||
}
|
||||
visitor.visit(m_links);
|
||||
visitor.visit(m_leaves);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t access(std::uint64_t i) const {
|
||||
std::uint64_t x = m_ints_l1[i] >> 1;
|
||||
if ((m_ints_l1[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[0][i / block_size_l1] + x;
|
||||
|
||||
x = m_ints_l2[i] >> 1;
|
||||
if ((m_ints_l2[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[1][i / block_size_l2] + x;
|
||||
|
||||
x = m_ints_l3[i] >> 1;
|
||||
if ((m_ints_l3[i] & 1U) == 0) {
|
||||
return x;
|
||||
}
|
||||
i = m_ranks[2][i / block_size_l3] + x;
|
||||
|
||||
return m_ints_l4[i];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
150
include/xcdat/bc_vector_8.hpp
Normal file
150
include/xcdat/bc_vector_8.hpp
Normal file
|
@ -0,0 +1,150 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "compact_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class bc_vector_8 {
|
||||
public:
|
||||
static constexpr std::uint32_t l1_bits = 8;
|
||||
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t);
|
||||
|
||||
private:
|
||||
std::uint32_t m_num_levels = 0;
|
||||
std::uint64_t m_num_frees = 0;
|
||||
std::array<immutable_vector<std::uint8_t>, max_levels> m_bytes;
|
||||
std::array<bit_vector, max_levels - 1> m_nexts;
|
||||
compact_vector m_links;
|
||||
bit_vector m_leaves;
|
||||
|
||||
public:
|
||||
bc_vector_8() = default;
|
||||
virtual ~bc_vector_8() = default;
|
||||
|
||||
bc_vector_8(const bc_vector_8&) = delete;
|
||||
bc_vector_8& operator=(const bc_vector_8&) = delete;
|
||||
|
||||
bc_vector_8(bc_vector_8&&) noexcept = default;
|
||||
bc_vector_8& operator=(bc_vector_8&&) noexcept = default;
|
||||
|
||||
template <class BcUnits>
|
||||
explicit bc_vector_8(const BcUnits& bc_units, bit_vector::builder&& leaves) {
|
||||
std::array<std::vector<std::uint8_t>, max_levels> bytes;
|
||||
std::array<bit_vector::builder, max_levels> next_flags; // The last will not be released
|
||||
std::vector<std::uint64_t> links;
|
||||
|
||||
bytes[0].reserve(bc_units.size() * 2);
|
||||
next_flags[0].reserve(bc_units.size() * 2);
|
||||
links.reserve(bc_units.size());
|
||||
|
||||
m_num_levels = 0;
|
||||
|
||||
auto append_unit = [&](std::uint64_t x) {
|
||||
std::uint32_t j = 0;
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 8;
|
||||
while (x) {
|
||||
++j;
|
||||
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
next_flags[j].push_back(true);
|
||||
x >>= 8;
|
||||
}
|
||||
next_flags[j].set_bit(next_flags[j].size() - 1, false);
|
||||
m_num_levels = std::max(m_num_levels, j);
|
||||
};
|
||||
|
||||
auto append_leaf = [&](std::uint64_t x) {
|
||||
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFF));
|
||||
next_flags[0].push_back(false);
|
||||
links.push_back(x >> 8);
|
||||
};
|
||||
|
||||
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
|
||||
if (leaves[i]) {
|
||||
append_leaf(bc_units[i].base);
|
||||
} else {
|
||||
append_unit(bc_units[i].base ^ i);
|
||||
}
|
||||
append_unit(bc_units[i].check ^ i);
|
||||
if (bc_units[i].check == i) {
|
||||
m_num_frees += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// release
|
||||
for (std::uint32_t i = 0; i < m_num_levels; ++i) {
|
||||
m_bytes[i].build(bytes[i]);
|
||||
m_nexts[i] = bit_vector(next_flags[i], true, false);
|
||||
}
|
||||
m_bytes[m_num_levels].build(bytes[m_num_levels]);
|
||||
m_links = compact_vector(links);
|
||||
m_leaves = bit_vector(leaves, true, false);
|
||||
}
|
||||
|
||||
inline std::uint64_t base(std::uint64_t i) const {
|
||||
return access(i * 2) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t check(std::uint64_t i) const {
|
||||
return access(i * 2 + 1) ^ i;
|
||||
}
|
||||
|
||||
inline std::uint64_t link(std::uint64_t i) const {
|
||||
return m_bytes[0][i * 2] | (m_links[m_leaves.rank(i)] << 8);
|
||||
}
|
||||
|
||||
inline bool is_leaf(std::uint64_t i) const {
|
||||
return m_leaves[i];
|
||||
}
|
||||
|
||||
inline bool is_used(std::uint64_t i) const {
|
||||
return check(i) != i;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_bytes[0].size() / 2;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_free_units() const {
|
||||
return m_num_frees;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_nodes() const {
|
||||
return num_units() - num_free_units();
|
||||
}
|
||||
|
||||
inline std::uint64_t num_leaves() const {
|
||||
return m_leaves.num_ones();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_levels);
|
||||
visitor.visit(m_num_frees);
|
||||
for (std::uint32_t j = 0; j < m_bytes.size(); j++) {
|
||||
visitor.visit(m_bytes[j]);
|
||||
}
|
||||
for (std::uint32_t j = 0; j < m_nexts.size(); j++) {
|
||||
visitor.visit(m_nexts[j]);
|
||||
}
|
||||
visitor.visit(m_links);
|
||||
visitor.visit(m_leaves);
|
||||
}
|
||||
|
||||
private:
|
||||
inline std::uint64_t access(std::uint64_t i) const {
|
||||
std::uint32_t j = 0;
|
||||
std::uint64_t x = m_bytes[j][i];
|
||||
while (j < m_num_levels and m_nexts[j][i]) {
|
||||
i = m_nexts[j++].rank(i);
|
||||
x |= static_cast<std::uint64_t>(m_bytes[j][i]) << (j * 8);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
148
include/xcdat/bit_tools.hpp
Normal file
148
include/xcdat/bit_tools.hpp
Normal file
|
@ -0,0 +1,148 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BMI2__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
// The implementations are from https://github.com/ot/succinct.
|
||||
namespace xcdat::bit_tools {
|
||||
|
||||
// Lane masks for word-parallel arithmetic.
// ones_step_k has the lowest bit of every k-bit lane set (seven lanes for k = 9);
// msbs_step_k has the highest bit of every k-bit lane set.
static constexpr std::uint64_t ones_step_4 = 0x1111111111111111ULL;
static constexpr std::uint64_t ones_step_8 = 0x0101010101010101ULL;
static constexpr std::uint64_t ones_step_9 = 0x0040201008040201ULL;  // bits 0, 9, 18, 27, 36, 45, 54
static constexpr std::uint64_t msbs_step_8 = 0x8080808080808080ULL;  // 0x80 in every byte
static constexpr std::uint64_t msbs_step_9 = 0x4020100804020100ULL;  // bits 8, 17, 26, 35, 44, 53, 62
|
||||
|
||||
// Number of set bits in x.
inline std::uint64_t popcount(std::uint64_t x) {
#ifdef __SSE4_2__
    const auto ones = __builtin_popcountll(x);
    return static_cast<std::uint64_t>(ones);
#else
    // Word-parallel reduction: 2-bit sums, then 4-bit sums, then per-byte
    // sums, and finally a multiply that adds all bytes into the top byte.
    constexpr std::uint64_t every_other_bit = 0x5555555555555555ULL;
    constexpr std::uint64_t low_pairs = 0x3333333333333333ULL;
    constexpr std::uint64_t low_nibbles = 0x0F0F0F0F0F0F0F0FULL;
    constexpr std::uint64_t byte_ones = 0x0101010101010101ULL;

    x -= (x >> 1) & every_other_bit;
    x = (x & low_pairs) + ((x >> 2) & low_pairs);
    x = (x + (x >> 4)) & low_nibbles;
    return (byte_ones * x) >> 56;
#endif
}
|
||||
|
||||
// Lookup table paired with debruijn64: entry ((1 << p) * debruijn64) >> 58 holds p.
static constexpr std::uint8_t debruijn64_mapping[64] = {
    63, 0,  58, 1,  59, 47, 53, 2,   //
    60, 39, 48, 27, 54, 33, 42, 3,   //
    61, 51, 37, 40, 49, 18, 28, 20,  //
    55, 30, 34, 11, 43, 14, 22, 4,   //
    62, 57, 46, 52, 38, 26, 32, 41,  //
    50, 36, 17, 19, 29, 10, 13, 21,  //
    56, 45, 25, 31, 35, 16, 9,  12,  //
    44, 24, 15, 8,  23, 7,  6,  5,   //
};

static constexpr std::uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL;

// Index of the only set bit of x; x is expected to have exactly one bit set.
inline std::uint8_t bit_position(std::uint64_t x) {
    const std::uint64_t slot = (x * debruijn64) >> 58;
    return debruijn64_mapping[slot];
}

// Index of the highest set bit of x; returns 0 when x is 0.
inline std::uint64_t msb(std::uint64_t x) {
    if (x == 0) {
        return 0;
    }
#ifdef __SSE4_2__
    return 63 - __builtin_clzll(x);
#else
    // Smear the highest set bit into every lower position...
    for (unsigned shift = 1; shift < 64; shift <<= 1) {
        x |= x >> shift;
    }
    // ...then drop everything below it so a single bit remains.
    x ^= x >> 1;
    return bit_position(x);
#endif
}
|
||||
|
||||
inline std::uint64_t uleq_step_9(std::uint64_t x, std::uint64_t y) {
|
||||
return (((((y | msbs_step_9) - (x & ~msbs_step_9)) | (x ^ y)) ^ (x & ~y)) & msbs_step_9) >> 8;
|
||||
}
|
||||
|
||||
// Replaces every byte of x with the number of bits set in that byte.
inline std::uint64_t byte_counts(std::uint64_t x) {
    const std::uint64_t odd_bits = 0xa * ones_step_4;      // bits 1 and 3 of every nibble
    const std::uint64_t low_pairs = 3 * ones_step_4;       // low two bits of every nibble
    const std::uint64_t low_nibbles = 0x0f * ones_step_8;  // low nibble of every byte

    x -= (x & odd_bits) >> 1;                      // 2-bit sums
    x = (x & low_pairs) + ((x >> 2) & low_pairs);  // 4-bit sums
    x = (x + (x >> 4)) & low_nibbles;              // per-byte sums
    return x;
}
|
||||
|
||||
static constexpr std::uint8_t select_in_byte[2048] = {
|
||||
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1,
|
||||
0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,
|
||||
1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,
|
||||
0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
|
||||
2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1,
|
||||
0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0,
|
||||
1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8,
|
||||
4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1,
|
||||
4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2,
|
||||
1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2,
|
||||
2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
|
||||
2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1,
|
||||
4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3,
|
||||
1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8,
|
||||
8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8,
|
||||
6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2,
|
||||
6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4,
|
||||
2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3,
|
||||
3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6,
|
||||
5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8,
|
||||
5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6,
|
||||
6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8,
|
||||
8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5,
|
||||
8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
|
||||
3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5,
|
||||
5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8,
|
||||
6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8,
|
||||
8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8,
|
||||
7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
|
||||
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6,
|
||||
8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
|
||||
8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
|
||||
|
||||
inline std::uint64_t select_in_word(const std::uint64_t x, const std::uint64_t k) {
|
||||
#ifdef __BMI2__
|
||||
return _tzcnt_u64(_pdep_u64(1ULL << k, x));
|
||||
#else
|
||||
const std::uint64_t byte_sums = byte_counts(x) * ones_step_8;
|
||||
const std::uint64_t k_step_8 = k * ones_step_8;
|
||||
const std::uint64_t geq_k_step_8 = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8);
|
||||
const std::uint64_t place = popcount(geq_k_step_8) * 8;
|
||||
const std::uint64_t byte_rank = k - (((byte_sums << 8) >> place) & 0xFFULL);
|
||||
return place + select_in_byte[((x >> place) & 0xFF) | (byte_rank << 8)];
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace xcdat::bit_tools
|
272
include/xcdat/bit_vector.hpp
Normal file
272
include/xcdat/bit_vector.hpp
Normal file
|
@ -0,0 +1,272 @@
|
|||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "bit_tools.hpp"
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Vigna's Rank9 implementation from https://github.com/ot/succinct.
|
||||
class bit_vector {
|
||||
public:
|
||||
class builder {
|
||||
private:
|
||||
std::uint64_t m_size = 0;
|
||||
std::vector<std::uint64_t> m_bits;
|
||||
|
||||
public:
|
||||
builder() = default;
|
||||
virtual ~builder() = default;
|
||||
|
||||
builder(const builder&) = delete;
|
||||
builder& operator=(const builder&) = delete;
|
||||
|
||||
builder(builder&&) noexcept = default;
|
||||
builder& operator=(builder&&) noexcept = default;
|
||||
|
||||
builder(std::uint64_t size) {
|
||||
resize(size);
|
||||
}
|
||||
|
||||
inline void push_back(bool x) {
|
||||
if (m_size % 64 == 0) {
|
||||
m_bits.push_back(0);
|
||||
}
|
||||
if (x) {
|
||||
set_bit(m_size, true);
|
||||
}
|
||||
m_size += 1;
|
||||
}
|
||||
|
||||
inline bool operator[](std::uint64_t i) const {
|
||||
return m_bits[i / 64] & (1ULL << (i % 64));
|
||||
}
|
||||
|
||||
inline void set_bit(std::uint64_t i, bool x = true) {
|
||||
if (x) {
|
||||
m_bits[i / 64] |= (1ULL << (i % 64));
|
||||
} else {
|
||||
m_bits[i / 64] &= (~(1ULL << (i % 64)));
|
||||
}
|
||||
}
|
||||
|
||||
inline void resize(std::uint64_t size) {
|
||||
m_bits.resize(words_for(size), 0ULL);
|
||||
m_size = size;
|
||||
}
|
||||
|
||||
inline void reserve(std::uint64_t capacity) {
|
||||
m_bits.reserve(words_for(capacity));
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
friend class bit_vector;
|
||||
};
|
||||
|
||||
static constexpr std::uint64_t block_size = 8; // i.e., 64 * 8 bits
|
||||
static constexpr std::uint64_t selects_per_hint = 64 * block_size * 2;
|
||||
|
||||
private:
|
||||
std::uint64_t m_size = 0;
|
||||
std::uint64_t m_num_ones = 0;
|
||||
immutable_vector<std::uint64_t> m_bits;
|
||||
immutable_vector<std::uint64_t> m_rank_hints;
|
||||
immutable_vector<std::uint64_t> m_select_hints;
|
||||
|
||||
public:
|
||||
bit_vector() = default;
|
||||
virtual ~bit_vector() = default;
|
||||
|
||||
bit_vector(const bit_vector&) = delete;
|
||||
bit_vector& operator=(const bit_vector&) = delete;
|
||||
|
||||
bit_vector(bit_vector&&) noexcept = default;
|
||||
bit_vector& operator=(bit_vector&&) noexcept = default;
|
||||
|
||||
explicit bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) {
|
||||
m_bits.build(b.m_bits);
|
||||
m_size = b.m_size;
|
||||
m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL,
|
||||
[](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); });
|
||||
if (enable_rank) {
|
||||
build_rank_hints();
|
||||
}
|
||||
if (enable_rank and enable_select) {
|
||||
build_select_hints();
|
||||
}
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_ones() const {
|
||||
return m_num_ones;
|
||||
}
|
||||
|
||||
inline bool operator[](std::uint64_t i) const {
|
||||
return m_bits[i / 64] & (1ULL << (i % 64));
|
||||
}
|
||||
|
||||
// The number of 1s in B[0..i)
|
||||
inline std::uint64_t rank(std::uint64_t i) const {
|
||||
assert(i <= size());
|
||||
assert(m_rank_hints.size() != 0);
|
||||
|
||||
if (i == size()) {
|
||||
return num_ones();
|
||||
}
|
||||
const auto [wi, wj] = decompose<64>(i);
|
||||
return rank_for_word(wi) + (wj != 0 ? bit_tools::popcount(m_bits[wi] << (64 - wj)) : 0);
|
||||
}
|
||||
|
||||
// The position of the (n+1)-th set bit, i.e., the largest position i such that rank(i) == n
|
||||
inline std::uint64_t select(std::uint64_t n) const {
|
||||
assert(n < num_ones());
|
||||
assert(m_select_hints.size() != 0);
|
||||
|
||||
const std::uint64_t bi = select_for_block(n);
|
||||
assert(bi < num_blocks());
|
||||
|
||||
std::uint64_t curr_rank = rank_for_block(bi);
|
||||
assert(curr_rank <= n);
|
||||
|
||||
std::uint64_t rank_in_block_parallel = (n - curr_rank) * bit_tools::ones_step_9;
|
||||
std::uint64_t sub_ranks = ranks_in_block(bi);
|
||||
std::uint64_t sub_block_offset =
|
||||
bit_tools::uleq_step_9(sub_ranks, rank_in_block_parallel) * bit_tools::ones_step_9 >> 54 & 0x7;
|
||||
curr_rank += sub_ranks >> (7 - sub_block_offset) * 9 & 0x1FF;
|
||||
assert(curr_rank <= n);
|
||||
|
||||
std::uint64_t word_offset = (bi * block_size) + sub_block_offset;
|
||||
return word_offset * 64 + bit_tools::select_in_word(m_bits[word_offset], n - curr_rank);
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_size);
|
||||
visitor.visit(m_num_ones);
|
||||
visitor.visit(m_bits);
|
||||
visitor.visit(m_rank_hints);
|
||||
visitor.visit(m_select_hints);
|
||||
}
|
||||
|
||||
private:
|
||||
template <std::uint64_t N>
|
||||
static std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
|
||||
return {x / N, x % N};
|
||||
}
|
||||
|
||||
static std::uint64_t words_for(std::uint64_t nbits) {
|
||||
return (nbits + 63) / 64;
|
||||
}
|
||||
|
||||
inline std::uint64_t num_blocks() const {
|
||||
return m_rank_hints.size() / 2 - 1;
|
||||
}
|
||||
|
||||
// Absolute rank until the bi-th block
|
||||
inline std::uint64_t rank_for_block(std::uint64_t bi) const {
|
||||
return m_rank_hints[bi * 2];
|
||||
}
|
||||
|
||||
// Packed ranks in the bi-th block
|
||||
inline std::uint64_t ranks_in_block(std::uint64_t bi) const {
|
||||
return m_rank_hints[bi * 2 + 1];
|
||||
}
|
||||
|
||||
// Absolute rank until the wi-th word
|
||||
inline std::uint64_t rank_for_word(std::uint64_t wi) const {
|
||||
const auto [bi, bj] = decompose<block_size>(wi);
|
||||
return rank_for_block(bi) + rank_in_block(bi, bj);
|
||||
}
|
||||
|
||||
// Relative rank in the bi-th block
|
||||
inline std::uint64_t rank_in_block(std::uint64_t bi, std::uint64_t bj) const {
|
||||
return ranks_in_block(bi) >> ((7 - bj) * 9) & 0x1FF;
|
||||
}
|
||||
|
||||
inline std::uint64_t select_for_block(std::uint64_t n) const {
|
||||
auto [a, b] = select_with_hint(n);
|
||||
while (b - a > 1) {
|
||||
const std::uint64_t lb = a + (b - a) / 2;
|
||||
if (rank_for_block(lb) <= n) {
|
||||
a = lb;
|
||||
} else {
|
||||
b = lb;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
inline std::tuple<std::uint64_t, std::uint64_t> select_with_hint(std::uint64_t n) const {
|
||||
const std::uint64_t i = n / selects_per_hint;
|
||||
return {i != 0 ? m_select_hints[i - 1] : 0, m_select_hints[i] + 1};
|
||||
}
|
||||
|
||||
void build_rank_hints() {
|
||||
std::uint64_t curr_num_ones = 0;
|
||||
std::uint64_t curr_num_ones_in_block = 0;
|
||||
std::uint64_t curr_ranks_in_block = 0;
|
||||
|
||||
const std::uint64_t num_words = m_bits.size();
|
||||
std::vector<std::uint64_t> rank_hints = {curr_num_ones};
|
||||
|
||||
for (std::uint64_t wi = 0; wi < num_words; wi++) {
|
||||
const std::uint64_t bi = wi % block_size; // Relative position in the block
|
||||
const std::uint64_t num_ones_in_word = bit_tools::popcount(m_bits[wi]);
|
||||
|
||||
if (bi != 0) {
|
||||
curr_ranks_in_block <<= 9;
|
||||
curr_ranks_in_block |= curr_num_ones_in_block;
|
||||
}
|
||||
|
||||
curr_num_ones += num_ones_in_word;
|
||||
curr_num_ones_in_block += num_ones_in_word;
|
||||
|
||||
if (bi == block_size - 1) {
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
rank_hints.push_back(curr_num_ones);
|
||||
curr_num_ones_in_block = 0;
|
||||
curr_ranks_in_block = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Padding the remaining hints
|
||||
const std::uint64_t remain = block_size - (num_words % block_size);
|
||||
for (std::uint64_t wi = 0; wi < remain; wi++) {
|
||||
curr_ranks_in_block <<= 9;
|
||||
curr_ranks_in_block |= curr_num_ones_in_block;
|
||||
}
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
|
||||
// Sentinel
|
||||
if (num_words % block_size != 0) {
|
||||
rank_hints.push_back(curr_ranks_in_block);
|
||||
rank_hints.push_back(0);
|
||||
}
|
||||
|
||||
// Release
|
||||
m_rank_hints.build(rank_hints);
|
||||
}
|
||||
|
||||
void build_select_hints() {
|
||||
std::vector<std::uint64_t> select_hints;
|
||||
std::uint64_t threshold = selects_per_hint;
|
||||
for (std::uint64_t bi = 0; bi < num_blocks(); ++bi) {
|
||||
if (rank_for_block(bi + 1) > threshold) {
|
||||
select_hints.push_back(bi);
|
||||
threshold += selects_per_hint;
|
||||
}
|
||||
}
|
||||
select_hints.push_back(num_blocks());
|
||||
m_select_hints.build(select_hints);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
111
include/xcdat/code_table.hpp
Normal file
111
include/xcdat/code_table.hpp
Normal file
|
@ -0,0 +1,111 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <string_view>
|
||||
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class code_table {
|
||||
private:
|
||||
std::uint64_t m_max_length = 0;
|
||||
std::array<std::uint8_t, 512> m_table;
|
||||
immutable_vector<std::uint8_t> m_alphabet;
|
||||
|
||||
struct counter_type {
|
||||
std::uint8_t ch;
|
||||
std::uint64_t freq;
|
||||
};
|
||||
|
||||
public:
|
||||
code_table() = default;
|
||||
virtual ~code_table() = default;
|
||||
|
||||
code_table(const code_table&) = delete;
|
||||
code_table& operator=(const code_table&) = delete;
|
||||
|
||||
code_table(code_table&&) noexcept = default;
|
||||
code_table& operator=(code_table&&) noexcept = default;
|
||||
|
||||
template <class Strings>
|
||||
code_table(const Strings& keys) {
|
||||
std::array<counter_type, 256> counter;
|
||||
for (std::uint32_t ch = 0; ch < 256; ++ch) {
|
||||
counter[ch] = {static_cast<std::uint8_t>(ch), 0};
|
||||
}
|
||||
|
||||
m_max_length = 0;
|
||||
for (const auto& key : keys) {
|
||||
for (std::uint8_t ch : key) {
|
||||
counter[ch].freq += 1;
|
||||
}
|
||||
m_max_length = std::max<std::uint64_t>(m_max_length, key.length());
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::uint8_t> alphabet;
|
||||
for (const auto& cf : counter) {
|
||||
if (cf.freq != 0) {
|
||||
alphabet.push_back(cf.ch);
|
||||
}
|
||||
}
|
||||
m_alphabet.build(alphabet);
|
||||
}
|
||||
|
||||
std::sort(counter.begin(), counter.end(),
|
||||
[](const counter_type& a, const counter_type& b) { return a.freq > b.freq; });
|
||||
|
||||
for (std::uint32_t ch = 0; ch < 256; ++ch) {
|
||||
m_table[counter[ch].ch] = static_cast<std::uint8_t>(ch);
|
||||
}
|
||||
for (std::uint32_t ch = 0; ch < 256; ++ch) {
|
||||
m_table[m_table[ch] + 256] = static_cast<std::uint8_t>(ch);
|
||||
}
|
||||
}
|
||||
|
||||
inline std::uint64_t alphabet_size() const {
|
||||
return m_alphabet.size();
|
||||
}
|
||||
|
||||
inline std::uint64_t max_length() const {
|
||||
return m_max_length;
|
||||
}
|
||||
|
||||
inline std::uint8_t get_code(char ch) const {
|
||||
return m_table[static_cast<std::uint8_t>(ch)];
|
||||
}
|
||||
|
||||
inline char get_char(std::uint8_t cd) const {
|
||||
return static_cast<char>(m_table[cd + 256]);
|
||||
}
|
||||
|
||||
inline bool has_null() {
|
||||
return *m_alphabet.begin() == '\0';
|
||||
}
|
||||
|
||||
inline auto begin() const {
|
||||
return m_alphabet.begin();
|
||||
}
|
||||
|
||||
inline auto end() const {
|
||||
return m_alphabet.end();
|
||||
}
|
||||
|
||||
inline auto rbegin() const {
|
||||
return m_alphabet.rbegin();
|
||||
}
|
||||
|
||||
inline auto rend() const {
|
||||
return m_alphabet.rend();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_max_length);
|
||||
visitor.visit(m_table);
|
||||
visitor.visit(m_alphabet);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
89
include/xcdat/compact_vector.hpp
Normal file
89
include/xcdat/compact_vector.hpp
Normal file
|
@ -0,0 +1,89 @@
|
|||
#pragma once
|
||||
|
||||
#include "bit_tools.hpp"
|
||||
#include "exception.hpp"
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class compact_vector {
|
||||
private:
|
||||
std::uint64_t m_size = 0;
|
||||
std::uint64_t m_bits = 0;
|
||||
std::uint64_t m_mask = 0;
|
||||
immutable_vector<std::uint64_t> m_chunks;
|
||||
|
||||
public:
|
||||
compact_vector() = default;
|
||||
virtual ~compact_vector() = default;
|
||||
|
||||
compact_vector(const compact_vector&) = delete;
|
||||
compact_vector& operator=(const compact_vector&) = delete;
|
||||
|
||||
compact_vector(compact_vector&&) noexcept = default;
|
||||
compact_vector& operator=(compact_vector&&) noexcept = default;
|
||||
|
||||
template <class Vec>
|
||||
compact_vector(const Vec& vec) {
|
||||
XCDAT_THROW_IF(vec.size() == 0, "The input vector is empty.");
|
||||
|
||||
m_size = vec.size();
|
||||
m_bits = needed_bits(*std::max_element(vec.begin(), vec.end()));
|
||||
m_mask = (1ULL << m_bits) - 1;
|
||||
|
||||
std::vector<std::uint64_t> chunks(words_for(m_size * m_bits));
|
||||
|
||||
for (std::uint64_t i = 0; i < m_size; i++) {
|
||||
const auto [quo, mod] = decompose(i * m_bits);
|
||||
chunks[quo] &= ~(m_mask << mod);
|
||||
chunks[quo] |= (vec[i] & m_mask) << mod;
|
||||
if (64 < mod + m_bits) {
|
||||
const std::uint64_t diff = 64ULL - mod;
|
||||
chunks[quo + 1] &= ~(m_mask >> diff);
|
||||
chunks[quo + 1] |= (vec[i] & m_mask) >> diff;
|
||||
}
|
||||
}
|
||||
m_chunks.build(chunks);
|
||||
}
|
||||
|
||||
inline std::uint64_t operator[](std::uint64_t i) const {
|
||||
assert(i < m_size);
|
||||
const auto [quo, mod] = decompose(i * m_bits);
|
||||
if (mod + m_bits <= 64) {
|
||||
return (m_chunks[quo] >> mod) & m_mask;
|
||||
} else {
|
||||
return ((m_chunks[quo] >> mod) | (m_chunks[quo + 1] << (64 - mod))) & m_mask;
|
||||
}
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
inline std::uint64_t bits() const {
|
||||
return m_bits;
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_size);
|
||||
visitor.visit(m_bits);
|
||||
visitor.visit(m_mask);
|
||||
visitor.visit(m_chunks);
|
||||
}
|
||||
|
||||
private:
|
||||
static std::uint64_t needed_bits(std::uint64_t x) {
|
||||
return bit_tools::msb(x) + 1;
|
||||
}
|
||||
|
||||
static std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
|
||||
return {x / 64, x % 64};
|
||||
}
|
||||
|
||||
static std::uint64_t words_for(std::uint64_t nbits) {
|
||||
return (nbits + 63) / 64;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
25
include/xcdat/exception.hpp
Normal file
25
include/xcdat/exception.hpp
Normal file
|
@ -0,0 +1,25 @@
|
|||
#pragma once
|
||||
|
||||
#include <exception>
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// Exception type thrown by the XCDAT_THROW macros.
// It stores the message pointer without copying, so msg must outlive the exception object
// (the macros always pass string literals).
class exception : public std::exception {
  public:
    explicit exception(const char* msg) : msg_{msg} {}

    // 'noexcept' replaces the dynamic exception specification 'throw()', which is
    // deprecated in C++17 and removed in C++20.
    ~exception() noexcept override = default;

    // Returns the message passed to the constructor.
    const char* what() const noexcept override {
        return msg_;
    }

  private:
    const char* msg_;
};
|
||||
|
||||
// Two-level stringification so that an argument such as __LINE__ is macro-expanded before
// the '#' operator is applied to it.
#define XCDAT_TO_STR_(n) #n
#define XCDAT_TO_STR(n) XCDAT_TO_STR_(n)
// Throws xcdat::exception with the message "<file>:<line>:<msg>". msg must be a string
// literal because it is joined by literal concatenation.
#define XCDAT_THROW(msg) throw xcdat::exception(__FILE__ ":" XCDAT_TO_STR(__LINE__) ":" msg)
// Expression-form guard: evaluates cond once and throws when it is true; the whole macro
// is a void expression.
#define XCDAT_THROW_IF(cond, msg) (void)((cond) ? (XCDAT_THROW(msg), 0) : 0)
|
||||
|
||||
} // namespace xcdat
|
107
include/xcdat/immutable_vector.hpp
Normal file
107
include/xcdat/immutable_vector.hpp
Normal file
|
@ -0,0 +1,107 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
// A read-only array that either owns its storage (build / load) or is a view over external
// memory (mmap). m_data always points at the readable elements; m_allocator is non-null
// only when the storage is owned.
template <class T>
class immutable_vector {
  private:
    std::unique_ptr<T[]> m_allocator;
    std::uint64_t m_size = 0;
    const T* m_data = nullptr;

  public:
    immutable_vector() = default;
    virtual ~immutable_vector() = default;

    immutable_vector(const immutable_vector&) = delete;
    immutable_vector& operator=(const immutable_vector&) = delete;

    // Move operations transfer the storage and leave the source empty. (Member-wise
    // defaulted moves would leave the source with a stale size and a pointer into storage
    // it no longer owns.)
    immutable_vector(immutable_vector&& rhs) noexcept
        : m_allocator(rhs.m_allocator.release()), m_size(rhs.m_size), m_data(rhs.m_data) {
        rhs.m_size = 0;
        rhs.m_data = nullptr;
    }

    immutable_vector& operator=(immutable_vector&& rhs) noexcept {
        if (this != &rhs) {
            m_allocator.reset(rhs.m_allocator.release());
            m_size = rhs.m_size;
            m_data = rhs.m_data;
            rhs.m_size = 0;
            rhs.m_data = nullptr;
        }
        return *this;
    }

    // Releases owned storage (if any) and makes the vector empty.
    void clear() {
        m_allocator.reset();
        m_size = 0;
        m_data = nullptr;
    }

    // Copies the elements of vec (see build).
    template <class Vector>
    immutable_vector(const Vector& vec) {
        build(vec);
    }

    // Replaces the contents with a copy of vec, which must provide size() and data().
    template <class Vector>
    void build(const Vector& vec) {
        clear();
        if (vec.size() != 0) {
            m_allocator = std::make_unique<T[]>(vec.size());
            std::copy_n(vec.data(), vec.size(), m_allocator.get());
            m_size = vec.size();
            m_data = m_allocator.get();
        }
    }

    // Makes this vector a non-owning view over a serialized image at 'address'
    // (a 64-bit element count followed by the elements). Returns the bytes consumed.
    // NOTE(review): the casts assume 'address' is suitably aligned for std::uint64_t and T,
    // and that the memory outlives this object -- confirm at the call sites.
    std::uint64_t mmap(const char* address) {
        clear();
        m_size = *reinterpret_cast<const std::uint64_t*>(address);
        m_data = reinterpret_cast<const T*>(address + sizeof(std::uint64_t));
        return sizeof(std::uint64_t) + m_size * sizeof(T);
    }

    // Reads a serialized image (element count, then elements) from ifs into owned storage.
    // If either read fails, the vector is left empty and the failure stays visible in the
    // stream state, rather than exposing a bogus size or partially read elements.
    void load(std::ifstream& ifs) {
        clear();

        std::uint64_t size = 0;
        ifs.read(reinterpret_cast<char*>(&size), sizeof(size));
        if (!ifs || size == 0) {
            return;
        }

        m_allocator = std::make_unique<T[]>(size);
        ifs.read(reinterpret_cast<char*>(m_allocator.get()), sizeof(T) * size);
        if (!ifs) {
            clear();
            return;
        }
        m_size = size;
        m_data = m_allocator.get();
    }

    // Writes the element count followed by the elements to ofs.
    void save(std::ofstream& ofs) const {
        ofs.write(reinterpret_cast<const char*>(&m_size), sizeof(m_size));
        ofs.write(reinterpret_cast<const char*>(m_data), sizeof(T) * m_size);
    }

    // Size of the serialized image in bytes.
    inline std::uint64_t memory_in_bytes() const {
        return sizeof(m_size) + sizeof(T) * m_size;
    }

    // Number of elements.
    inline std::uint64_t size() const {
        return m_size;
    }

    inline const T* begin() const {
        return m_data;
    }

    inline const T* end() const {
        return m_data + m_size;
    }

    inline auto rbegin() const {
        return std::make_reverse_iterator(end());
    }

    inline auto rend() const {
        return std::make_reverse_iterator(begin());
    }

    // Returns the i-th element (bounds are only checked by assert).
    inline const T& operator[](std::uint64_t i) const {
        assert(i < m_size);
        return m_data[i];
    }

    inline const T* data() const {
        return m_data;
    }
};
|
||||
|
||||
} // namespace xcdat
|
43
include/xcdat/load_visitor.hpp
Normal file
43
include/xcdat/load_visitor.hpp
Normal file
|
@ -0,0 +1,43 @@
|
|||
#pragma once
|
||||
|
||||
#include <fstream>
#include <string>
#include <string_view>
#include <type_traits>

#include "exception.hpp"
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class load_visitor {
|
||||
private:
|
||||
std::ifstream m_ifs;
|
||||
|
||||
public:
|
||||
load_visitor(std::string_view filepath) : m_ifs(filepath, std::ios::binary) {
|
||||
XCDAT_THROW_IF(!m_ifs.good(), "Cannot open the input file");
|
||||
}
|
||||
|
||||
virtual ~load_visitor() {
|
||||
m_ifs.close();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void visit(immutable_vector<T>& vec) {
|
||||
vec.load(m_ifs);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void visit(T& obj) {
|
||||
if constexpr (std::is_pod_v<T>) {
|
||||
m_ifs.read(reinterpret_cast<char*>(&obj), sizeof(T));
|
||||
} else {
|
||||
obj.visit(*this);
|
||||
}
|
||||
}
|
||||
|
||||
std::uint64_t bytes() {
|
||||
return m_ifs.tellg();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
39
include/xcdat/mmap_visitor.hpp
Normal file
39
include/xcdat/mmap_visitor.hpp
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class mmap_visitor {
|
||||
private:
|
||||
const char* m_base = nullptr;
|
||||
const char* m_cur = nullptr;
|
||||
|
||||
public:
|
||||
mmap_visitor(const char* base) : m_base(base), m_cur(base) {}
|
||||
|
||||
virtual ~mmap_visitor() = default;
|
||||
|
||||
template <typename T>
|
||||
void visit(immutable_vector<T>& vec) {
|
||||
m_cur += vec.mmap(m_cur);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void visit(T& obj) {
|
||||
if constexpr (std::is_pod_v<T>) {
|
||||
obj = *reinterpret_cast<const T*>(m_cur);
|
||||
m_cur += sizeof(T);
|
||||
} else {
|
||||
obj.visit(*this);
|
||||
}
|
||||
}
|
||||
|
||||
std::uint64_t bytes() {
|
||||
return std::distance(m_base, m_cur);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
43
include/xcdat/save_visitor.hpp
Normal file
43
include/xcdat/save_visitor.hpp
Normal file
|
@ -0,0 +1,43 @@
|
|||
#pragma once
|
||||
|
||||
#include <fstream>
#include <string>
#include <string_view>
#include <type_traits>

#include "exception.hpp"
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class save_visitor {
|
||||
private:
|
||||
std::ofstream m_ofs;
|
||||
|
||||
public:
|
||||
save_visitor(std::string_view filepath) : m_ofs(filepath, std::ios::binary) {
|
||||
XCDAT_THROW_IF(!m_ofs.good(), "Cannot open the input file");
|
||||
}
|
||||
|
||||
virtual ~save_visitor() {
|
||||
m_ofs.close();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void visit(const immutable_vector<T>& vec) {
|
||||
vec.save(m_ofs);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void visit(const T& obj) {
|
||||
if constexpr (std::is_pod_v<T>) {
|
||||
m_ofs.write(reinterpret_cast<const char*>(&obj), sizeof(T));
|
||||
} else {
|
||||
const_cast<T&>(obj).visit(*this);
|
||||
}
|
||||
}
|
||||
|
||||
std::uint64_t bytes() {
|
||||
return m_ofs.tellp();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
39
include/xcdat/size_visitor.hpp
Normal file
39
include/xcdat/size_visitor.hpp
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
|
||||
#include "exception.hpp"
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class size_visitor {
|
||||
private:
|
||||
std::uint64_t m_bytes = 0;
|
||||
|
||||
public:
|
||||
size_visitor() = default;
|
||||
|
||||
virtual ~size_visitor() = default;
|
||||
|
||||
template <typename T>
|
||||
void visit(const immutable_vector<T>& vec) {
|
||||
m_bytes += vec.memory_in_bytes();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void visit(const T& obj) {
|
||||
if constexpr (std::is_pod_v<T>) {
|
||||
m_bytes += sizeof(T);
|
||||
} else {
|
||||
const_cast<T&>(obj).visit(*this);
|
||||
}
|
||||
}
|
||||
|
||||
std::uint64_t bytes() {
|
||||
return m_bytes;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
222
include/xcdat/tail_vector.hpp
Normal file
222
include/xcdat/tail_vector.hpp
Normal file
|
@ -0,0 +1,222 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "bit_vector.hpp"
|
||||
#include "exception.hpp"
|
||||
#include "immutable_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
class tail_vector {
|
||||
public:
|
||||
struct suffix_type {
|
||||
std::string_view str;
|
||||
std::uint64_t npos;
|
||||
|
||||
inline char operator[](std::uint64_t i) const {
|
||||
return str[size() - i - 1];
|
||||
}
|
||||
inline std::uint64_t size() const {
|
||||
return str.size();
|
||||
}
|
||||
|
||||
inline const char* begin() const {
|
||||
return str.data();
|
||||
}
|
||||
inline const char* end() const {
|
||||
return str.data() + str.size();
|
||||
}
|
||||
|
||||
inline std::reverse_iterator<const char*> rbegin() const {
|
||||
return std::make_reverse_iterator(str.data() + str.size());
|
||||
}
|
||||
inline std::reverse_iterator<const char*> rend() const {
|
||||
return std::make_reverse_iterator(str.data());
|
||||
}
|
||||
};
|
||||
|
||||
class builder {
|
||||
private:
|
||||
// Buffer
|
||||
std::vector<suffix_type> m_suffixes;
|
||||
|
||||
// Released
|
||||
std::vector<char> m_chars;
|
||||
bit_vector::builder m_terms;
|
||||
|
||||
public:
|
||||
builder() = default;
|
||||
virtual ~builder() = default;
|
||||
|
||||
builder(const builder&) = delete;
|
||||
builder& operator=(const builder&) = delete;
|
||||
|
||||
builder(builder&&) noexcept = default;
|
||||
builder& operator=(builder&&) noexcept = default;
|
||||
|
||||
void set_suffix(std::string_view str, std::uint64_t npos) {
|
||||
XCDAT_THROW_IF(str.size() == 0, "The given suffix is empty.");
|
||||
m_suffixes.push_back({str, npos});
|
||||
}
|
||||
|
||||
// setter(npos, tpos): Set units[npos].base = tpos.
|
||||
void complete(bool bin_mode, const std::function<void(std::uint64_t, std::uint64_t)>& setter) {
|
||||
std::sort(m_suffixes.begin(), m_suffixes.end(), [](const suffix_type& a, const suffix_type& b) {
|
||||
return std::lexicographical_compare(std::rbegin(a), std::rend(a), std::rbegin(b), std::rend(b));
|
||||
});
|
||||
|
||||
// Dummy for an empty suffix
|
||||
m_chars.emplace_back('\0');
|
||||
if (bin_mode) {
|
||||
m_terms.push_back(false);
|
||||
}
|
||||
|
||||
const suffix_type dmmy_suffix = {{nullptr, 0}, 0};
|
||||
const suffix_type* prev_suffix = &dmmy_suffix;
|
||||
|
||||
std::uint64_t prev_tpos = 0;
|
||||
|
||||
for (std::uint64_t i = m_suffixes.size(); i > 0; --i) {
|
||||
const suffix_type& curr_suffix = m_suffixes[i - 1];
|
||||
XCDAT_THROW_IF(curr_suffix.size() == 0, "A suffix is empty.");
|
||||
|
||||
std::uint64_t match = 0;
|
||||
while ((match < curr_suffix.size()) && (match < prev_suffix->size()) &&
|
||||
((*prev_suffix)[match] == curr_suffix[match])) {
|
||||
++match;
|
||||
}
|
||||
|
||||
if ((match == curr_suffix.size()) && (prev_suffix->size() != 0)) { // sharable
|
||||
setter(curr_suffix.npos, prev_tpos + (prev_suffix->size() - match));
|
||||
prev_tpos += prev_suffix->size() - match;
|
||||
} else { // append
|
||||
setter(curr_suffix.npos, m_chars.size());
|
||||
prev_tpos = m_chars.size();
|
||||
std::copy(curr_suffix.begin(), curr_suffix.end(), std::back_inserter(m_chars));
|
||||
if (bin_mode) {
|
||||
for (std::uint64_t j = 1; j < curr_suffix.size(); ++j) {
|
||||
m_terms.push_back(false);
|
||||
}
|
||||
m_terms.push_back(true);
|
||||
} else {
|
||||
m_chars.emplace_back('\0');
|
||||
}
|
||||
}
|
||||
|
||||
prev_suffix = &curr_suffix;
|
||||
}
|
||||
}
|
||||
|
||||
friend class tail_vector;
|
||||
};
|
||||
|
||||
private:
|
||||
immutable_vector<char> m_chars;
|
||||
bit_vector m_terms;
|
||||
|
||||
public:
|
||||
tail_vector() = default;
|
||||
virtual ~tail_vector() = default;
|
||||
|
||||
tail_vector(const tail_vector&) = delete;
|
||||
tail_vector& operator=(const tail_vector&) = delete;
|
||||
|
||||
tail_vector(tail_vector&&) noexcept = default;
|
||||
tail_vector& operator=(tail_vector&&) noexcept = default;
|
||||
|
||||
explicit tail_vector(builder&& b) : m_chars(b.m_chars), m_terms(b.m_terms) {}
|
||||
|
||||
inline bool bin_mode() const {
|
||||
return m_terms.size() != 0;
|
||||
}
|
||||
|
||||
inline bool match(std::string_view key, std::uint64_t tpos) const {
|
||||
if (key.size() == 0) {
|
||||
return tpos == 0;
|
||||
}
|
||||
|
||||
std::uint64_t kpos = 0;
|
||||
|
||||
if (bin_mode()) {
|
||||
do {
|
||||
if (key[kpos] != m_chars[tpos]) {
|
||||
return false;
|
||||
}
|
||||
kpos += 1;
|
||||
if (m_terms[tpos]) {
|
||||
return kpos == key.size();
|
||||
}
|
||||
tpos += 1;
|
||||
} while (kpos < key.size());
|
||||
return false;
|
||||
} else {
|
||||
do {
|
||||
if (!m_chars[tpos] || key[kpos] != m_chars[tpos]) {
|
||||
return false;
|
||||
}
|
||||
kpos += 1;
|
||||
tpos += 1;
|
||||
} while (kpos < key.size());
|
||||
return !m_chars[tpos];
|
||||
}
|
||||
}
|
||||
|
||||
inline bool prefix_match(std::string_view key, std::uint64_t tpos) const {
|
||||
assert(key.size() != 0);
|
||||
std::uint64_t kpos = 0;
|
||||
|
||||
if (bin_mode()) {
|
||||
do {
|
||||
if (key[kpos] != m_chars[tpos]) {
|
||||
return false;
|
||||
}
|
||||
kpos += 1;
|
||||
if (m_terms[tpos]) {
|
||||
return kpos == key.size();
|
||||
}
|
||||
tpos += 1;
|
||||
} while (kpos < key.size());
|
||||
return true;
|
||||
} else {
|
||||
do {
|
||||
if (!m_chars[tpos] || key[kpos] != m_chars[tpos]) {
|
||||
return false;
|
||||
}
|
||||
kpos += 1;
|
||||
tpos += 1;
|
||||
} while (kpos < key.size());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
inline void decode(std::uint64_t tpos, const std::function<void(char)>& fn) const {
|
||||
if (bin_mode()) {
|
||||
if (tpos != 0) {
|
||||
do {
|
||||
fn(m_chars[tpos]);
|
||||
} while (!m_terms[tpos++]);
|
||||
}
|
||||
} else {
|
||||
while (m_chars[tpos]) {
|
||||
fn(m_chars[tpos++]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline std::uint64_t size() const {
|
||||
return m_chars.size();
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_chars);
|
||||
visitor.visit(m_terms);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
468
include/xcdat/trie.hpp
Normal file
468
include/xcdat/trie.hpp
Normal file
|
@ -0,0 +1,468 @@
|
|||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "trie_builder.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
//! A compressed string dictionary based on an improved double-array trie.
|
||||
//! 'BcVector' is the data type of Base and Check vectors.
|
||||
template <class BcVector>
|
||||
class trie {
|
||||
public:
|
||||
using trie_type = trie<BcVector>;
|
||||
using bc_vector_type = BcVector;
|
||||
|
||||
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||
|
||||
private:
|
||||
std::uint64_t m_num_keys = 0;
|
||||
code_table m_table;
|
||||
bit_vector m_terms;
|
||||
bc_vector_type m_bcvec;
|
||||
tail_vector m_tvec;
|
||||
|
||||
public:
|
||||
//! Default constructor
|
||||
trie() = default;
|
||||
|
||||
//! Default destructor
|
||||
virtual ~trie() = default;
|
||||
|
||||
//! Copy constructor (deleted)
|
||||
trie(const trie&) = delete;
|
||||
|
||||
//! Copy assignment operator (deleted)
|
||||
trie& operator=(const trie&) = delete;
|
||||
|
||||
//! Move constructor
|
||||
trie(trie&&) noexcept = default;
|
||||
|
||||
//! Move assignment operator
|
||||
trie& operator=(trie&&) noexcept = default;
|
||||
|
||||
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
|
||||
//!
|
||||
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
|
||||
//! If bin_mode = true, bit flags are used istead, and the keywords can contain NULL characters.
|
||||
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
|
||||
//!
|
||||
//! The type 'Strings' and 'Strings::value_type' should be a random iterable container such as std::vector.
|
||||
//! Precisely, they should support the following operations:
|
||||
//! - size() returns the container size.
|
||||
//! - operator[](i) accesses the i-th element.
|
||||
//! - begin() returns the iterator to the beginning.
|
||||
//! - end() returns the iterator to the end.
|
||||
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
|
||||
template <class Strings>
|
||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||
}
|
||||
|
||||
//! Check if the binary mode.
|
||||
inline bool bin_mode() const {
|
||||
return m_tvec.bin_mode();
|
||||
}
|
||||
|
||||
//! Get the number of stored keywords.
|
||||
inline std::uint64_t num_keys() const {
|
||||
return m_num_keys;
|
||||
}
|
||||
|
||||
//! Get the alphabet size.
|
||||
inline std::uint64_t alphabet_size() const {
|
||||
return m_table.alphabet_size();
|
||||
}
|
||||
|
||||
//! Get the maximum length of keywords.
|
||||
inline std::uint64_t max_length() const {
|
||||
return m_table.max_length();
|
||||
}
|
||||
|
||||
//! Get the number of trie nodes.
|
||||
inline std::uint64_t num_nodes() const {
|
||||
return m_bcvec.num_nodes();
|
||||
}
|
||||
|
||||
//! Get the number of DA units.
|
||||
inline std::uint64_t num_units() const {
|
||||
return m_bcvec.num_units();
|
||||
}
|
||||
|
||||
//! Get the number of unused DA units.
|
||||
inline std::uint64_t num_free_units() const {
|
||||
return m_bcvec.num_free_units();
|
||||
}
|
||||
|
||||
//! Get the length of TAIL vector.
|
||||
inline std::uint64_t tail_length() const {
|
||||
return m_tvec.size();
|
||||
}
|
||||
|
||||
//! Lookup the ID of the keyword.
|
||||
inline std::optional<std::uint64_t> lookup(std::string_view key) const {
|
||||
std::uint64_t kpos = 0, npos = 0;
|
||||
while (!m_bcvec.is_leaf(npos)) {
|
||||
if (kpos == key.size()) {
|
||||
if (!m_terms[npos]) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return npos_to_id(npos);
|
||||
}
|
||||
const std::uint64_t cpos = m_bcvec.base(npos) ^ m_table.get_code(key[kpos++]);
|
||||
if (m_bcvec.check(cpos) != npos) {
|
||||
return std::nullopt;
|
||||
}
|
||||
npos = cpos;
|
||||
}
|
||||
|
||||
const std::uint64_t tpos = m_bcvec.link(npos);
|
||||
if (!m_tvec.match(get_suffix(key, kpos), tpos)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return npos_to_id(npos);
|
||||
}
|
||||
|
||||
//! Decode the keyword associated with the ID.
|
||||
inline std::string decode(std::uint64_t id) const {
|
||||
std::string decoded;
|
||||
decoded.reserve(max_length());
|
||||
decode(id, decoded);
|
||||
return decoded;
|
||||
}
|
||||
|
||||
//! Decode the keyword associated with the ID and store it in 'decoded'.
|
||||
//! It can avoid reallocation of memory to store the result.
|
||||
inline void decode(std::uint64_t id, std::string& decoded) const {
|
||||
decoded.clear();
|
||||
|
||||
if (num_keys() <= id) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::uint64_t npos = id_to_npos(id);
|
||||
std::uint64_t tpos = m_bcvec.is_leaf(npos) ? m_bcvec.link(npos) : UINT64_MAX;
|
||||
|
||||
while (npos != 0) {
|
||||
const std::uint64_t ppos = m_bcvec.check(npos);
|
||||
decoded.push_back(m_table.get_char(m_bcvec.base(ppos) ^ npos));
|
||||
npos = ppos;
|
||||
}
|
||||
|
||||
std::reverse(decoded.begin(), decoded.end());
|
||||
if (tpos != 0 && tpos != UINT64_MAX) {
|
||||
m_tvec.decode(tpos, [&](char c) { decoded.push_back(c); });
|
||||
}
|
||||
}
|
||||
|
||||
//! An iterator class for common prefix search.
|
||||
//! It enumerates all the keywords contained as prefixes of a given string.
|
||||
//! It should be instantiated via the function 'make_prefix_iterator'.
|
||||
class prefix_iterator {
|
||||
private:
|
||||
const trie_type* m_obj = nullptr;
|
||||
std::string_view m_key;
|
||||
std::uint64_t m_id = 0;
|
||||
std::uint64_t m_kpos = 0;
|
||||
std::uint64_t m_npos = 0;
|
||||
bool is_beg = true;
|
||||
bool is_end = false;
|
||||
|
||||
public:
|
||||
prefix_iterator() = default;
|
||||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
inline bool next() {
|
||||
return m_obj != nullptr && m_obj->next_prefix(this);
|
||||
}
|
||||
|
||||
//! Get the result ID.
|
||||
inline std::uint64_t id() const {
|
||||
return m_id;
|
||||
}
|
||||
|
||||
//! Get the result keyword.
|
||||
inline std::string decoded() const {
|
||||
return std::string(m_key.data(), m_kpos);
|
||||
}
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
inline std::string_view decoded_view() const {
|
||||
return std::string_view(m_key.data(), m_kpos);
|
||||
}
|
||||
|
||||
private:
|
||||
prefix_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
|
||||
|
||||
friend class trie;
|
||||
};
|
||||
|
||||
//! Make the common prefix searcher for the given keyword.
|
||||
inline prefix_iterator make_prefix_iterator(std::string_view key) const {
|
||||
return prefix_iterator(this, key);
|
||||
}
|
||||
|
||||
//! Preform common prefix search for the keyword.
|
||||
inline void prefix_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_prefix_iterator(key);
|
||||
while (itr.next()) {
|
||||
fn(itr.id(), itr.decoded_view());
|
||||
}
|
||||
}
|
||||
|
||||
//! An iterator class for predictive search.
|
||||
//! It enumerates all the keywords starting with a given string.
|
||||
//! It should be instantiated via the function 'make_predictive_iterator'.
|
||||
class predictive_iterator {
|
||||
public:
|
||||
struct cursor_type {
|
||||
char label;
|
||||
std::uint64_t kpos;
|
||||
std::uint64_t npos;
|
||||
};
|
||||
|
||||
private:
|
||||
const trie_type* m_obj = nullptr;
|
||||
std::string_view m_key;
|
||||
std::uint64_t m_id = 0;
|
||||
std::string m_decoded;
|
||||
std::vector<cursor_type> m_stack;
|
||||
bool is_beg = true;
|
||||
bool is_end = false;
|
||||
|
||||
public:
|
||||
predictive_iterator() = default;
|
||||
|
||||
//! Increment the iterator.
|
||||
//! Return false if the iteration is terminated.
|
||||
inline bool next() {
|
||||
return m_obj != nullptr && m_obj->next_predictive(this);
|
||||
}
|
||||
|
||||
//! Get the result ID.
|
||||
inline std::uint64_t id() const {
|
||||
return m_id;
|
||||
}
|
||||
|
||||
//! Get the result keyword.
|
||||
inline std::string decoded() const {
|
||||
return m_decoded;
|
||||
}
|
||||
|
||||
//! Get the reference to the result keyword.
|
||||
//! Note that the referenced data will be changed in the next iteration.
|
||||
inline std::string_view decoded_view() const {
|
||||
return m_decoded;
|
||||
}
|
||||
|
||||
private:
|
||||
predictive_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}
|
||||
|
||||
friend class trie;
|
||||
};
|
||||
|
||||
//! Make the predictive searcher for the keyword.
|
||||
inline predictive_iterator make_predictive_iterator(std::string_view key) const {
|
||||
return predictive_iterator(this, key);
|
||||
}
|
||||
|
||||
//! Preform predictive search for the keyword.
|
||||
inline void predictive_search(std::string_view key,
|
||||
const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_predictive_iterator(key);
|
||||
while (itr.next()) {
|
||||
fn(itr.id(), itr.decoded_view());
|
||||
}
|
||||
}
|
||||
|
||||
//! An iterator class for enumeration.
|
||||
//! It enumerates all the keywords stored in the trie.
|
||||
//! It should be instantiated via the function 'make_enumerative_iterator'.
|
||||
using enumerative_iterator = predictive_iterator;
|
||||
|
||||
//! Make the enumerator.
|
||||
inline enumerative_iterator make_enumerative_iterator() const {
|
||||
return enumerative_iterator(this, "");
|
||||
}
|
||||
|
||||
//! Enumerate all the keywords and their IDs stored in the trie.
|
||||
inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
|
||||
auto itr = make_enumerative_iterator();
|
||||
while (itr.next()) {
|
||||
fn(itr.id(), itr.decoded_view());
|
||||
}
|
||||
}
|
||||
|
||||
//! Visit the members (commonly used for I/O).
|
||||
template <class Visitor>
|
||||
void visit(Visitor& visitor) {
|
||||
visitor.visit(m_num_keys);
|
||||
visitor.visit(m_table);
|
||||
visitor.visit(m_terms);
|
||||
visitor.visit(m_bcvec);
|
||||
visitor.visit(m_tvec);
|
||||
}
|
||||
|
||||
private:
|
||||
template <class Strings>
|
||||
explicit trie(trie_builder<Strings>&& b)
|
||||
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
|
||||
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
|
||||
|
||||
// Return the part of 's' that starts at offset 'i' (empty when i == s.size()).
// 'i' must not exceed s.size(); this is asserted.
template <class String>
static constexpr String get_suffix(const String& s, std::uint64_t i) {
    assert(i <= s.size());
    // Omitting the count takes everything up to the end, i.e. s.size() - i characters.
    return s.substr(i);
}
|
||||
|
||||
inline std::uint64_t npos_to_id(std::uint64_t npos) const {
|
||||
return m_terms.rank(npos);
|
||||
};
|
||||
|
||||
inline std::uint64_t id_to_npos(std::uint64_t id) const {
|
||||
return m_terms.select(id);
|
||||
};
|
||||
|
||||
// Advance 'itr' to the next stored keyword that is a prefix of its query.
// On success, returns true with itr->m_id set to the keyword ID and itr->m_kpos set to the
// length of the matched prefix. When no further prefix exists, returns false, marks the
// iterator as finished and sets itr->m_id to num_keys() (an out-of-range ID).
inline bool next_prefix(prefix_iterator* itr) const {
    if (itr->is_end) {
        return false;
    }

    if (itr->is_beg) {
        itr->is_beg = false;
        // The start node may already be a terminal.
        if (m_terms[itr->m_npos]) {
            itr->m_id = npos_to_id(itr->m_npos);
            return true;
        }
    }

    if (bin_mode() and itr->m_kpos == itr->m_key.size()) {
        // Is the key terminated at an inner term?
        itr->is_end = true;
        itr->m_id = num_keys();
        return false;
    }

    while (!m_bcvec.is_leaf(itr->m_npos)) {
        // Is the key terminated at an internal node (not term)?
        // This is checked in every mode: reading m_key[m_kpos] at m_key.size() is undefined
        // for a string_view, and an exhausted query cannot have deeper prefixes anyway.
        if (itr->m_kpos == itr->m_key.size()) {
            itr->is_end = true;
            itr->m_id = num_keys();
            return false;
        }

        // Follow the edge labelled with the next query character.
        const std::uint64_t cpos = m_bcvec.base(itr->m_npos) ^ m_table.get_code(itr->m_key[itr->m_kpos++]);

        if (m_bcvec.check(cpos) != itr->m_npos) {
            // No such child: the query leaves the trie here.
            itr->is_end = true;
            itr->m_id = num_keys();
            return false;
        }

        itr->m_npos = cpos;
        // An internal terminal is reported right away; a leaf is handled after the loop.
        if (!m_bcvec.is_leaf(itr->m_npos) && m_terms[itr->m_npos]) {
            itr->m_id = npos_to_id(itr->m_npos);
            return true;
        }
    }

    // A leaf was reached: this is the last possible result.
    itr->is_end = true;

    // The rest of the query must match the suffix stored for the leaf.
    const std::uint64_t tpos = m_bcvec.link(itr->m_npos);
    if (!m_tvec.match(get_suffix(itr->m_key, itr->m_kpos), tpos)) {
        itr->m_id = num_keys();
        return false;
    }

    itr->m_kpos = itr->m_key.size();
    itr->m_id = npos_to_id(itr->m_npos);
    return true;
}
|
||||
|
||||
// Advance 'itr' to the next stored keyword that starts with its query.
// The first call walks down the trie along the query; later calls continue a depth-first
// traversal of the subtrie below it using itr->m_stack. On success, returns true with
// itr->m_id and itr->m_decoded holding the result. Returns false once nothing is left.
inline bool next_predictive(predictive_iterator* itr) const {
    if (itr->is_end) {
        return false;
    }

    auto& decoded = itr->m_decoded;
    const auto append = [&decoded](char c) { decoded.push_back(c); };

    if (itr->is_beg) {
        itr->is_beg = false;

        std::uint64_t depth = 0;
        std::uint64_t node = 0;

        while (depth < itr->m_key.size()) {
            if (m_bcvec.is_leaf(node)) {
                // The query continues past a leaf: the leaf's suffix decides, and it is the
                // only possible result.
                itr->is_end = true;
                const std::uint64_t tpos = m_bcvec.link(node);
                if (tpos == 0 || !m_tvec.prefix_match(get_suffix(itr->m_key, depth), tpos)) {
                    return false;
                }
                itr->m_id = npos_to_id(node);
                m_tvec.decode(tpos, append);
                return true;
            }

            const char ch = itr->m_key[depth];
            const std::uint64_t child = m_bcvec.base(node) ^ m_table.get_code(ch);
            if (m_bcvec.check(child) != node) {
                // The query is not a path in the trie.
                itr->is_end = true;
                return false;
            }

            node = child;
            decoded.push_back(ch);
            ++depth;
        }

        // Start the traversal from the node the query ends at.
        const char last = decoded.empty() ? '\0' : decoded.back();
        itr->m_stack.push_back({last, depth, node});
    }

    while (!itr->m_stack.empty()) {
        const auto cur = itr->m_stack.back();
        itr->m_stack.pop_back();

        // Cut the decoded keyword back to the path of the popped node.
        if (cur.kpos > 0) {
            decoded.resize(cur.kpos);
            decoded.back() = cur.label;
        }

        if (m_bcvec.is_leaf(cur.npos)) {
            itr->m_id = npos_to_id(cur.npos);
            m_tvec.decode(m_bcvec.link(cur.npos), append);
            return true;
        }

        // Push the children in reverse table order so that they are popped in table order.
        const std::uint64_t base = m_bcvec.base(cur.npos);
        for (auto cit = m_table.rbegin(); cit != m_table.rend(); ++cit) {
            const std::uint64_t child = base ^ m_table.get_code(*cit);
            if (m_bcvec.check(child) == cur.npos) {
                itr->m_stack.push_back({static_cast<char>(*cit), cur.kpos + 1, child});
            }
        }

        // An internal node can itself be a keyword.
        if (m_terms[cur.npos]) {
            itr->m_id = npos_to_id(cur.npos);
            return true;
        }
    }

    itr->is_end = true;
    return false;
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
265
include/xcdat/trie_builder.hpp
Normal file
265
include/xcdat/trie_builder.hpp
Normal file
|
@ -0,0 +1,265 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <string_view>
|
||||
|
||||
// #include "bc_vector.hpp"
|
||||
#include "code_table.hpp"
|
||||
#include "exception.hpp"
|
||||
#include "tail_vector.hpp"
|
||||
|
||||
namespace xcdat {
|
||||
|
||||
template <class Strings>
|
||||
class trie_builder {
|
||||
template <class>
|
||||
friend class trie;
|
||||
|
||||
public:
|
||||
struct unit_type {
|
||||
std::uint64_t base;
|
||||
std::uint64_t check;
|
||||
};
|
||||
|
||||
private:
|
||||
static constexpr std::uint64_t taboo_npos = 1;
|
||||
static constexpr std::uint64_t free_blocks = 16;
|
||||
|
||||
const Strings& m_keys;
|
||||
const std::uint32_t m_l1_bits; // # of bits for L1 layer of DACs
|
||||
const std::uint64_t m_l1_size;
|
||||
|
||||
bool m_bin_mode = false;
|
||||
|
||||
code_table m_table;
|
||||
std::vector<unit_type> m_units;
|
||||
bit_vector::builder m_leaves;
|
||||
bit_vector::builder m_terms;
|
||||
bit_vector::builder m_useds;
|
||||
std::vector<std::uint64_t> m_heads; // for L1 blocks
|
||||
std::vector<std::uint8_t> m_edges;
|
||||
tail_vector::builder m_suffixes;
|
||||
|
||||
public:
|
||||
explicit trie_builder(const Strings& keys, std::uint32_t l1_bits, bool bin_mode)
|
||||
: m_keys(keys), m_l1_bits(std::min(l1_bits, 8U)), m_l1_size(1ULL << m_l1_bits), m_bin_mode(bin_mode) {
|
||||
XCDAT_THROW_IF(m_keys.size() == 0, "The input dataset is empty.");
|
||||
|
||||
// Reserve
|
||||
{
|
||||
std::uint64_t init_capa = 1;
|
||||
while (init_capa < m_keys.size()) {
|
||||
init_capa <<= 1;
|
||||
}
|
||||
m_units.reserve(init_capa);
|
||||
m_leaves.reserve(init_capa);
|
||||
m_terms.reserve(init_capa);
|
||||
m_useds.reserve(init_capa);
|
||||
m_heads.reserve(init_capa >> m_l1_bits);
|
||||
m_edges.reserve(256);
|
||||
}
|
||||
|
||||
// Initialize an empty list.
|
||||
for (std::uint64_t npos = 0; npos < 256; ++npos) {
|
||||
m_units.push_back(unit_type{npos + 1, npos - 1});
|
||||
m_leaves.push_back(false);
|
||||
m_terms.push_back(false);
|
||||
m_useds.push_back(false);
|
||||
}
|
||||
m_units[255].base = 0;
|
||||
m_units[0].check = 255;
|
||||
|
||||
for (std::uint64_t npos = 0; npos < 256; npos += m_l1_size) {
|
||||
m_heads.push_back(npos);
|
||||
}
|
||||
|
||||
// Fix the root
|
||||
use_unit(0);
|
||||
m_units[0].check = taboo_npos;
|
||||
m_useds.set_bit(taboo_npos, true);
|
||||
m_heads[taboo_npos >> m_l1_bits] = m_units[taboo_npos].base;
|
||||
|
||||
// Build the code table
|
||||
m_table = code_table(keys);
|
||||
m_bin_mode |= m_table.has_null();
|
||||
|
||||
// Build the BC units
|
||||
arrange(0, m_keys.size(), 0, 0);
|
||||
|
||||
// Finish
|
||||
finish();
|
||||
|
||||
// Build the TAIL vector
|
||||
m_suffixes.complete(m_bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { m_units[npos].base = tpos; });
|
||||
}
|
||||
|
||||
virtual ~trie_builder() = default;
|
||||
|
||||
trie_builder(const trie_builder&) = delete;
|
||||
trie_builder& operator=(const trie_builder&) = delete;
|
||||
|
||||
trie_builder(trie_builder&&) noexcept = default;
|
||||
trie_builder& operator=(trie_builder&&) noexcept = default;
|
||||
|
||||
private:
|
||||
inline void use_unit(std::uint64_t npos) {
|
||||
m_useds.set_bit(npos);
|
||||
|
||||
const auto next = m_units[npos].base;
|
||||
const auto prev = m_units[npos].check;
|
||||
m_units[prev].base = next;
|
||||
m_units[next].check = prev;
|
||||
|
||||
const auto lpos = npos >> m_l1_bits;
|
||||
if (m_heads[lpos] == npos) {
|
||||
m_heads[lpos] = (lpos != next >> m_l1_bits) ? taboo_npos : next;
|
||||
}
|
||||
}
|
||||
|
||||
inline void close_block(std::uint64_t bpos) {
|
||||
const auto beg_npos = bpos * 256;
|
||||
const auto end_npos = beg_npos + 256;
|
||||
|
||||
for (auto npos = beg_npos; npos < end_npos; ++npos) {
|
||||
if (!m_useds[npos]) {
|
||||
use_unit(npos);
|
||||
m_useds.set_bit(npos, false);
|
||||
m_units[npos].base = npos;
|
||||
m_units[npos].check = npos;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto npos = beg_npos; npos < end_npos; npos += m_l1_size) {
|
||||
m_heads[npos >> m_l1_bits] = taboo_npos;
|
||||
}
|
||||
}
|
||||
|
||||
void expand() {
|
||||
const auto old_size = static_cast<std::uint64_t>(m_units.size());
|
||||
const auto new_size = old_size + 256;
|
||||
|
||||
for (auto npos = old_size; npos < new_size; ++npos) {
|
||||
m_units.push_back({npos + 1, npos - 1});
|
||||
m_leaves.push_back(false);
|
||||
m_terms.push_back(false);
|
||||
m_useds.push_back(false);
|
||||
}
|
||||
|
||||
{
|
||||
const auto last_npos = m_units[taboo_npos].check;
|
||||
m_units[old_size].check = last_npos;
|
||||
m_units[last_npos].base = old_size;
|
||||
m_units[new_size - 1].base = taboo_npos;
|
||||
m_units[taboo_npos].check = new_size - 1;
|
||||
}
|
||||
|
||||
for (auto npos = old_size; npos < new_size; npos += m_l1_size) {
|
||||
m_heads.push_back(npos);
|
||||
}
|
||||
|
||||
const auto bpos = old_size / 256;
|
||||
if (free_blocks <= bpos) {
|
||||
close_block(bpos - free_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
void finish() {
|
||||
while (m_units[taboo_npos].base != taboo_npos) {
|
||||
auto bpos = m_units[taboo_npos].base / 256;
|
||||
close_block(bpos);
|
||||
}
|
||||
}
|
||||
|
||||
void arrange(std::uint64_t beg, std::uint64_t end, std::uint64_t kpos, std::uint64_t npos) {
|
||||
if (m_keys[beg].size() == kpos) {
|
||||
m_terms.set_bit(npos, true);
|
||||
if (++beg == end) { // without link?
|
||||
m_units[npos].base = 0; // with an empty suffix
|
||||
m_leaves.set_bit(npos, true);
|
||||
return;
|
||||
}
|
||||
} else if (beg + 1 == end) { // leaf?
|
||||
XCDAT_THROW_IF(m_keys[beg].size() <= kpos, "The input keys are not unique.");
|
||||
m_terms.set_bit(npos, true);
|
||||
m_leaves.set_bit(npos, true);
|
||||
m_suffixes.set_suffix({m_keys[beg].data() + kpos, m_keys[beg].size() - kpos}, npos);
|
||||
return;
|
||||
}
|
||||
|
||||
// fetching edges
|
||||
{
|
||||
m_edges.clear();
|
||||
auto ch = static_cast<std::uint8_t>(m_keys[beg][kpos]);
|
||||
for (auto i = beg + 1; i < end; ++i) {
|
||||
const auto next_ch = static_cast<std::uint8_t>(m_keys[i][kpos]);
|
||||
if (ch != next_ch) {
|
||||
XCDAT_THROW_IF(next_ch < ch, "The input keys are not in lexicographical order.");
|
||||
m_edges.push_back(ch);
|
||||
ch = next_ch;
|
||||
}
|
||||
}
|
||||
m_edges.push_back(ch);
|
||||
}
|
||||
|
||||
const auto base = xcheck(npos >> m_l1_bits);
|
||||
if (m_units.size() <= base) {
|
||||
expand();
|
||||
}
|
||||
|
||||
// defining new edges
|
||||
m_units[npos].base = base;
|
||||
for (const auto ch : m_edges) {
|
||||
const auto child_id = base ^ m_table.get_code(ch);
|
||||
use_unit(child_id);
|
||||
m_units[child_id].check = npos;
|
||||
}
|
||||
|
||||
// following the children
|
||||
auto i = beg;
|
||||
auto ch = static_cast<uint8_t>(m_keys[beg][kpos]);
|
||||
for (auto j = beg + 1; j < end; ++j) {
|
||||
const auto next_ch = static_cast<uint8_t>(m_keys[j][kpos]);
|
||||
if (ch != next_ch) {
|
||||
arrange(i, j, kpos + 1, base ^ m_table.get_code(ch));
|
||||
ch = next_ch;
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
arrange(i, end, kpos + 1, base ^ m_table.get_code(ch));
|
||||
}
|
||||
|
||||
inline std::uint64_t xcheck(std::uint64_t lpos) const {
|
||||
if (m_units[taboo_npos].base == taboo_npos) { // Full?
|
||||
return m_units.size() ^ m_table.get_code(m_edges[0]);
|
||||
}
|
||||
|
||||
// First, search in the same L1 block
|
||||
for (auto i = m_heads[lpos]; i != taboo_npos && i >> m_l1_bits == lpos; i = m_units[i].base) {
|
||||
const auto base = i ^ m_table.get_code(m_edges[0]);
|
||||
if (is_target(base)) {
|
||||
return base; // base / block_size_ == lpos
|
||||
}
|
||||
}
|
||||
|
||||
// Second, search in the other blocks
|
||||
for (auto i = m_units[taboo_npos].base; i != taboo_npos; i = m_units[i].base) {
|
||||
const auto base = i ^ m_table.get_code(m_edges[0]);
|
||||
if (is_target(base)) {
|
||||
return base; // base / block_size_ != lpos
|
||||
}
|
||||
}
|
||||
return m_units.size() ^ m_table.get_code(m_edges[0]);
|
||||
}
|
||||
|
||||
inline bool is_target(std::uint64_t base) const {
|
||||
for (const auto ch : m_edges) {
|
||||
if (m_useds[base ^ m_table.get_code(ch)]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace xcdat
|
1
sample/CMakeLists.txt
Normal file
1
sample/CMakeLists.txt
Normal file
|
@ -0,0 +1 @@
|
|||
# Build the usage example from sample.cpp.
add_executable(sample sample.cpp)
|
92
sample/sample.cpp
Normal file
92
sample/sample.cpp
Normal file
|
@ -0,0 +1,92 @@
|
|||
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
|
||||
|
||||
int main() {
|
||||
// Dataset of keywords
|
||||
std::vector<std::string> keys = {
|
||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
};
|
||||
|
||||
// The input keys must be sorted and unique (although they have already satisfied in this case).
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
// The trie dictionary type
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
|
||||
// The dictionary filename
|
||||
const char* tmp_filename = "dic.bin";
|
||||
|
||||
// Build and save the trie dictionary.
|
||||
{
|
||||
const trie_type trie(keys);
|
||||
xcdat::save(trie, tmp_filename);
|
||||
}
|
||||
|
||||
// Memory-map the trie dictionary.
|
||||
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<trie_type>(fin.data());
|
||||
|
||||
// Or, load the trie dictionary on memory.
|
||||
// const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
|
||||
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
|
||||
|
||||
// Lookup the ID for a query key.
|
||||
{
|
||||
const auto id = trie.lookup("Mac_Pro");
|
||||
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
{
|
||||
const auto id = trie.lookup("Google_Pixel");
|
||||
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
|
||||
}
|
||||
|
||||
// Decode the key for a query ID.
|
||||
{
|
||||
const auto dec = trie.decode(4);
|
||||
std::cout << "Decode(4) = " << dec << std::endl;
|
||||
}
|
||||
|
||||
// Common prefix search
|
||||
{
|
||||
std::cout << "CommonPrefixSearch(MacBook_Air) = {" << std::endl;
|
||||
auto itr = trie.make_prefix_iterator("MacBook_Air");
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
// Predictive search
|
||||
{
|
||||
std::cout << "PredictiveSearch(Mac) = {" << std::endl;
|
||||
auto itr = trie.make_predictive_iterator("Mac");
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
// Enumerate all the keys (in lex order).
|
||||
{
|
||||
std::cout << "Enumerate() = {" << std::endl;
|
||||
auto itr = trie.make_enumerative_iterator();
|
||||
while (itr.next()) {
|
||||
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
std::remove(tmp_filename);
|
||||
|
||||
return 0;
|
||||
}
|
24
tests/CMakeLists.txt
Normal file
24
tests/CMakeLists.txt
Normal file
|
@ -0,0 +1,24 @@
|
|||
# Stand-alone unit tests: one executable and one CTest entry per source file.
# The NAME/COMMAND form of add_test() lets CMake resolve the target to the built executable.
foreach(TEST_NAME test_bit_vector test_compact_vector test_tail_vector)
  add_executable(${TEST_NAME} ${TEST_NAME}.cpp)
  add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
endforeach()

# Variants of the BASE/CHECK vector; each one gets its own build of the sources below.
set(BC_OPTIONS "7" "8")

# test_bc_vector.cpp picks the implementation from the BC_VECTOR_<n> definition.
foreach(BC_OPTION ${BC_OPTIONS})
  set(TEST_SRC_NAME test_bc_vector_${BC_OPTION})
  add_executable(${TEST_SRC_NAME} test_bc_vector.cpp)
  target_compile_definitions(${TEST_SRC_NAME} PRIVATE BC_VECTOR_${BC_OPTION})
  add_test(NAME ${TEST_SRC_NAME} COMMAND ${TEST_SRC_NAME})
endforeach()

# test_trie.cpp picks the trie type from the TRIE_<n> definition.
foreach(BC_OPTION ${BC_OPTIONS})
  set(TEST_SRC_NAME test_trie_${BC_OPTION})
  add_executable(${TEST_SRC_NAME} test_trie.cpp)
  target_compile_definitions(${TEST_SRC_NAME} PRIVATE TRIE_${BC_OPTION})
  add_test(NAME ${TEST_SRC_NAME} COMMAND ${TEST_SRC_NAME})
endforeach()
|
6260
tests/doctest/doctest.h
Normal file
6260
tests/doctest/doctest.h
Normal file
File diff suppressed because it is too large
Load diff
9976
tests/keys.txt
Normal file
9976
tests/keys.txt
Normal file
File diff suppressed because it is too large
Load diff
75
tests/test_bc_vector.cpp
Normal file
75
tests/test_bc_vector.cpp
Normal file
|
@ -0,0 +1,75 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/bc_vector_7.hpp"
|
||||
#include "xcdat/bc_vector_8.hpp"
|
||||
|
||||
// Pick the implementation under test; the build defines BC_VECTOR_7 or BC_VECTOR_8.
// Both branches test for definedness, so the value given to the macro does not matter.
// If neither is defined, bc_vector_type stays undeclared and the file does not compile.
#if defined(BC_VECTOR_7)
using bc_vector_type = xcdat::bc_vector_7;
#elif defined(BC_VECTOR_8)
using bc_vector_type = xcdat::bc_vector_8;
#endif
|
||||
|
||||
// One BASE/CHECK pair used as input for the vector under test.
struct bc_unit {
    std::uint64_t base;   // compared with base(i), or with link(i) for leaves
    std::uint64_t check;  // compared with check(i)
};
|
||||
|
||||
std::vector<bc_unit> make_random_units(std::uint64_t n, std::uint64_t maxv, std::uint64_t seed = 13) {
|
||||
std::mt19937_64 engine(seed);
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, maxv);
|
||||
|
||||
std::vector<bc_unit> bc_units(n);
|
||||
for (std::uint64_t i = 0; i < n; i++) {
|
||||
bc_units[i].base = dist(engine);
|
||||
bc_units[i].check = dist(engine);
|
||||
}
|
||||
return bc_units;
|
||||
}
|
||||
|
||||
// Copy a std::vector<bool> into a bit_vector builder of the same length, bit by bit.
xcdat::bit_vector::builder to_bit_vector_builder(const std::vector<bool>& bits) {
    const std::uint64_t num_bits = bits.size();
    xcdat::bit_vector::builder builder(num_bits);
    for (std::uint64_t pos = 0; pos != num_bits; ++pos) {
        builder.set_bit(pos, bits[pos]);
    }
    return builder;
}
|
||||
|
||||
// Count the set bits. std::count comes from <algorithm>, which this file includes;
// std::accumulate would need <numeric>, which it does not.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
|
||||
|
||||
void test_bc_vector(const std::vector<bc_unit>& bc_units, const std::vector<bool>& leaves) {
|
||||
bc_vector_type bc(bc_units, to_bit_vector_builder(leaves));
|
||||
|
||||
REQUIRE_EQ(bc.num_units(), bc_units.size());
|
||||
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaves));
|
||||
|
||||
for (std::uint64_t i = 0; i < bc.num_units(); i++) {
|
||||
REQUIRE_EQ(bc.is_leaf(i), leaves[i]);
|
||||
if (leaves[i]) {
|
||||
REQUIRE_EQ(bc.link(i), bc_units[i].base);
|
||||
} else {
|
||||
REQUIRE_EQ(bc.base(i), bc_units[i].base);
|
||||
}
|
||||
REQUIRE_EQ(bc.check(i), bc_units[i].check);
|
||||
}
|
||||
}
|
||||
|
||||
// 10K units with values below the number of units; 20% of them are leaves.
TEST_CASE("Test bc_vector 10K in [0,10K)") {
    constexpr std::uint64_t num_units = 10000;
    const auto units = make_random_units(num_units, num_units - 1);
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);
    test_bc_vector(units, leaf_flags);
}
|
||||
|
||||
// 10K units with values over the full 64-bit range; 20% of them are leaves.
TEST_CASE("Test bc_vector 10K in [0,UINT64_MAX)") {
    constexpr std::uint64_t num_units = 10000;
    const auto units = make_random_units(num_units, UINT64_MAX);
    const auto leaf_flags = xcdat::test::make_random_bits(num_units, 0.2);
    test_bc_vector(units, leaf_flags);
}
|
113
tests/test_bit_vector.cpp
Normal file
113
tests/test_bit_vector.cpp
Normal file
|
@ -0,0 +1,113 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/bit_vector.hpp"
|
||||
|
||||
// Count the set bits. std::count comes from <algorithm>, which this file includes;
// std::accumulate would need <numeric>, which it does not.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
|
||||
|
||||
// Reference rank: the number of set bits in bits[0, i). 'i' may equal bits.size().
// Uses std::count from the already included <algorithm> instead of std::accumulate.
std::uint64_t rank_naive(const std::vector<bool>& bits, std::uint64_t i) {
    const auto last = bits.begin() + static_cast<std::ptrdiff_t>(i);
    return static_cast<std::uint64_t>(std::count(bits.begin(), last, true));
}
|
||||
|
||||
// Reference select: the position of the n-th set bit, counting from 0.
// Returns bits.size() when there are not that many set bits.
std::uint64_t select_naive(const std::vector<bool>& bits, std::uint64_t n) {
    std::uint64_t remaining = n;
    for (std::uint64_t pos = 0; pos < bits.size(); ++pos) {
        if (!bits[pos]) {
            continue;
        }
        if (remaining == 0) {
            return pos;
        }
        --remaining;
    }
    return bits.size();
}
|
||||
|
||||
void test_rank_select(const std::vector<bool>& bits) {
|
||||
xcdat::bit_vector bv;
|
||||
{
|
||||
xcdat::bit_vector::builder bvb(bits.size());
|
||||
for (std::uint64_t i = 0; i < bits.size(); i++) {
|
||||
bvb.set_bit(i, bits[i]);
|
||||
}
|
||||
bv = xcdat::bit_vector(bvb, true, true);
|
||||
}
|
||||
|
||||
REQUIRE_EQ(bv.size(), bits.size());
|
||||
REQUIRE_EQ(bv.num_ones(), get_num_ones(bits));
|
||||
|
||||
for (std::uint64_t i = 0; i < bits.size(); i++) {
|
||||
REQUIRE_EQ(bv[i], bits[i]);
|
||||
}
|
||||
|
||||
static constexpr std::uint64_t seed = 17;
|
||||
std::mt19937_64 engine(seed);
|
||||
|
||||
{
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, bv.size());
|
||||
for (std::uint64_t r = 0; r < 100; r++) {
|
||||
const std::uint64_t i = dist(engine);
|
||||
REQUIRE_EQ(bv.rank(i), rank_naive(bits, i));
|
||||
}
|
||||
}
|
||||
if (bv.num_ones() != 0) {
|
||||
std::uniform_int_distribution<std::uint64_t> dist(0, bv.num_ones() - 1);
|
||||
for (std::uint64_t r = 0; r < 100; r++) {
|
||||
const std::uint64_t n = dist(engine);
|
||||
REQUIRE_EQ(bv.select(n), select_naive(bits, n));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A builder sized with resize() must return every bit written with set_bit().
TEST_CASE("Test bit_vector::builder with resize") {
    const auto bits = xcdat::test::make_random_bits(10000);
    const std::uint64_t num_bits = bits.size();

    xcdat::bit_vector::builder bvb;
    bvb.resize(num_bits);

    REQUIRE_EQ(bvb.size(), bits.size());

    for (std::uint64_t i = 0; i != num_bits; ++i) {
        bvb.set_bit(i, bits[i]);
    }
    for (std::uint64_t i = 0; i != num_bits; ++i) {
        REQUIRE_EQ(bvb[i], bits[i]);
    }
}
|
||||
|
||||
// A builder filled with push_back() must have the right size and return every bit.
TEST_CASE("Test bit_vector::builder with push_back") {
    const auto bits = xcdat::test::make_random_bits(10000);
    const std::uint64_t num_bits = bits.size();

    xcdat::bit_vector::builder bvb;
    bvb.reserve(num_bits);

    for (std::uint64_t i = 0; i != num_bits; ++i) {
        bvb.push_back(bits[i]);
    }

    REQUIRE_EQ(bvb.size(), bits.size());

    for (std::uint64_t i = 0; i != num_bits; ++i) {
        REQUIRE_EQ(bvb[i], bits[i]);
    }
}
|
||||
|
||||
// Rank/select on 10K random bits with the default density.
TEST_CASE("Test rank/select operations") {
    test_rank_select(xcdat::test::make_random_bits(10000));
}
|
||||
|
||||
// Density 0.0: no bit is set, so only rank is queried.
TEST_CASE("Test rank/select operations (all zeros)") {
    test_rank_select(xcdat::test::make_random_bits(10000, 0.0));
}
|
||||
|
||||
// Density 1.1 is above every value the generator draws, so every bit is set.
TEST_CASE("Test rank/select operations (all ones)") {
    test_rank_select(xcdat::test::make_random_bits(10000, 1.1));
}
|
84
tests/test_common.hpp
Normal file
84
tests/test_common.hpp
Normal file
|
@ -0,0 +1,84 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace xcdat::test {
|
||||
|
||||
// Sort the vector and drop duplicates in place, then hand the same storage back.
template <class T>
std::vector<T> to_unique_vec(std::vector<T>&& vec) {
    std::sort(vec.begin(), vec.end());
    const auto dup_begin = std::unique(vec.begin(), vec.end());
    vec.erase(dup_begin, vec.end());
    // 'vec' is a named rvalue reference, so the explicit move is what avoids a copy here.
    return std::move(vec);
}
|
||||
|
||||
// Length of the longest key; 0 for an empty list.
// 'inline' because this is a definition in a header.
inline std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t longest = 0;
    for (const auto& key : keys) {
        longest = std::max<std::uint64_t>(longest, key.size());
    }
    return longest;
}
|
||||
|
||||
// Generate 'n' bits, each set when a uniform draw from [0, 1) is below 'dens'.
// dens <= 0 therefore gives all zeros and dens >= 1 all ones. Same seed, same bits.
// 'inline' because this is a definition in a header.
inline std::vector<bool> make_random_bits(std::uint64_t n, double dens = 0.5, std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_real_distribution<double> dist(0.0, 1.0);

    std::vector<bool> bits(n);
    for (std::uint64_t i = 0; i < n; i++) {
        bits[i] = dist(engine) < dens;
    }
    return bits;
}
|
||||
|
||||
// Generate 'n' integers drawn uniformly from [min, max]. Same seed, same integers.
// 'inline' because this is a definition in a header.
inline std::vector<std::uint64_t> make_random_ints(std::uint64_t n, std::uint64_t min, std::uint64_t max,
                                                   std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_int_distribution<std::uint64_t> dist(min, max);

    std::vector<std::uint64_t> ints(n);
    for (auto& value : ints) {
        value = dist(engine);
    }
    return ints;
}
|
||||
|
||||
// Generate 'n' random strings: each length is drawn uniformly from [min_m, max_m] and each
// character uniformly from [min_c, max_c]. Same seed, same strings. The strings are
// neither sorted nor guaranteed to be unique.
// 'inline' because this is a definition in a header.
inline std::vector<std::string> make_random_keys(std::uint64_t n, std::uint64_t min_m, std::uint64_t max_m,  //
                                                 char min_c = 'A', char max_c = 'Z', std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_int_distribution<std::uint64_t> dist_m(min_m, max_m);
    // char is not among the integer types std::uniform_int_distribution is specified for
    // (some standard libraries reject it), so draw an int and narrow the result.
    std::uniform_int_distribution<int> dist_c(min_c, max_c);

    std::vector<std::string> keys(n);
    for (auto& key : keys) {
        key.resize(dist_m(engine));
        for (auto& c : key) {
            c = static_cast<char>(dist_c(engine));
        }
    }
    return keys;
}
|
||||
|
||||
// Split 'keys' at random: each key is taken out when a uniform draw from [0, 1) is not
// above 'ratio', and kept otherwise. 'keys' is replaced by the kept keys and the taken
// ones are returned; both keep their original relative order.
// The strings are moved, not copied. 'inline' because this is a definition in a header.
inline std::vector<std::string> extract_keys(std::vector<std::string>& keys, double ratio = 0.1,
                                             std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_real_distribution<double> dist(0.0, 1.0);

    std::vector<std::string> kept;
    std::vector<std::string> extracted;

    for (auto& key : keys) {
        auto& target = (ratio < dist(engine)) ? kept : extracted;
        target.push_back(std::move(key));
    }

    keys = std::move(kept);
    return extracted;
}
|
||||
|
||||
} // namespace xcdat::test
|
41
tests/test_compact_vector.cpp
Normal file
41
tests/test_compact_vector.cpp
Normal file
|
@ -0,0 +1,41 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/compact_vector.hpp"
|
||||
|
||||
// All-zero input: the size and every element must survive.
TEST_CASE("Test compact_vector (zero)") {
    std::vector<std::uint64_t> ints(5, 0);
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());

    const std::uint64_t num_ints = ints.size();
    for (std::uint64_t i = 0; i != num_ints; ++i) {
        REQUIRE_EQ(cv[i], ints[i]);
    }
}
|
||||
|
||||
// A few hand-picked values of different magnitudes.
TEST_CASE("Test compact_vector (tiny)") {
    std::vector<std::uint64_t> ints = {2, 0, 14, 456, 32, 5544, 23};
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());

    const std::uint64_t num_ints = ints.size();
    for (std::uint64_t i = 0; i != num_ints; ++i) {
        REQUIRE_EQ(cv[i], ints[i]);
    }
}
|
||||
|
||||
// 10K random values in [0, UINT16_MAX].
TEST_CASE("Test compact_vector (random)") {
    std::vector<std::uint64_t> ints = xcdat::test::make_random_ints(10000, 0, UINT16_MAX);
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());

    const std::uint64_t num_ints = ints.size();
    for (std::uint64_t i = 0; i != num_ints; ++i) {
        REQUIRE_EQ(cv[i], ints[i]);
    }
}
|
51
tests/test_tail_vector.cpp
Normal file
51
tests/test_tail_vector.cpp
Normal file
|
@ -0,0 +1,51 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat/tail_vector.hpp"
|
||||
|
||||
void test_tail_vector(const std::vector<std::string>& sufs, bool bin_mode = false) {
|
||||
xcdat::tail_vector tvec;
|
||||
std::vector<std::uint64_t> idxs(sufs.size());
|
||||
|
||||
{
|
||||
xcdat::tail_vector::builder tvb;
|
||||
for (std::uint64_t i = 0; i < sufs.size(); i++) {
|
||||
tvb.set_suffix(sufs[i], i);
|
||||
}
|
||||
tvb.complete(bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { idxs[npos] = tpos; });
|
||||
tvec = xcdat::tail_vector(std::move(tvb));
|
||||
}
|
||||
|
||||
for (std::uint64_t i = 0; i < sufs.size(); i++) {
|
||||
REQUIRE(tvec.match(sufs[i], idxs[i]));
|
||||
}
|
||||
for (std::uint64_t i = 0; i < sufs.size(); i++) {
|
||||
std::string decoded;
|
||||
tvec.decode(idxs[i], [&](char c) { decoded.push_back(c); });
|
||||
REQUIRE_EQ(sufs[i], decoded);
|
||||
}
|
||||
}
|
||||
|
||||
// A small hand-written set that includes a duplicate ("M") and suffixes sharing an ending.
TEST_CASE("Test xcdat::tail_vector (tiny)") {
    const std::vector<std::string> sufs = {"ML", "STATS", "A", "M", "L", "AKDD", "M", "R", "DD", "OD"};
    test_tail_vector(sufs);
}
|
||||
|
||||
// 10K random suffixes of length 1..30 over the two letters 'A' and 'B'.
TEST_CASE("Test xcdat::tail_vector (random, A--B)") {
    const auto sufs = xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B');
    test_tail_vector(sufs);
}
|
||||
|
||||
// 10K random suffixes of length 1..30 over 'A'..'Z'.
TEST_CASE("Test xcdat::tail_vector (random, A--Z)") {
    const auto sufs = xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z');
    test_tail_vector(sufs);
}
|
||||
|
||||
// 10K random suffixes over INT8_MIN..INT8_MAX (null characters included), in binary mode.
TEST_CASE("Test xcdat::tail_vector (random, 0x00--0xFF)") {
    const auto sufs = xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX);
    test_tail_vector(sufs, true);
}
|
297
tests/test_trie.cpp
Normal file
297
tests/test_trie.cpp
Normal file
|
@ -0,0 +1,297 @@
|
|||
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "doctest/doctest.h"
|
||||
#include "mm_file/mm_file.hpp"
|
||||
#include "test_common.hpp"
|
||||
#include "xcdat.hpp"
|
||||
|
||||
#ifdef TRIE_7
|
||||
using trie_type = xcdat::trie_7_type;
|
||||
#elif TRIE_8
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
#endif
|
||||
|
||||
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
const std::vector<std::string>& others) {
|
||||
REQUIRE_EQ(trie.num_keys(), keys.size());
|
||||
REQUIRE_EQ(trie.max_length(), xcdat::test::max_length(keys));
|
||||
|
||||
for (std::uint64_t i = 0; i < keys.size(); i++) {
|
||||
auto id = trie.lookup(keys[i]);
|
||||
REQUIRE(id.has_value());
|
||||
REQUIRE_LT(id.value(), keys.size());
|
||||
auto decoded = trie.decode(id.value());
|
||||
REQUIRE_EQ(keys[i], decoded);
|
||||
}
|
||||
|
||||
for (std::uint64_t i = 0; i < others.size(); i++) {
|
||||
auto id = trie.lookup(others[i]);
|
||||
REQUIRE_FALSE(id.has_value());
|
||||
}
|
||||
}
|
||||
|
||||
void test_prefix_search(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
const std::vector<std::string>& others) {
|
||||
for (auto& key : keys) {
|
||||
size_t num_results = 0;
|
||||
auto itr = trie.make_prefix_iterator(key);
|
||||
|
||||
while (itr.next()) {
|
||||
const auto id = itr.id();
|
||||
const auto decoded = itr.decoded_view();
|
||||
|
||||
REQUIRE_LE(decoded.size(), key.size());
|
||||
REQUIRE_EQ(id, trie.lookup(decoded));
|
||||
REQUIRE_EQ(decoded, trie.decode(id));
|
||||
|
||||
num_results += 1;
|
||||
}
|
||||
|
||||
REQUIRE_LE(1, num_results);
|
||||
REQUIRE_LE(num_results, key.size());
|
||||
}
|
||||
|
||||
for (auto& key : others) {
|
||||
size_t num_results = 0;
|
||||
auto itr = trie.make_prefix_iterator(key);
|
||||
|
||||
while (itr.next()) {
|
||||
const auto id = itr.id();
|
||||
const auto decoded = itr.decoded_view();
|
||||
|
||||
REQUIRE_LT(decoded.size(), key.size());
|
||||
REQUIRE_EQ(id, trie.lookup(decoded));
|
||||
REQUIRE_EQ(decoded, trie.decode(id));
|
||||
|
||||
num_results += 1;
|
||||
}
|
||||
|
||||
REQUIRE_LT(num_results, key.size());
|
||||
}
|
||||
}
|
||||
|
||||
void test_predictive_search(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
const std::vector<std::string>& others) {
|
||||
for (auto& key : keys) {
|
||||
size_t num_results = 0;
|
||||
auto itr = trie.make_predictive_iterator(key);
|
||||
|
||||
while (itr.next()) {
|
||||
const auto id = itr.id();
|
||||
const auto decoded = itr.decoded_view();
|
||||
|
||||
REQUIRE_LE(key.size(), decoded.size());
|
||||
REQUIRE_EQ(id, trie.lookup(decoded));
|
||||
REQUIRE_EQ(decoded, trie.decode(id));
|
||||
|
||||
num_results += 1;
|
||||
}
|
||||
|
||||
REQUIRE_LE(1, num_results);
|
||||
}
|
||||
|
||||
for (auto& key : others) {
|
||||
auto itr = trie.make_predictive_iterator(key);
|
||||
|
||||
while (itr.next()) {
|
||||
const auto id = itr.id();
|
||||
const auto decoded = itr.decoded_view();
|
||||
|
||||
REQUIRE_LT(key.size(), decoded.size());
|
||||
REQUIRE_EQ(id, trie.lookup(decoded));
|
||||
REQUIRE_EQ(decoded, trie.decode(id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_enumerate(const trie_type& trie, const std::vector<std::string>& keys) {
|
||||
auto itr = trie.make_enumerative_iterator();
|
||||
for (auto& key : keys) {
|
||||
REQUIRE(itr.next());
|
||||
REQUIRE_EQ(itr.decoded_view(), key);
|
||||
REQUIRE_EQ(itr.id(), trie.lookup(key));
|
||||
}
|
||||
REQUIRE_FALSE(itr.next());
|
||||
}
|
||||
|
||||
void test_io(const trie_type& trie, const std::vector<std::string>& keys, const std::vector<std::string>& others) {
|
||||
const char* tmp_filepath = "tmp.idx";
|
||||
|
||||
const std::uint64_t memory = xcdat::memory_in_bytes(trie);
|
||||
REQUIRE_EQ(memory, xcdat::save(trie, tmp_filepath));
|
||||
|
||||
{
|
||||
const auto loaded = xcdat::load<trie_type>(tmp_filepath);
|
||||
REQUIRE_EQ(trie.bin_mode(), loaded.bin_mode());
|
||||
REQUIRE_EQ(trie.num_keys(), loaded.num_keys());
|
||||
REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size());
|
||||
REQUIRE_EQ(trie.max_length(), loaded.max_length());
|
||||
REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded));
|
||||
test_basic_operations(loaded, keys, others);
|
||||
}
|
||||
|
||||
{
|
||||
mm::file_source<char> fin(tmp_filepath, mm::advice::sequential);
|
||||
const auto mapped = xcdat::mmap<trie_type>(fin.data());
|
||||
REQUIRE_EQ(trie.bin_mode(), mapped.bin_mode());
|
||||
REQUIRE_EQ(trie.num_keys(), mapped.num_keys());
|
||||
REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size());
|
||||
REQUIRE_EQ(trie.max_length(), mapped.max_length());
|
||||
REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped));
|
||||
test_basic_operations(mapped, keys, others);
|
||||
}
|
||||
|
||||
std::remove(tmp_filepath);
|
||||
}
|
||||
|
||||
// Hand-written dictionary with fixed expectations for the prefix, predictive
// and enumerative iterators, followed by the save/load round trip.
TEST_CASE("Test trie_type (tiny)") {
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Air", "MacBook_Pro",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
    };
    std::vector<std::string> others = {
        "Google_Pixel", "iPad_mini", "iPadOS", "iPod", "ThinkPad",
    };

    trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);

    {
        // Stored keys that are prefixes of "MacBook_Pro".
        const std::vector<std::string> prefixes = {"Mac", "MacBook", "MacBook_Pro"};
        auto cursor = trie.make_prefix_iterator("MacBook_Pro");
        for (const std::string& want : prefixes) {
            REQUIRE(cursor.next());
            REQUIRE_EQ(cursor.decoded(), want);
            REQUIRE_EQ(cursor.id(), trie.lookup(want));
        }
        REQUIRE_FALSE(cursor.next());
    }
    {
        // Stored keys that start with "MacBook".
        const std::vector<std::string> completions = {"MacBook", "MacBook_Air", "MacBook_Pro"};
        auto cursor = trie.make_predictive_iterator("MacBook");
        for (const std::string& want : completions) {
            REQUIRE(cursor.next());
            REQUIRE_EQ(cursor.decoded(), want);
            REQUIRE_EQ(cursor.id(), trie.lookup(want));
        }
        REQUIRE_FALSE(cursor.next());
    }
    {
        // Enumeration is expected to follow the order of `keys`.
        auto cursor = trie.make_enumerative_iterator();
        for (const std::string& want : keys) {
            REQUIRE(cursor.next());
            REQUIRE_EQ(cursor.decoded(), want);
            REQUIRE_EQ(cursor.id(), trie.lookup(want));
        }
        REQUIRE_FALSE(cursor.next());
    }

    test_io(trie, keys, others);
}
|
||||
|
||||
// Full test battery on the key set read from "keys.txt".
TEST_CASE("Test trie_type (real)") {
    // `keys` stays mutable: extract_keys() receives it as an lvalue and
    // presumably moves the returned keys out of it (confirm in test_common.hpp).
    auto keys = xcdat::test::to_unique_vec(xcdat::load_strings("keys.txt"));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE_FALSE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
// Full test battery on 10000 random keys over 'A'..'B' (before deduplication).
TEST_CASE("Test trie_type (random 10K, A--B)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B'));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE_FALSE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
// Full test battery on 10000 random keys over 'A'..'Z' (before deduplication).
TEST_CASE("Test trie_type (random 10K, A--Z)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z'));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE_FALSE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
// Full test battery on 10000 random keys over the whole signed-char range;
// the trie is expected to report binary mode for this input.
TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
// Full test battery on 100000 random keys over 'A'..'B' (before deduplication).
TEST_CASE("Test trie_type (random 100K, A--B)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'B'));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE_FALSE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
// Full test battery on 100000 random keys over 'A'..'Z' (before deduplication).
TEST_CASE("Test trie_type (random 100K, A--Z)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'Z'));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE_FALSE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
|
||||
// Full test battery on 100000 random keys over the whole signed-char range;
// the trie is expected to report binary mode for this input.
TEST_CASE("Test trie_type (random 100K, 0x00--0xFF)") {
    // `keys` stays mutable because extract_keys() receives it as an lvalue.
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, INT8_MIN, INT8_MAX));
    auto others = xcdat::test::extract_keys(keys);

    trie_type dict(keys);
    REQUIRE(dict.bin_mode());

    test_basic_operations(dict, keys, others);
    test_prefix_search(dict, keys, others);
    test_predictive_search(dict, keys, others);
    test_enumerate(dict, keys);
    test_io(dict, keys, others);
}
|
||||
#endif
|
14
tools/CMakeLists.txt
Normal file
14
tools/CMakeLists.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
# Command-line tools; each entry NAME is built from NAME.cpp in this directory.
set(XCDAT_FILES
    xcdat_build
    xcdat_lookup
    xcdat_decode
    xcdat_prefix_search
    xcdat_predictive_search
    xcdat_enumerate
    xcdat_benchmark
)

# One executable per tool, installed into <prefix>/bin.
foreach(TOOL_NAME IN LISTS XCDAT_FILES)
    add_executable(${TOOL_NAME} ${TOOL_NAME}.cpp)
    install(TARGETS ${TOOL_NAME} RUNTIME DESTINATION bin)
endforeach()
|
158
tools/cmd_line_parser/parser.hpp
Normal file
158
tools/cmd_line_parser/parser.hpp
Normal file
|
@ -0,0 +1,158 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace cmd_line_parser {
|
||||
|
||||
/// Small command-line parser.
///
/// Arguments registered with the two-parameter add() are required and
/// positional. Arguments registered with a shorthand are optional and are
/// given after the required ones as `<shorthand>` (boolean flag) or
/// `<shorthand> <value>`.
struct parser {
    inline static const std::string empty = "";

    parser(int argc, char** argv) : m_argc(argc), m_argv(argv), m_required(0) {}

    /// One registered argument: its shorthand (empty for required arguments),
    /// the raw value, the help description and whether it is a boolean flag.
    struct cmd {
        std::string shorthand, value, descr;
        bool is_boolean;
    };

    /// Parses the argument vector given to the constructor.
    ///
    /// Prints the help text and returns false on `-h`/`--help`, when fewer
    /// arguments than required ones are given, on an unknown shorthand, or when
    /// a non-boolean option is the last argument and so has no value.
    bool parse() {
        if (size_t(m_argc - 1) < m_required) return abort();
        size_t k = 0;
        for (int i = 1; i != m_argc; ++i, ++k) {
            std::string parsed(m_argv[i]);
            if (parsed == "-h" or parsed == "--help") return abort();
            // The first m_required arguments are positional; the rest are
            // resolved through their shorthand.
            size_t id = k;
            bool is_optional = id >= m_required;
            if (is_optional) {
                auto it = m_shorthands.find(parsed);
                if (it == m_shorthands.end()) {
                    std::cerr << "== error: shorthand '" + parsed + "' not found" << std::endl;
                    return abort();
                }
                id = (*it).second;
            }
            assert(id < m_names.size());
            auto const& name = m_names[id];
            auto& c = m_cmds[name];
            if (is_optional) {
                if (c.is_boolean) {
                    // The presence of a flag means "true".
                    parsed = "true";
                } else {
                    // A valued option consumes the next argument.
                    ++i;
                    if (i == m_argc) return abort();
                    parsed = m_argv[i];
                }
            }
            c.value = parsed;
        }
        return true;
    }

    /// Writes a usage line followed by the per-argument descriptions to stderr.
    void help() const {
        // "\033" is the standard spelling of ESC; "\e" is a compiler extension.
        std::cerr << "Usage: \033[1m" << m_argv[0] << "\033[0m [-h,--help]";
        auto print = [this](bool with_description) {
            for (size_t i = 0; i != m_names.size(); ++i) {
                auto const& c = m_cmds.at(m_names[i]);
                bool is_optional = i >= m_required;
                if (is_optional) std::cerr << " [\033[1m" << c.shorthand << "\033[0m";
                if (!c.is_boolean) std::cerr << " \033[4m" << m_names[i] << "\033[0m";
                if (is_optional) std::cerr << "]";
                if (with_description) std::cerr << "\n\t" << c.descr << "\n";
            }
        };
        print(false);
        std::cerr << "\n\n";
        print(true);
        std::cerr << " [-h,--help]\n\tPrint this help text and silently exits." << std::endl;
    }

    /// Registers a required positional argument.
    /// Returns false (and registers nothing) if `name` is already known.
    bool add(std::string const& name, std::string const& descr) {
        bool ret = m_cmds.emplace(name, cmd{empty, empty, descr, false}).second;
        if (ret) {
            m_names.push_back(name);
            m_required += 1;
        }
        return ret;
    }

    /// Registers an optional argument introduced by `shorthand`. Boolean
    /// arguments start with the value "false"; others start empty.
    /// Returns false (and registers nothing) if `name` is already known.
    bool add(std::string const& name, std::string const& descr, std::string const& shorthand, bool is_boolean = true) {
        bool ret = m_cmds.emplace(name, cmd{shorthand, is_boolean ? "false" : empty, descr, is_boolean}).second;
        if (ret) {
            m_names.push_back(name);
            m_shorthands.emplace(shorthand, m_names.size() - 1);
        }
        return ret;
    }

    /// Returns the value of `name` converted to T.
    /// Throws std::runtime_error if `name` was never registered.
    template <typename T>
    T get(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end()) {
            throw std::runtime_error("error: '" + name + "' not found");
        }
        auto const& value = (*it).second.value;
        return parse<T>(value);
    }

    // added by Kampersanda
    /// Like get<T>(name), but yields `default_value` when `name` is unknown or
    /// has an empty value.
    template <typename T>
    T get(std::string const& name, const T& default_value) const {
        return parsed(name) ? get<T>(name) : default_value;
    }

    /// True if `name` is registered and currently holds a non-empty value.
    bool parsed(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end() or (*it).second.value == empty) return false;
        return true;
    }

    /// Converts the raw string `value` to T (string, character, integer,
    /// floating-point or bool).
    template <typename T>
    T parse(std::string const& value) const {
        if constexpr (std::is_same<T, std::string>::value) {
            return value;
        } else if constexpr (std::is_same<T, char>::value or std::is_same<T, signed char>::value or
                             std::is_same<T, unsigned char>::value) {
            // front() on an empty string is undefined; map "no value" to NUL.
            return value.empty() ? T{} : static_cast<T>(value.front());
        } else if constexpr (std::is_same<T, unsigned int>::value or std::is_same<T, int>::value or
                             std::is_same<T, unsigned short int>::value or std::is_same<T, short int>::value) {
            return std::atoi(value.c_str());
        } else if constexpr (std::is_same<T, unsigned long int>::value or std::is_same<T, long int>::value or
                             std::is_same<T, unsigned long long int>::value or std::is_same<T, long long int>::value) {
            return std::atoll(value.c_str());
        } else if constexpr (std::is_same<T, float>::value or std::is_same<T, double>::value or
                             std::is_same<T, long double>::value) {
            return std::atof(value.c_str());
        } else if constexpr (std::is_same<T, bool>::value) {
            std::istringstream stream(value);
            // Initialised so that a failed extraction (e.g. empty input, where
            // nothing is stored) yields false instead of an indeterminate value.
            bool ret = false;
            if (value == "true" or value == "false") {
                stream >> std::boolalpha >> ret;
            } else {
                stream >> std::noboolalpha >> ret;
            }
            return ret;
        }
        assert(false);
        __builtin_unreachable();
    }

  private:
    int m_argc;
    char** m_argv;
    size_t m_required;                             // number of required (positional) arguments
    std::unordered_map<std::string, cmd> m_cmds;   // argument name -> definition and value
    std::unordered_map<std::string, int> m_shorthands;  // shorthand -> index into m_names
    std::vector<std::string> m_names;              // argument names in registration order

    /// Prints the help text and reports failure.
    bool abort() const {
        help();
        return false;
    }
};
|
||||
|
||||
} // namespace cmd_line_parser
|
1155
tools/tinyformat/tinyformat.h
Normal file
1155
tools/tinyformat/tinyformat.h
Normal file
File diff suppressed because it is too large
Load diff
148
tools/xcdat_benchmark.cpp
Normal file
148
tools/xcdat_benchmark.cpp
Normal file
|
@ -0,0 +1,148 @@
|
|||
#include <chrono>
|
||||
#include <random>
|
||||
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
static constexpr int num_trials = 10;
|
||||
|
||||
// Declares the benchmark's command line: one required input path followed by
// three optional valued arguments (-n, -s, -b).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    // Required positional argument.
    cli.add("input_keys", "Input filepath of keywords");
    // Optional arguments; `false` marks them as taking a value.
    cli.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
    cli.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
    cli.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
    return cli;
}
|
||||
|
||||
// Draws `num_samples` keys from `keys` uniformly at random, with replacement,
// using a 64-bit Mersenne Twister seeded with `random_seed`.
//
// The returned views point into the strings owned by `keys`, so `keys` must
// outlive the result. Returns an empty vector when `keys` is empty, since
// there is nothing to sample from.
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
                                          std::uint64_t random_seed) {
    if (keys.empty()) {
        // keys.size() - 1 would wrap around and make every index out of range.
        return {};
    }

    std::mt19937_64 engine(random_seed);
    std::uniform_int_distribution<std::uint64_t> dist(0, keys.size() - 1);

    std::vector<std::string_view> sampled_keys(num_samples);
    for (auto& sampled : sampled_keys) {
        sampled = std::string_view(keys[dist(engine)]);
    }
    return sampled_keys;
}
|
||||
|
||||
// Looks up every key in `keys` and returns the ids in the same order.
// Each key must be present in `trie`: the optional returned by lookup() is
// unwrapped with value().
template <class Trie>
std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
    std::vector<std::uint64_t> ids;
    ids.reserve(keys.size());
    for (const std::string_view key : keys) {
        ids.push_back(trie.lookup(key).value());
    }
    return ids;
}
|
||||
|
||||
template <class Trie>
|
||||
Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
Trie trie(keys, binary_mode);
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
|
||||
const double time_in_sec = dur_ms.count() / 1000.0;
|
||||
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
|
||||
|
||||
tfm::printfln("Number of keys: %d", trie.num_keys());
|
||||
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
tfm::printfln("Construction time in seconds: %g", time_in_sec);
|
||||
|
||||
return trie;
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& queries) {
|
||||
// Warmup
|
||||
volatile std::uint64_t tmp = 0;
|
||||
for (const auto& query : queries) {
|
||||
tmp += trie.lookup(query).value();
|
||||
}
|
||||
|
||||
// Measure
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
for (int r = 0; r < num_trials; r++) {
|
||||
for (const auto& query : queries) {
|
||||
tmp += trie.lookup(query).value();
|
||||
}
|
||||
}
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
|
||||
const auto elapsed_us = static_cast<double>(dur_us.count());
|
||||
|
||||
tfm::printfln("Lookup time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
|
||||
// Warmup
|
||||
volatile std::uint64_t tmp = 0;
|
||||
for (const std::uint64_t query : queries) {
|
||||
tmp += trie.decode(query).size();
|
||||
}
|
||||
|
||||
// Measure
|
||||
const auto start_tp = std::chrono::high_resolution_clock::now();
|
||||
for (int r = 0; r < num_trials; r++) {
|
||||
for (const std::uint64_t query : queries) {
|
||||
tmp += trie.decode(query).size();
|
||||
}
|
||||
}
|
||||
const auto stop_tp = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
|
||||
const auto elapsed_us = static_cast<double>(dur_us.count());
|
||||
|
||||
tfm::printfln("Decode time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
|
||||
}
|
||||
|
||||
template <class Trie>
|
||||
void benchmark(std::vector<std::string> keys, const std::vector<std::string_view>& query_keys, bool binary_mode) {
|
||||
const auto trie = benchmark_build<Trie>(keys, binary_mode);
|
||||
const auto query_ids = extract_ids(trie, query_keys);
|
||||
|
||||
benchmark_lookup(trie, query_keys);
|
||||
benchmark_decode(trie, query_ids);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto num_samples = p.get<std::uint64_t>("num_samples", 1000);
|
||||
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
tfm::errorfln("Error: The input dataset is empty.");
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
const auto query_keys = sample_keys(keys, num_samples, random_seed);
|
||||
|
||||
tfm::printfln("** xcdat::trie_7_type **");
|
||||
benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);
|
||||
|
||||
tfm::printfln("** xcdat::trie_8_type **");
|
||||
benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);
|
||||
|
||||
return 0;
|
||||
}
|
67
tools/xcdat_build.cpp
Normal file
67
tools/xcdat_build.cpp
Normal file
|
@ -0,0 +1,67 @@
|
|||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the build tool's command line: two required paths followed by two
// optional valued arguments (-t, -b).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    // Required positional arguments, in this order.
    cli.add("input_keys", "Input filepath of keywords");
    cli.add("output_dic", "Output filepath of trie dictionary");
    // Optional arguments; `false` marks them as taking a value.
    cli.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
    cli.add("binary_mode", "Is binary mode? (default=0)", "-b", false);
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int build(const cmd_line_parser::parser& p) {
|
||||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto output_dic = p.get<std::string>("output_dic");
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
tfm::errorfln("Error: The input dataset is empty.");
|
||||
}
|
||||
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
const Trie trie(keys, binary_mode);
|
||||
const double memory_in_bytes = xcdat::memory_in_bytes(trie);
|
||||
|
||||
tfm::printfln("Number of keys: %d", trie.num_keys());
|
||||
tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
|
||||
tfm::printfln("Number of DA units: %d", trie.num_units());
|
||||
tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
|
||||
tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
|
||||
|
||||
xcdat::save(trie, output_dic);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto trie_type = p.get<int>("trie_type", 7);
|
||||
|
||||
switch (trie_type) {
|
||||
case 7:
|
||||
return build<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return build<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
53
tools/xcdat_decode.cpp
Normal file
53
tools/xcdat_decode.cpp
Normal file
|
@ -0,0 +1,53 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the decode tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int decode(const cmd_line_parser::parser& p) {
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
for (std::uint64_t id; std::cin >> id;) {
|
||||
const auto dec = trie.decode(id);
|
||||
tfm::printfln("%d\t%s", id, dec);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
return decode<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return decode<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
50
tools/xcdat_enumerate.cpp
Normal file
50
tools/xcdat_enumerate.cpp
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the enumerate tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int enumerate(const cmd_line_parser::parser& p) {
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
trie.enumerate([&](std::uint64_t id, std::string_view str) { tfm::printfln("%d\t%s", id, str); });
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
return enumerate<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return enumerate<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
57
tools/xcdat_lookup.cpp
Normal file
57
tools/xcdat_lookup.cpp
Normal file
|
@ -0,0 +1,57 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the lookup tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int lookup(const cmd_line_parser::parser& p) {
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
for (std::string str; std::getline(std::cin, str);) {
|
||||
const auto id = trie.lookup(str);
|
||||
if (id.has_value()) {
|
||||
tfm::printfln("%d\t%s", id.value(), str);
|
||||
} else {
|
||||
tfm::printfln("-1\t%s", str);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
return lookup<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return lookup<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
70
tools/xcdat_predictive_search.cpp
Normal file
70
tools/xcdat_predictive_search.cpp
Normal file
|
@ -0,0 +1,70 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the predictive-search tool's command line: a required dictionary
// path followed by an optional result limit (-n).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    // Optional; `false` marks it as taking a value.
    cli.add("max_num_results", "The max number of results (default=10)", "-n", false);
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int predictive_search(const cmd_line_parser::parser& p) {
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto max_num_results = p.get<std::uint64_t>("max_num_results", 10);
|
||||
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
struct result_type {
|
||||
std::uint64_t id;
|
||||
std::string str;
|
||||
};
|
||||
std::vector<result_type> results;
|
||||
results.reserve(1ULL << 10);
|
||||
|
||||
for (std::string key; std::getline(std::cin, key);) {
|
||||
results.clear();
|
||||
trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) {
|
||||
results.push_back({id, std::string(str)});
|
||||
});
|
||||
|
||||
tfm::printfln("%d found", results.size());
|
||||
for (std::uint64_t i = 0; i < std::min<std::uint64_t>(results.size(), max_num_results); i++) {
|
||||
const auto& r = results[i];
|
||||
tfm::printfln("%d\t%s", r.id, r.str);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
return predictive_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return predictive_search<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
66
tools/xcdat_prefix_search.cpp
Normal file
66
tools/xcdat_prefix_search.cpp
Normal file
|
@ -0,0 +1,66 @@
|
|||
#include <mm_file/mm_file.hpp>
|
||||
#include <xcdat.hpp>
|
||||
|
||||
#include "cmd_line_parser/parser.hpp"
|
||||
#include "tinyformat/tinyformat.h"
|
||||
|
||||
// Declares the prefix-search tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
|
||||
|
||||
template <class Trie>
|
||||
int prefix_search(const cmd_line_parser::parser& p) {
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
|
||||
const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
|
||||
const auto trie = xcdat::mmap<Trie>(fin.data());
|
||||
|
||||
struct result_type {
|
||||
std::uint64_t id;
|
||||
std::string_view str;
|
||||
};
|
||||
|
||||
std::vector<result_type> results;
|
||||
results.reserve(trie.max_length());
|
||||
|
||||
for (std::string key; std::getline(std::cin, key);) {
|
||||
results.clear();
|
||||
trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) { results.push_back({id, str}); });
|
||||
|
||||
tfm::printfln("%d found", results.size());
|
||||
for (const auto& r : results) {
|
||||
tfm::printfln("%d\t%s", r.id, r.str);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#ifndef NDEBUG
|
||||
tfm::warnfln("The code is running in debug mode.");
|
||||
#endif
|
||||
std::ios::sync_with_stdio(false);
|
||||
|
||||
auto p = make_parser(argc, argv);
|
||||
if (!p.parse()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
case 7:
|
||||
return prefix_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
return prefix_search<xcdat::trie_8_type>(p);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p.help();
|
||||
return 1;
|
||||
}
|
Loading…
Reference in a new issue