Merge branch 'development'

# Conflicts:
#	CMakeLists.txt
#	LICENSE
#	README.md
#	include/xcdat.hpp
#	sample/CMakeLists.txt
#	sample/sample.cpp
This commit is contained in:
Shunsuke Kanda 2021-07-02 22:00:25 +09:00
commit 0522882198
43 changed files with 22033 additions and 1 deletions

113
.clang-format Normal file
View file

@ -0,0 +1,113 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Latest
TabWidth: 8
UseTab: Never
...

3
.gitignore vendored
View file

@ -29,8 +29,9 @@
*.app
# My Definition
build/
build*/
cmake-build-debug/
.idea/
.DS_Store
include/xcdat/xcdat_config.hpp
.vscode/

48
CMakeLists.txt Normal file
View file

@ -0,0 +1,48 @@
# Top-level build script for the header-only Xcdat library: it configures the
# C++17 toolchain flags, builds the samples/tools/tests, and installs the headers.
cmake_minimum_required(VERSION 3.0)
project(xcdat VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)

# Fall back to an optimized build when the user did not pick a build type.
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif ()

# Normalize the compiler identification into two convenience flags.
if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
    set(CMAKE_COMPILER_IS_CLANGXX 1)
endif ()

if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(CMAKE_COMPILER_IS_GNUCXX 1)
endif ()

# C++17 compiler check
if ((CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 7.0) OR (CMAKE_COMPILER_IS_CLANGXX AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.0))
    message(FATAL_ERROR "Your C++ compiler does not support C++17. Please install g++ 7.0 (or greater) or clang 4.0 (or greater)")
else ()
    message(STATUS "Compiler is recent enough to support C++17.")
endif ()

# Flags shared by every configuration, followed by the per-configuration ones.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1z -pthread -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -O3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer -O0 -g -DDEBUG")

message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
message(STATUS "CXX_FLAGS are ${CMAKE_CXX_FLAGS}")
message(STATUS "CXX_FLAGS_DEBUG are ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")

include_directories(include)

add_subdirectory(sample)
add_subdirectory(tools)

enable_testing()
add_subdirectory(tests)

# Copy the test keyset next to the test binaries. CMAKE_CURRENT_SOURCE_DIR is
# used (rather than CMAKE_SOURCE_DIR) so that the path still points into this
# project when it is included from a parent project via add_subdirectory().
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tests)

# Install the library
file(GLOB XCDAT_HEADER_FILES include/xcdat/*.hpp)
file(GLOB MM_HEADER_FILES include/mm_file/*.hpp)
install(FILES include/xcdat.hpp DESTINATION include)
install(FILES ${XCDAT_HEADER_FILES} DESTINATION include/xcdat)
install(FILES ${MM_HEADER_FILES} DESTINATION include/mm_file)

21
LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Shunsuke Kanda
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

520
README.md Normal file
View file

@ -0,0 +1,520 @@
# Xcdat: Fast compressed trie dictionary library
**Xcdat** is a C++17 header-only library of a fast compressed string dictionary based on an improved double-array trie structure described in the paper: [Compressed double-array tries for string dictionaries supporting fast lookup](https://doi.org/10.1007/s10115-016-0999-8), *Knowledge and Information Systems*, 2017, available [here](https://kampersanda.github.io/pdf/KAIS2017.pdf).
## Table of contents
- [Features](#features)
- [Build instructions](#build-instructions)
- [Command line tools](#command-line-tools)
- [Sample usage](#sample-usage)
- [API](#api)
- [Performance](#performance)
- [Licensing](#licensing)
- [Todo](#todo)
- [References](#references)
## Features
- **Compressed string dictionary.** Xcdat implements a (static) *compressed string dictionary* that stores a set of strings (or keywords) in a compressed space while supporting several search operations [1,2]. For example, Xcdat can store an entire set of English Wikipedia titles at half the size of the raw data.
- **Fast and compact data structure.** Xcdat employs the *double-array trie* [3] known as the fastest trie implementation. However, the double-array trie resorts to many pointers and consumes a large amount of memory. To address this, Xcdat applies the *XCDA* method [2] that represents the double-array trie in a compressed format while maintaining the fast searches.
- **Cache efficiency.** Xcdat employs a *minimal-prefix trie* [4] that replaces redundant trie nodes into strings to reduce random access and to improve locality of references.
- **Dictionary encoding.** Xcdat maps `N` distinct keywords into unique IDs from `[0,N-1]`, and supports the two symmetric operations: `lookup` returns the ID corresponding to a given keyword; `decode` returns the keyword associated with a given ID. The mapping is the so-called *dictionary encoding* (or *domain encoding*) and is fundamental in many DB applications as described by Martínez-Prieto et al. [1] or Müller et al. [5].
- **Prefix search operations.** Xcdat supports prefix search operations realized by trie search algorithms: `prefix_search` returns all the keywords contained as prefixes of a given string; `predictive_search` returns all the keywords starting with a given string. These will be useful in many NLP applications such as auto completions [6], stemmed searches [7], or input method editors [8].
- **64-bit support.** As mentioned before, since the double array is a pointer-based data structure, most double-array libraries use 32-bit pointers to reduce memory consumption, resulting in limiting the scale of the input dataset. On the other hand, the XCDA method allows Xcdat to represent 64-bit pointers without sacrificing memory efficiency.
- **Binary key support.** In normal mode, Xcdat will use the `\0` character as an end marker for each keyword. However, if the dataset includes `\0` characters, it will use bit flags instead of end markers, allowing the dataset to consist of binary keywords.
- **Memory mapping.** Xcdat supports *memory mapping*, allowing data to be deserialized quickly without loading it into memory. Of course, deserialization by the loading is also supported.
- **Header only.** The library consists only of header files, and you can easily install it.
## Build instructions
You can download, compile, and install Xcdat with the following commands.
```
$ git clone https://github.com/kampersanda/xcdat.git
$ cd xcdat
$ mkdir build
$ cd build
$ cmake ..
$ make -j
$ make install
```
Or, since this library consists only of header files, you can easily install it by passing through the path to the directory `include`.
### Requirements
You need to install a modern C++17 ready compiler such as `g++ >= 7.0` or `clang >= 4.0`. For the build system, you need to install `CMake >= 3.0` to compile the library.
The library assumes a 64-bit operating system. The code has been tested only on Mac OS X and Linux; that is, only UNIX-compatible operating systems are supported.
## Command line tools
Xcdat provides command line tools to build the dictionary and perform searches, which are inspired by [marisa-trie](https://github.com/s-yata/marisa-trie). All the tools will print the command line options by specifying the parameter `-h`.
### `xcdat_build`
It builds the trie dictionary from a given dataset consisting of keywords separated by newlines. The following command builds the trie dictionary from dataset `enwiki-titles.txt` and writes the dictionary into file `idx.bin`.
```
$ xcdat_build enwiki-titles.txt idx.bin
Number of keys: 15955763
Number of trie nodes: 36441058
Number of DA units: 36520704
Memory usage in bytes: 1.70618e+08
Memory usage in MiB: 162.714
```
### `xcdat_lookup`
It tests the `lookup` operation for a given dictionary. Given a query string via `stdin`, it prints the associated ID if found, or `-1` otherwise.
```
$ xcdat_lookup idx.bin
Algorithm
1255938 Algorithm
Double_Array
-1 Double_Array
```
### `xcdat_decode`
It tests the `decode` operation for a given dictionary. Given a query ID via `stdin`, it prints the corresponding keyword if the ID is in the range `[0,N-1]`, where `N` is the number of stored keywords.
```
$ xcdat_decode idx.bin
1255938
1255938 Algorithm
```
### `xcdat_prefix_search`
It tests the `prefix_search` operation for a given dictionary. Given a query string via `stdin`, it prints all the keywords contained as prefixes of a given string.
```
$ xcdat_prefix_search idx.bin
Algorithmic
6 found
57 A
798460 Al
1138004 Alg
1253024 Algo
1255938 Algorithm
1255931 Algorithmic
```
### `xcdat_predictive_search`
It tests the `predictive_search` operation for a given dictionary. Given a query string via `stdin`, it prints the first `n` keywords starting with a given string, where `n` is one of the parameters.
```
$ xcdat_predictive_search idx.bin -n 3
Algorithm
263 found
1255938 Algorithm
1255944 Algorithm's_optimality
1255972 Algorithm_(C++)
```
### `xcdat_enumerate`
It prints all the keywords stored in a given dictionary.
```
$ xcdat_enumerate idx.bin | head -3
0 !
107 !!
138 !!!
```
### `xcdat_benchmark`
It measures the performance of the possible trie types for a given dataset. To perform search operations, it randomly samples `n` queries from the dataset, where `n` is one of the parameters.
```
$ xcdat_benchmark enwiki-titles.txt
** xcdat::trie_7_type **
Number of keys: 15955763
Memory usage in bytes: 1.70618e+08
Memory usage in MiB: 162.714
Construction time in seconds: 12.907
Lookup time in microsec/query: 0.4674
Decode time in microsec/query: 0.8722
** xcdat::trie_8_type **
Number of keys: 15955763
Memory usage in bytes: 1.64104e+08
Memory usage in MiB: 156.502
Construction time in seconds: 13.442
Lookup time in microsec/query: 0.7593
Decode time in microsec/query: 1.2341
```
## Sample usage
`sample/sample.cpp` provides a sample usage. It employs the external library [mm_file](https://github.com/jermp/mm_file) to implement a memory-mapped file, which will be installed by `make install` together.
```c++
#include <iostream>
#include <string>
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
int main() {
// Dataset
std::vector<std::string> keys = {
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
};
// The input keys must be sorted and unique (although they already satisfy both conditions in this case).
std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
// The trie dictionary type
using trie_type = xcdat::trie_8_type;
// The dictionary filename
const char* tmp_filename = "dic.bin";
// Build and save the trie dictionary.
{
const trie_type trie(keys);
xcdat::save(trie, tmp_filename);
}
// Memory-map the trie dictionary.
const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
const auto trie = xcdat::mmap<trie_type>(fin.data());
// Or, load the trie dictionary on memory.
// const auto trie = xcdat::load<trie_type>(tmp_filename);
// Basic statistics
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
std::cout << "Number of DA units: " << trie.num_units() << std::endl;
std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << std::endl;
// Lookup the ID for a query key.
{
const auto id = trie.lookup("Mac_Pro");
std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << std::endl;
}
{
const auto id = trie.lookup("Google_Pixel");
std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << std::endl;
}
// Decode the key for a query ID.
{
const auto dec = trie.decode(4);
std::cout << "Decode(4) = " << dec << std::endl;
}
// Common prefix search
{
std::cout << "CommonPrefixSearch(MacBook_Air) = {" << std::endl;
auto itr = trie.make_prefix_iterator("MacBook_Air");
while (itr.next()) {
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
}
std::cout << "}" << std::endl;
}
// Predictive search
{
std::cout << "PredictiveSearch(Mac) = {" << std::endl;
auto itr = trie.make_predictive_iterator("Mac");
while (itr.next()) {
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
}
std::cout << "}" << std::endl;
}
// Enumerate all the keys (in lex order).
{
std::cout << "Enumerate() = {" << std::endl;
auto itr = trie.make_enumerative_iterator();
while (itr.next()) {
std::cout << " (" << itr.decoded_view() << ", " << itr.id() << ")," << std::endl;
}
std::cout << "}" << std::endl;
}
std::remove(tmp_filename);
return 0;
}
```
The output will be
```
Number of keys: 12
Number of trie nodes: 28
Number of DA units: 256
Memory usage in bytes: 1766
Lookup(Mac_Pro) = 7
Lookup(Google_Pixel) = 18446744073709551615
Decode(4) = MacBook_Air
CommonPrefixSearch(MacBook_Air) = {
(Mac, 1),
(MacBook, 2),
(MacBook_Air, 4),
}
PredictiveSearch(Mac) = {
(Mac, 1),
(MacBook, 2),
(MacBook_Air, 4),
(MacBook_Pro, 11),
(Mac_Mini, 5),
(Mac_Pro, 7),
}
Enumerate() = {
(AirPods, 0),
(AirTag, 3),
(Mac, 1),
(MacBook, 2),
(MacBook_Air, 4),
(MacBook_Pro, 11),
(Mac_Mini, 5),
(Mac_Pro, 7),
(iMac, 10),
(iPad, 6),
(iPhone, 8),
(iPhone_SE, 9),
}
```
## API
Xcdat can be used by including `xcdat.hpp`.
### Trie dictionary types
Two dictionary types are defined.
- `xcdat::trie_8_type` is the trie dictionary using standard DACs [9] using 8-bit integers for elements.
- `xcdat::trie_7_type` is the trie dictionary using pointer-based DACs [2] using 7-bit integers for elements.
### Trie dictionary class
The trie dictionary has the following members.
```c++
//! A compressed string dictionary based on an improved double-array trie.
//! 'BcVector' is the data type of Base and Check vectors.
template <class BcVector>
class trie {
public:
//! Default constructor
trie() = default;
//! Default destructor
virtual ~trie() = default;
//! Copy constructor (deleted)
trie(const trie&) = delete;
//! Copy assignment operator (deleted)
trie& operator=(const trie&) = delete;
//! Move constructor
trie(trie&&) noexcept = default;
//! Move assignment operator
trie& operator=(trie&&) noexcept = default;
//! Build the trie from the input keywords, which are lexicographically sorted and unique.
//!
//! If bin_mode = false, the NULL character is used for the termination of a keyword.
//! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
//! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
//!
//! The type 'Strings' and 'Strings::value_type' should be a random iterable container such as std::vector.
//! Precisely, they should support the following operations:
//! - size() returns the container size.
//! - operator[](i) accesses the i-th element.
//! - begin() returns the iterator to the beginning.
//! - end() returns the iterator to the end.
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
template <class Strings>
trie(const Strings& keys, bool bin_mode = false);
//! Check if the binary mode.
bool bin_mode() const;
//! Get the number of stored keywords.
std::uint64_t num_keys() const;
//! Get the alphabet size.
std::uint64_t alphabet_size() const;
//! Get the maximum length of keywords.
std::uint64_t max_length() const;
//! Get the number of trie nodes.
std::uint64_t num_nodes() const;
//! Get the number of DA units.
std::uint64_t num_units() const;
//! Get the number of unused DA units.
std::uint64_t num_free_units() const;
//! Get the length of TAIL vector.
std::uint64_t tail_length() const;
//! Lookup the ID of the keyword.
std::optional<std::uint64_t> lookup(std::string_view key) const;
//! Decode the keyword associated with the ID.
std::string decode(std::uint64_t id) const;
//! Decode the keyword associated with the ID and store it in 'decoded'.
//! It can avoid reallocation of memory to store the result.
void decode(std::uint64_t id, std::string& decoded) const;
//! An iterator class for common prefix search.
//! It enumerates all the keywords contained as prefixes of a given string.
//! It should be instantiated via the function 'make_prefix_iterator'.
class prefix_iterator {
public:
prefix_iterator() = default;
//! Increment the iterator.
//! Return false if the iteration is terminated.
bool next();
//! Get the result ID.
std::uint64_t id() const;
//! Get the result keyword.
std::string decoded() const;
//! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration.
std::string_view decoded_view() const;
};
//! Make the common prefix searcher for the given keyword.
prefix_iterator make_prefix_iterator(std::string_view key) const;
//! Perform common prefix search for the keyword.
void prefix_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for predictive search.
//! It enumerates all the keywords starting with a given string.
//! It should be instantiated via the function 'make_predictive_iterator'.
class predictive_iterator {
public:
predictive_iterator() = default;
//! Increment the iterator.
//! Return false if the iteration is terminated.
bool next();
//! Get the result ID.
std::uint64_t id() const;
//! Get the result keyword.
std::string decoded() const;
//! Get the reference to the result keyword.
//! Note that the referenced data will be changed in the next iteration.
std::string_view decoded_view() const;
};
//! Make the predictive searcher for the keyword.
predictive_iterator make_predictive_iterator(std::string_view key) const;
//! Perform predictive search for the keyword.
void predictive_search(std::string_view key, const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! An iterator class for enumeration.
//! It enumerates all the keywords stored in the trie.
//! It should be instantiated via the function 'make_enumerative_iterator'.
using enumerative_iterator = predictive_iterator;
//! Make the iterator for enumerating all the stored keywords.
enumerative_iterator make_enumerative_iterator() const;
//! Enumerate all the keywords and their IDs stored in the trie.
void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const;
//! Visit the members (commonly used for I/O).
template <class Visitor>
void visit(Visitor& visitor);
};
```
### I/O handlers
`xcdat.hpp` provides some functions for handling I/O operations.
```c++
//! Set the continuous memory block to a new trie instance.
template <class Trie>
Trie mmap(const char* address);
//! Load the trie dictionary from the file.
template <class Trie>
Trie load(std::string_view filepath);
//! Save the trie dictionary to the file and returns the file size in bytes.
template <class Trie>
std::uint64_t save(const Trie& idx, std::string_view filepath);
//! Get the dictionary size in bytes.
template <class Trie>
std::uint64_t memory_in_bytes(const Trie& idx);
//! Get the flag indicating the trie type, embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
std::uint32_t get_flag(std::string_view filepath);
//! Load the keywords from the file.
std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n');
```
## Performance
To be added...
## Licensing
This library is free software provided under the MIT License.
If you use the library in academic settings, please cite the following paper.
```
@article{kanda2017compressed,
title={Compressed double-array tries for string dictionaries supporting fast lookup},
author={Kanda, Shunsuke and Morita, Kazuhiro and Fuketa, Masao},
journal={Knowledge and Information Systems (KAIS)},
volume={51},
number={3},
pages={1023--1042},
year={2017},
publisher={Springer}
}
```
## Todo
- Support other language bindings.
- Add SIMD-ization.
## References
1. M. A. Martínez-Prieto, N. Brisaboa, R. Cánovas, F. Claude, and G. Navarro. Practical compressed string dictionaries. Information Systems, 56:73–108, 2016.
2. S. Kanda, K. Morita, and M. Fuketa. Compressed double-array tries for string dictionaries supporting fast lookup. Knowledge and Information Systems, 51(3):1023–1042, 2017.
3. J. Aoe. An efficient digital search algorithm by using a double-array structure. IEEE Transactions on Software Engineering, 15(9):1066–1077, 1989.
4. S. Yata, M. Oono, K. Morita, M. Fuketa, T. Sumitomo, and J. Aoe. A compact static double-array keeping character codes. Information Processing & Management, 43(1):237–247, 2007.
5. Müller, Ingo, Cornelius Ratsch, and Franz Faerber. Adaptive string dictionary compression in in-memory column-store database systems. In EDBT, pp. 283–294, 2014.
6. Gog, Simon, Giulio Ermanno Pibiri, and Rossano Venturini. Efficient and effective query auto-completion. In SIGIR, pp. 2271–2280, 2020.
7. Ricardo Baeza-Yates, and Berthier Ribeiro-Neto. Modern Information Retrieval. 2nd ed. Addison Wesley, Boston, MA, USA, 2011.
8. Kudo, Taku, et al. Efficient dictionary and language model compression for input method editors. In WTIM, pp. 19–25, 2011.
9. N. R. Brisaboa, S. Ladra, and G. Navarro. DACs: Bringing direct access to variable-length codes. Information Processing & Management, 49(1):392–404, 2013.

177
include/mm_file/mm_file.hpp Normal file
View file

@ -0,0 +1,177 @@
#pragma once
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd)

#include <cstddef>
#include <stdexcept>
#include <string>
#include <type_traits>
namespace mm {
// Named aliases for the POSIX_MADV_* hint values; file_source forwards the
// chosen one to posix_madvise after mapping the file.
namespace advice {

static constexpr int normal = POSIX_MADV_NORMAL;
static constexpr int random = POSIX_MADV_RANDOM;
static constexpr int sequential = POSIX_MADV_SEQUENTIAL;

}  // namespace advice
/// Shared state and accessors for a memory-mapped file viewed as an array of T.
///
/// The object holds a file descriptor (m_fd), the mapped length in bytes
/// (m_size) and the mapped address (m_data). Derived classes fill these in
/// from their open() functions; this class only releases them.
template <typename T>
struct file {
    /// Construct in the closed state (no descriptor, no mapping).
    file() {
        init();
    }

    /// Release the mapping and the descriptor, if any.
    ~file() {
        // Destructors are implicitly noexcept, so an exception leaving close()
        // here would call std::terminate. Failures are therefore swallowed;
        // call close() explicitly to observe them.
        try {
            close();
        } catch (...) {
        }
    }

    file(file const&) = delete;             // non construction-copyable
    file& operator=(file const&) = delete;  // non copyable

    /// True while a descriptor is held.
    bool is_open() const {
        return m_fd != -1;
    }

    /// Unmap the data, close the descriptor and return to the closed state.
    /// Does nothing when no descriptor is held.
    ///
    /// @throws std::runtime_error if munmap fails. The descriptor is closed and
    ///         the state is reset before the exception is thrown.
    void close() {
        if (!is_open()) {
            return;
        }

        // A descriptor may be held without a mapping (e.g. a derived open()
        // threw before mapping); munmap on a null address would fail, so it is
        // only attempted when there is something to unmap.
        bool unmap_failed = false;
        if (m_data != nullptr) {
            void* const addr = const_cast<void*>(static_cast<void const*>(m_data));
            unmap_failed = (munmap(addr, m_size) == -1);
        }

        // Always give the descriptor back, even when unmapping failed.
        ::close(m_fd);
        init();

        if (unmap_failed) {
            throw std::runtime_error("munmap failed when closing file");
        }
    }

    /// Mapped length in bytes.
    size_t bytes() const {
        return m_size;
    }

    /// Number of whole T elements that fit in the mapped length.
    size_t size() const {
        return m_size / sizeof(T);
    }

    /// Address of the first mapped element (nullptr when closed).
    T* data() const {
        return m_data;
    }

    /// Minimal forward iterator over the mapped elements.
    struct iterator {
        iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

        /// Return a copy of the current element.
        T operator*() {
            return *m_ptr;
        }

        void operator++() {
            ++m_ptr;
        }

        bool operator==(iterator const& rhs) const {
            return m_ptr == rhs.m_ptr;
        }

        bool operator!=(iterator const& rhs) const {
            return !((*this) == rhs);
        }

      private:
        T* m_ptr;
    };

    iterator begin() const {
        return iterator(m_data);
    }

    iterator end() const {
        return iterator(m_data, size());
    }

  protected:
    int m_fd;
    size_t m_size;
    T* m_data;

    /// Reset every member to the closed state.
    void init() {
        m_fd = -1;
        m_size = 0;
        m_data = nullptr;
    }

    /// @throws std::runtime_error if no descriptor is held.
    void check_fd() {
        if (m_fd == -1) throw std::runtime_error("cannot open file");
    }
};
/// Create a MAP_SHARED mapping of `size` bytes of descriptor `fd`, starting at
/// file offset 0, with protection `prot`, and return its address as `Pointer`.
///
/// @throws std::runtime_error if the underlying ::mmap call reports MAP_FAILED.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
    static const size_t offset = 0;

    void* const region = ::mmap(nullptr, size, prot, MAP_SHARED, fd, offset);
    if (region == MAP_FAILED) {
        throw std::runtime_error("mmap failed");
    }
    return static_cast<Pointer>(region);
}
/// Read-only memory-mapped view of an existing file as an array of `T const`.
template <typename T>
struct file_source : public file<T const> {
    typedef file<T const> base;

    /// Construct in the closed state.
    file_source() {}

    /// Construct and immediately open `path` (see open()).
    file_source(std::string const& path, int adv = advice::normal) {
        open(path, adv);
    }

    /// Map the whole file at `path` read-only and pass `adv` to posix_madvise.
    /// Any file previously opened through this object is closed first.
    ///
    /// @throws std::runtime_error if the file cannot be opened, stat'ed or
    ///         mapped, or if posix_madvise fails.
    void open(std::string const& path, int adv = advice::normal) {
        // Re-opening must not leak the previous descriptor/mapping.
        base::close();

        // Work on locals and publish them to the members only once the mapping
        // exists, so that a failure never leaves a descriptor without a mapping
        // for the base class to clean up.
        const int fd = ::open(path.c_str(), O_RDONLY);
        if (fd == -1) {
            throw std::runtime_error("cannot open file");
        }

        struct stat fs;
        if (fstat(fd, &fs) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot stat file");
        }
        const size_t num_bytes = static_cast<size_t>(fs.st_size);

        T const* mapped = nullptr;
        try {
            mapped = mmap<T const*>(fd, num_bytes, PROT_READ);
        } catch (...) {
            ::close(fd);
            throw;
        }

        base::m_fd = fd;
        base::m_size = num_bytes;
        base::m_data = mapped;

        // From here on the base class owns a consistent descriptor/mapping
        // pair, so it can release both if the advice call fails.
        if (posix_madvise((void*)base::m_data, base::m_size, adv)) {
            throw std::runtime_error("madvise failed");
        }
    }
};
/// Read/write memory-mapped view of a file as an array of `T`.
template <typename T>
struct file_sink : public file<T> {
    typedef file<T> base;

    /// Construct in the closed state.
    file_sink() {}

    /// Construct and map the existing file at `path` (see open(path)).
    file_sink(std::string const& path) {
        open(path);
    }

    /// Construct and map a file of `n` elements at `path` (see open(path, n)).
    file_sink(std::string const& path, size_t n) {
        open(path, n);
    }

    /// Map the whole existing file at `path` for reading and writing.
    /// Any file previously opened through this object is closed first.
    ///
    /// @throws std::runtime_error if the file cannot be opened, stat'ed or mapped.
    void open(std::string const& path) {
        static const mode_t mode = 0600;  // read/write

        base::close();

        const int fd = ::open(path.c_str(), O_RDWR, mode);
        if (fd == -1) {
            throw std::runtime_error("cannot open file");
        }

        struct stat fs;
        if (fstat(fd, &fs) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot stat file");
        }

        map_and_adopt(fd, static_cast<size_t>(fs.st_size));
    }

    /// Create (or truncate) the file at `path`, size it to hold `n` elements of
    /// T, and map it for reading and writing.
    /// Any file previously opened through this object is closed first.
    ///
    /// @throws std::runtime_error if the file cannot be opened, resized or mapped.
    void open(std::string const& path, size_t n) {
        static const mode_t mode = 0600;  // read/write

        base::close();

        const int fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
        if (fd == -1) {
            throw std::runtime_error("cannot open file");
        }

        // Grow the (just truncated) file to the requested size; mapping a file
        // that could not be resized would expose pages beyond its end.
        const size_t num_bytes = n * sizeof(T);
        if (ftruncate(fd, static_cast<off_t>(num_bytes)) == -1) {
            ::close(fd);
            throw std::runtime_error("cannot truncate file");
        }

        map_and_adopt(fd, num_bytes);
    }

  private:
    /// Map `num_bytes` of `fd` read/write and store descriptor, size and
    /// address in the base class. On failure `fd` is closed and the base class
    /// members are left untouched.
    void map_and_adopt(int fd, size_t num_bytes) {
        T* mapped = nullptr;
        try {
            mapped = mmap<T*>(fd, num_bytes, PROT_READ | PROT_WRITE);
        } catch (...) {
            ::close(fd);
            throw;
        }

        base::m_fd = fd;
        base::m_size = num_bytes;
        base::m_data = mapped;
    }
};
} // namespace mm

85
include/xcdat.hpp Normal file
View file

@ -0,0 +1,85 @@
#pragma once
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
#include "xcdat/load_visitor.hpp"
#include "xcdat/mmap_visitor.hpp"
#include "xcdat/save_visitor.hpp"
#include "xcdat/size_visitor.hpp"
#include "xcdat/trie.hpp"
namespace xcdat {
//! The trie dictionary type whose Base/Check vector is bc_vector_8.
using trie_8_type = trie<bc_vector_8>;
//! The trie dictionary type whose Base/Check vector is bc_vector_7.
using trie_7_type = trie<bc_vector_7>;
//! Set the continuous memory block to a new trie instance.
template <class Trie>
[[maybe_unused]] Trie mmap(const char* address) {
mmap_visitor visitor(address);
std::uint32_t flag;
visitor.visit(flag);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
Trie idx;
visitor.visit(idx);
return idx;
}
//! Load the trie dictionary from the file.
template <class Trie>
[[maybe_unused]] Trie load(std::string_view filepath) {
load_visitor visitor(filepath);
std::uint32_t flag;
visitor.visit(flag);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
Trie idx;
visitor.visit(idx);
return idx;
}
//! Save the trie dictionary to the file and returns the file size in bytes.
template <class Trie>
[[maybe_unused]] std::uint64_t save(const Trie& idx, std::string_view filepath) {
save_visitor visitor(filepath);
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes();
}
//! Get the dictionary size in bytes.
template <class Trie>
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
size_visitor visitor;
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes();
}
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
//!
//! The flag is the first sizeof(std::uint32_t) bytes of the file. An error is raised via
//! XCDAT_THROW_IF if the file cannot be opened or is too short to contain the flag.
//!
//! Declared 'inline' because this non-template function is defined in a header; without it,
//! including this header from several translation units yields multiple definitions.
[[maybe_unused]] inline std::uint32_t get_flag(std::string_view filepath) {
    // Materialize an owning, null-terminated path: std::string_view is not guaranteed to be
    // null-terminated and std::ifstream offers no portable std::string_view constructor.
    const std::string path(filepath);

    std::ifstream ifs(path, std::ios::binary);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::uint32_t flag = 0;
    ifs.read(reinterpret_cast<char*>(&flag), sizeof(flag));
    // A short read would otherwise hand back a value that was never read from the file.
    XCDAT_THROW_IF(!ifs.good(), "Cannot read the dictionary type flag");

    return flag;
}
//! Load the keywords from the file.
//!
//! The file at 'filepath' is split at every occurrence of 'delim' with std::getline, and the
//! pieces are returned in file order. An error is raised via XCDAT_THROW_IF if the file cannot
//! be opened.
//!
//! Declared 'inline' because this non-template function is defined in a header; without it,
//! including this header from several translation units yields multiple definitions.
[[maybe_unused]] inline std::vector<std::string> load_strings(std::string_view filepath, char delim = '\n') {
    // Materialize an owning, null-terminated path: std::string_view is not guaranteed to be
    // null-terminated and std::ifstream offers no portable std::string_view constructor.
    const std::string path(filepath);

    std::ifstream ifs(path);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::vector<std::string> strs;
    for (std::string str; std::getline(ifs, str, delim);) {
        strs.push_back(str);
    }
    return strs;
}
} // namespace xcdat

View file

@ -0,0 +1,194 @@
#pragma once
#include <array>
#include "bit_vector.hpp"
#include "compact_vector.hpp"
namespace xcdat {
// Compressed storage for the base/check values of `bc_units`.
//
// Each unit i contributes two consecutive first-level entries: entry 2*i for its base and entry
// 2*i+1 for its check. Non-leaf values are stored XORed with i. A value is kept in the narrowest
// of four arrays (8, 16, 32 and 64 bits wide); in the first three, the lowest bit of an entry
// tells whether the remaining bits are the value itself (0) or an offset into the next array (1).
// For leaf units the base is split instead: its low 8 bits go to the first level and the rest to
// m_links.
class bc_vector_7 {
public:
// Number of value bits that fit in a first-level entry next to the flag bit.
static constexpr std::uint32_t l1_bits = 7;
// Number of storage levels (m_ints_l1 .. m_ints_l4).
static constexpr std::uint32_t max_levels = 4;
// Entries per rank block of level 1/2/3; also the exclusive upper bound of the values that are
// stored directly at that level.
static constexpr std::uint64_t block_size_l1 = 1ULL << 7;
static constexpr std::uint64_t block_size_l2 = 1ULL << 15;
static constexpr std::uint64_t block_size_l3 = 1ULL << 31;
private:
// Number of units whose check equals their own index.
std::uint64_t m_num_frees = 0;
immutable_vector<std::uint8_t> m_ints_l1;
immutable_vector<std::uint16_t> m_ints_l2;
immutable_vector<std::uint32_t> m_ints_l3;
immutable_vector<std::uint64_t> m_ints_l4;
// m_ranks[k][b]: size of the level k+2 array at the moment block b of level k+1 was started.
std::array<immutable_vector<std::uint64_t>, max_levels - 1> m_ranks;
// Upper part (base >> 8) of the base of every leaf unit, in unit order.
compact_vector m_links;
// Copy of the `leaves` flags given to the constructor, built with rank support.
bit_vector m_leaves;
public:
bc_vector_7() = default;
virtual ~bc_vector_7() = default;
bc_vector_7(const bc_vector_7&) = delete;
bc_vector_7& operator=(const bc_vector_7&) = delete;
bc_vector_7(bc_vector_7&&) noexcept = default;
bc_vector_7& operator=(bc_vector_7&&) noexcept = default;
// Builds the structure from `bc_units` (indexable, with size() and elements exposing `base`
// and `check`) and `leaves`, where leaves[i] tells whether unit i is a leaf.
template <class BcUnits>
explicit bc_vector_7(const BcUnits& bc_units, bit_vector::builder&& leaves) {
std::vector<std::uint8_t> ints_l1;
std::vector<std::uint16_t> ints_l2;
std::vector<std::uint32_t> ints_l3;
std::vector<std::uint64_t> ints_l4;
std::array<std::vector<std::uint64_t>, max_levels - 1> ranks;
std::vector<std::uint64_t> links;
ints_l1.reserve(bc_units.size() * 2);
ranks[0].reserve((bc_units.size() * 2) >> l1_bits);
links.reserve(bc_units.size());
// Appends x at the first level, spilling to deeper levels while x does not fit.
auto append_unit = [&](std::uint64_t x) {
// A new level-1 block starts: remember how many level-2 entries exist so far.
if ((ints_l1.size() % block_size_l1) == 0) {
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
}
if ((x / block_size_l1) == 0) {
// x fits in 7 bits: store it directly with a cleared flag bit.
ints_l1.push_back(static_cast<std::uint8_t>(0 | (x << 1)));
return;
} else {
// Otherwise store the position of the level-2 entry relative to the block start.
const auto i = ints_l2.size() - ranks[0].back();
ints_l1.push_back(static_cast<std::uint8_t>(1 | (i << 1)));
}
// Same scheme one level deeper (16-bit entries).
if ((ints_l2.size() % block_size_l2) == 0) {
ranks[1].push_back(static_cast<std::uint64_t>(ints_l3.size()));
}
if ((x / block_size_l2) == 0) {
ints_l2.push_back(static_cast<std::uint16_t>(0 | (x << 1)));
return;
} else {
const auto i = ints_l3.size() - ranks[1].back();
ints_l2.push_back(static_cast<std::uint16_t>(1 | (i << 1)));
}
// Same scheme one level deeper (32-bit entries).
if ((ints_l3.size() % block_size_l3) == 0) {
ranks[2].push_back(static_cast<std::uint64_t>(ints_l4.size()));
}
if ((x / block_size_l3) == 0) {
ints_l3.push_back(static_cast<std::uint32_t>(0 | (x << 1)));
return;
} else {
const auto i = ints_l4.size() - ranks[2].back();
ints_l3.push_back(static_cast<std::uint32_t>(1 | (i << 1)));
}
// The last level keeps the value unmodified.
ints_l4.push_back(x);
};
// Appends the base of a leaf: low 8 bits at the first level (no flag bit), the rest in links.
auto append_leaf = [&](std::uint64_t x) {
if ((ints_l1.size() % block_size_l1) == 0) {
ranks[0].push_back(static_cast<std::uint64_t>(ints_l2.size()));
}
ints_l1.push_back(static_cast<std::uint8_t>(x & 0xFF));
links.push_back(x >> 8);
};
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
if (leaves[i]) {
append_leaf(bc_units[i].base);
} else {
append_unit(bc_units[i].base ^ i);
}
append_unit(bc_units[i].check ^ i);
// A unit whose check is its own index is counted as free (see is_used()).
if (bc_units[i].check == i) {
m_num_frees += 1;
}
}
// release
m_ints_l1.build(ints_l1);
m_ints_l2.build(ints_l2);
m_ints_l3.build(ints_l3);
m_ints_l4.build(ints_l4);
for (std::uint32_t j = 0; j < m_ranks.size(); ++j) {
m_ranks[j].build(ranks[j]);
}
m_links = compact_vector(links);
m_leaves = bit_vector(leaves, true, false);
}
// Base of unit i (undoes the XOR with i applied when building). Only meaningful for non-leaf
// units; leaf units are read with link().
inline std::uint64_t base(std::uint64_t i) const {
return access(i * 2) ^ i;
}
// Check of unit i (undoes the XOR with i applied when building).
inline std::uint64_t check(std::uint64_t i) const {
return access(i * 2 + 1) ^ i;
}
// Base of leaf unit i, reassembled from its low byte and the matching m_links entry.
inline std::uint64_t link(std::uint64_t i) const {
return m_ints_l1[i * 2] | (m_links[m_leaves.rank(i)] << 8);
}
// Whether unit i was flagged as a leaf at construction.
inline bool is_leaf(std::uint64_t i) const {
return m_leaves[i];
}
// Whether the check of unit i differs from i.
inline bool is_used(std::uint64_t i) const {
return check(i) != i;
}
// Number of units (two first-level entries are stored per unit).
inline std::uint64_t num_units() const {
return m_ints_l1.size() / 2;
}
inline std::uint64_t num_free_units() const {
return m_num_frees;
}
inline std::uint64_t num_nodes() const {
return num_units() - num_free_units();
}
inline std::uint64_t num_leaves() const {
return m_leaves.num_ones();
}
// Passes every member to `visitor`, in a fixed order.
template <class Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_num_frees);
visitor.visit(m_ints_l1);
visitor.visit(m_ints_l2);
visitor.visit(m_ints_l3);
visitor.visit(m_ints_l4);
for (std::uint32_t j = 0; j < m_ranks.size(); j++) {
visitor.visit(m_ranks[j]);
}
visitor.visit(m_links);
visitor.visit(m_leaves);
}
private:
// Decodes the value stored for first-level entry i by following offsets through the levels
// until an entry with a cleared flag bit (or the last level) is reached.
inline std::uint64_t access(std::uint64_t i) const {
std::uint64_t x = m_ints_l1[i] >> 1;
if ((m_ints_l1[i] & 1U) == 0) {
return x;
}
// x is an offset relative to the level-2 position recorded for this level-1 block.
i = m_ranks[0][i / block_size_l1] + x;
x = m_ints_l2[i] >> 1;
if ((m_ints_l2[i] & 1U) == 0) {
return x;
}
i = m_ranks[1][i / block_size_l2] + x;
x = m_ints_l3[i] >> 1;
if ((m_ints_l3[i] & 1U) == 0) {
return x;
}
i = m_ranks[2][i / block_size_l3] + x;
return m_ints_l4[i];
}
};
} // namespace xcdat

View file

@ -0,0 +1,150 @@
#pragma once
#include <array>
#include "bit_vector.hpp"
#include "compact_vector.hpp"
namespace xcdat {
class bc_vector_8 {
public:
static constexpr std::uint32_t l1_bits = 8;
static constexpr std::uint32_t max_levels = sizeof(std::uint64_t);
private:
std::uint32_t m_num_levels = 0;
std::uint64_t m_num_frees = 0;
std::array<immutable_vector<std::uint8_t>, max_levels> m_bytes;
std::array<bit_vector, max_levels - 1> m_nexts;
compact_vector m_links;
bit_vector m_leaves;
public:
bc_vector_8() = default;
virtual ~bc_vector_8() = default;
bc_vector_8(const bc_vector_8&) = delete;
bc_vector_8& operator=(const bc_vector_8&) = delete;
bc_vector_8(bc_vector_8&&) noexcept = default;
bc_vector_8& operator=(bc_vector_8&&) noexcept = default;
template <class BcUnits>
explicit bc_vector_8(const BcUnits& bc_units, bit_vector::builder&& leaves) {
std::array<std::vector<std::uint8_t>, max_levels> bytes;
std::array<bit_vector::builder, max_levels> next_flags; // The last will not be released
std::vector<std::uint64_t> links;
bytes[0].reserve(bc_units.size() * 2);
next_flags[0].reserve(bc_units.size() * 2);
links.reserve(bc_units.size());
m_num_levels = 0;
auto append_unit = [&](std::uint64_t x) {
std::uint32_t j = 0;
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
next_flags[j].push_back(true);
x >>= 8;
while (x) {
++j;
bytes[j].push_back(static_cast<std::uint8_t>(x & 0xFF));
next_flags[j].push_back(true);
x >>= 8;
}
next_flags[j].set_bit(next_flags[j].size() - 1, false);
m_num_levels = std::max(m_num_levels, j);
};
auto append_leaf = [&](std::uint64_t x) {
bytes[0].push_back(static_cast<std::uint8_t>(x & 0xFF));
next_flags[0].push_back(false);
links.push_back(x >> 8);
};
for (std::uint64_t i = 0; i < bc_units.size(); ++i) {
if (leaves[i]) {
append_leaf(bc_units[i].base);
} else {
append_unit(bc_units[i].base ^ i);
}
append_unit(bc_units[i].check ^ i);
if (bc_units[i].check == i) {
m_num_frees += 1;
}
}
// release
for (std::uint32_t i = 0; i < m_num_levels; ++i) {
m_bytes[i].build(bytes[i]);
m_nexts[i] = bit_vector(next_flags[i], true, false);
}
m_bytes[m_num_levels].build(bytes[m_num_levels]);
m_links = compact_vector(links);
m_leaves = bit_vector(leaves, true, false);
}
inline std::uint64_t base(std::uint64_t i) const {
return access(i * 2) ^ i;
}
inline std::uint64_t check(std::uint64_t i) const {
return access(i * 2 + 1) ^ i;
}
inline std::uint64_t link(std::uint64_t i) const {
return m_bytes[0][i * 2] | (m_links[m_leaves.rank(i)] << 8);
}
inline bool is_leaf(std::uint64_t i) const {
return m_leaves[i];
}
inline bool is_used(std::uint64_t i) const {
return check(i) != i;
}
inline std::uint64_t num_units() const {
return m_bytes[0].size() / 2;
}
inline std::uint64_t num_free_units() const {
return m_num_frees;
}
inline std::uint64_t num_nodes() const {
return num_units() - num_free_units();
}
inline std::uint64_t num_leaves() const {
return m_leaves.num_ones();
}
template <class Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_num_levels);
visitor.visit(m_num_frees);
for (std::uint32_t j = 0; j < m_bytes.size(); j++) {
visitor.visit(m_bytes[j]);
}
for (std::uint32_t j = 0; j < m_nexts.size(); j++) {
visitor.visit(m_nexts[j]);
}
visitor.visit(m_links);
visitor.visit(m_leaves);
}
private:
inline std::uint64_t access(std::uint64_t i) const {
std::uint32_t j = 0;
std::uint64_t x = m_bytes[j][i];
while (j < m_num_levels and m_nexts[j][i]) {
i = m_nexts[j++].rank(i);
x |= static_cast<std::uint64_t>(m_bytes[j][i]) << (j * 8);
}
return x;
}
};
} // namespace xcdat

148
include/xcdat/bit_tools.hpp Normal file
View file

@ -0,0 +1,148 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#ifdef __SSE4_2__
#include <nmmintrin.h>
#endif
#ifdef __BMI2__
#include <immintrin.h>
#endif
// The implementations are from https://github.com/ot/succinct.
namespace xcdat::bit_tools {
// Word with the lowest bit of every 4-bit group set.
static constexpr std::uint64_t ones_step_4 = 0x1111111111111111ULL;
// Word with the lowest bit of every byte set.
static constexpr std::uint64_t ones_step_8 = 0x0101010101010101ULL;
// Word with the lowest bit of each of the seven 9-bit groups (bits 0, 9, ..., 54) set.
static constexpr std::uint64_t ones_step_9 = 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | //
1ULL << 36 | 1ULL << 45 | 1ULL << 54;
// Word with the highest bit of every byte set.
static constexpr std::uint64_t msbs_step_8 = 0x80ULL * ones_step_8;
// Word with the highest bit of each of the seven 9-bit groups set.
static constexpr std::uint64_t msbs_step_9 = 0x100ULL * ones_step_9;
// Returns the number of bits set in x.
inline std::uint64_t popcount(std::uint64_t x) {
#ifdef __SSE4_2__
    const auto num_set = __builtin_popcountll(x);
    return static_cast<std::uint64_t>(num_set);
#else
    constexpr std::uint64_t every_other_bit = 0x5555555555555555ULL;
    constexpr std::uint64_t low_pairs = 0x3333333333333333ULL;
    constexpr std::uint64_t low_nibbles = 0x0F0F0F0F0F0F0F0FULL;
    constexpr std::uint64_t byte_ones = 0x0101010101010101ULL;

    // Fold the counts of neighbouring fields: 1-bit -> 2-bit -> 4-bit -> 8-bit fields.
    x -= (x >> 1) & every_other_bit;
    x = (x & low_pairs) + ((x >> 2) & low_pairs);
    x = (x + (x >> 4)) & low_nibbles;
    // The multiplication accumulates all byte counts into the top byte.
    return (byte_ones * x) >> 56;
#endif
}
// Lookup table indexed by the top six bits of (x * debruijn64), for x with a single bit set.
static constexpr std::uint8_t debruijn64_mapping[64] = {
    63, 0,  58, 1,  59, 47, 53, 2,   //
    60, 39, 48, 27, 54, 33, 42, 3,   //
    61, 51, 37, 40, 49, 18, 28, 20,  //
    55, 30, 34, 11, 43, 14, 22, 4,   //
    62, 57, 46, 52, 38, 26, 32, 41,  //
    50, 36, 17, 19, 29, 10, 13, 21,  //
    56, 45, 25, 31, 35, 16, 9,  12,  //
    44, 24, 15, 8,  23, 7,  6,  5,
};
static constexpr std::uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL;

// Returns the position of the single bit set in the word x.
inline std::uint8_t bit_position(std::uint64_t x) {
    const std::uint64_t slot = (x * debruijn64) >> 58;
    return debruijn64_mapping[slot];
}
// Returns the position of the most significant set bit of x; 0 is returned for x == 0.
inline std::uint64_t msb(std::uint64_t x) {
#ifdef __SSE4_2__
    if (x == 0) {
        return 0;
    }
    return 63 - __builtin_clzll(x);
#else
    if (x == 0) {
        return 0;
    }
    // Right-saturate the word: copy the top set bit into every lower position.
    for (std::uint32_t shift = 1; shift <= 32; shift *= 2) {
        x |= x >> shift;
    }
    // Keep only the top set bit and look its position up.
    const std::uint64_t top_only = x ^ (x >> 1);
    return bit_position(top_only);
#endif
}
// Broadword comparison over 9-bit fields (helper taken from succinct; used by
// bit_vector::select). The result keeps only the top bit of each 9-bit field, moved down by 8.
inline std::uint64_t uleq_step_9(std::uint64_t x, std::uint64_t y) {
    const std::uint64_t difference = (y | msbs_step_9) - (x & ~msbs_step_9);
    const std::uint64_t combined = (difference | (x ^ y)) ^ (x & ~y);
    return (combined & msbs_step_9) >> 8;
}
// Returns a word whose k-th byte holds the number of bits set in the k-th byte of x.
inline std::uint64_t byte_counts(std::uint64_t x) {
    const std::uint64_t odd_bits = 0xa * ones_step_4;
    const std::uint64_t low_pairs = 3 * ones_step_4;
    const std::uint64_t low_nibbles = 0x0f * ones_step_8;

    x -= (x & odd_bits) >> 1;
    x = (x & low_pairs) + ((x >> 2) & low_pairs);
    return (x + (x >> 4)) & low_nibbles;
}
static constexpr std::uint8_t select_in_byte[2048] = {
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1,
0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,
1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,
0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1,
0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0,
1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8,
4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1,
4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2,
1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2,
2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1,
4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3,
1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8,
8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8,
6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2,
6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4,
2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3,
3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6,
5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8,
5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6,
6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8,
8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5,
8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5,
5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8,
6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8,
8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8,
7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4,
8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6,
8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
inline std::uint64_t select_in_word(const std::uint64_t x, const std::uint64_t k) {
#ifdef __BMI2__
return _tzcnt_u64(_pdep_u64(1ULL << k, x));
#else
const std::uint64_t byte_sums = byte_counts(x) * ones_step_8;
const std::uint64_t k_step_8 = k * ones_step_8;
const std::uint64_t geq_k_step_8 = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8);
const std::uint64_t place = popcount(geq_k_step_8) * 8;
const std::uint64_t byte_rank = k - (((byte_sums << 8) >> place) & 0xFFULL);
return place + select_in_byte[((x >> place) & 0xFF) | (byte_rank << 8)];
#endif
}
} // namespace xcdat::bit_tools

View file

@ -0,0 +1,272 @@
#pragma once
#include <numeric>
#include <vector>
#include "bit_tools.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
// Vigna's Rank9 implementation from https://github.com/ot/succinct.
// Immutable bit sequence with optional rank and select indexes.
//
// The bits are packed into 64-bit words. Words are grouped into blocks of `block_size` words;
// m_rank_hints stores two words per block: the number of ones before the block, and the ranks of
// the block's words 1..7 relative to the block start, packed as seven 9-bit fields. One extra
// (sentinel) pair follows the last block; its first word is the total number of ones.
class bit_vector {
  public:
    // Mutable bit buffer from which a bit_vector is built.
    class builder {
      private:
        std::uint64_t m_size = 0;
        std::vector<std::uint64_t> m_bits;

      public:
        builder() = default;
        virtual ~builder() = default;

        builder(const builder&) = delete;
        builder& operator=(const builder&) = delete;

        builder(builder&&) noexcept = default;
        builder& operator=(builder&&) noexcept = default;

        // Creates a buffer of `size` bits.
        builder(std::uint64_t size) {
            resize(size);
        }

        // Appends one bit, adding a new zeroed word whenever the current one is full.
        inline void push_back(bool x) {
            if (m_size % 64 == 0) {
                m_bits.push_back(0);
            }
            if (x) {
                set_bit(m_size, true);
            }
            m_size += 1;
        }

        // Value of the i-th bit.
        inline bool operator[](std::uint64_t i) const {
            return m_bits[i / 64] & (1ULL << (i % 64));
        }

        // Sets (x == true) or clears (x == false) the i-th bit.
        inline void set_bit(std::uint64_t i, bool x = true) {
            if (x) {
                m_bits[i / 64] |= (1ULL << (i % 64));
            } else {
                m_bits[i / 64] &= (~(1ULL << (i % 64)));
            }
        }

        // Changes the number of bits; words added by the resize are zeroed.
        inline void resize(std::uint64_t size) {
            m_bits.resize(words_for(size), 0ULL);
            m_size = size;
        }

        // Reserves storage for `capacity` bits.
        inline void reserve(std::uint64_t capacity) {
            m_bits.reserve(words_for(capacity));
        }

        inline std::uint64_t size() const {
            return m_size;
        }

        friend class bit_vector;
    };

    static constexpr std::uint64_t block_size = 8;  // i.e., 64 * 8 bits
    // Number of ones between two consecutive select hints.
    static constexpr std::uint64_t selects_per_hint = 64 * block_size * 2;

  private:
    std::uint64_t m_size = 0;
    std::uint64_t m_num_ones = 0;
    immutable_vector<std::uint64_t> m_bits;
    immutable_vector<std::uint64_t> m_rank_hints;
    immutable_vector<std::uint64_t> m_select_hints;

  public:
    bit_vector() = default;
    virtual ~bit_vector() = default;

    bit_vector(const bit_vector&) = delete;
    bit_vector& operator=(const bit_vector&) = delete;

    bit_vector(bit_vector&&) noexcept = default;
    bit_vector& operator=(bit_vector&&) noexcept = default;

    // Copies the bits of `b`. The rank index is built when `enable_rank` is set; the select
    // index is built only when both `enable_rank` and `enable_select` are set, because it is
    // derived from the rank index.
    explicit bit_vector(builder& b, bool enable_rank = false, bool enable_select = false) {
        m_bits.build(b.m_bits);
        m_size = b.m_size;
        m_num_ones = std::accumulate(m_bits.begin(), m_bits.end(), 0ULL,
                                     [](std::uint64_t acc, std::uint64_t x) { return acc + bit_tools::popcount(x); });
        if (enable_rank) {
            build_rank_hints();
        }
        if (enable_rank and enable_select) {
            build_select_hints();
        }
    }

    inline std::uint64_t size() const {
        return m_size;
    }

    inline std::uint64_t num_ones() const {
        return m_num_ones;
    }

    // Value of the i-th bit.
    inline bool operator[](std::uint64_t i) const {
        return m_bits[i / 64] & (1ULL << (i % 64));
    }

    // The number of 1s in B[0..i). Requires the rank index.
    inline std::uint64_t rank(std::uint64_t i) const {
        assert(i <= size());
        assert(m_rank_hints.size() != 0);
        if (i == size()) {
            return num_ones();
        }
        const auto [wi, wj] = decompose<64>(i);
        // The shift drops the bits at positions >= wj; it is skipped for wj == 0 because
        // shifting a 64-bit word by 64 is undefined.
        return rank_for_word(wi) + (wj != 0 ? bit_tools::popcount(m_bits[wi] << (64 - wj)) : 0);
    }

    // The position of the n-th (0-based) 1. Requires the select index.
    inline std::uint64_t select(std::uint64_t n) const {
        assert(n < num_ones());
        assert(m_select_hints.size() != 0);
        const std::uint64_t bi = select_for_block(n);
        assert(bi < num_blocks());

        std::uint64_t curr_rank = rank_for_block(bi);
        assert(curr_rank <= n);

        // Compare the rank wanted inside the block with all seven packed word ranks at once
        // to find the word that contains the answer.
        std::uint64_t rank_in_block_parallel = (n - curr_rank) * bit_tools::ones_step_9;
        std::uint64_t sub_ranks = ranks_in_block(bi);
        std::uint64_t sub_block_offset =
            bit_tools::uleq_step_9(sub_ranks, rank_in_block_parallel) * bit_tools::ones_step_9 >> 54 & 0x7;
        curr_rank += sub_ranks >> (7 - sub_block_offset) * 9 & 0x1FF;
        assert(curr_rank <= n);

        std::uint64_t word_offset = (bi * block_size) + sub_block_offset;
        return word_offset * 64 + bit_tools::select_in_word(m_bits[word_offset], n - curr_rank);
    }

    // Passes every member to `visitor`, in a fixed order.
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_size);
        visitor.visit(m_num_ones);
        visitor.visit(m_bits);
        visitor.visit(m_rank_hints);
        visitor.visit(m_select_hints);
    }

  private:
    // Returns {x / N, x % N}.
    template <std::uint64_t N>
    static std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
        return {x / N, x % N};
    }

    // Number of 64-bit words needed for `nbits` bits.
    static std::uint64_t words_for(std::uint64_t nbits) {
        return (nbits + 63) / 64;
    }

    // Number of blocks, not counting the sentinel pair.
    inline std::uint64_t num_blocks() const {
        return m_rank_hints.size() / 2 - 1;
    }

    // Absolute rank until the bi-th block
    inline std::uint64_t rank_for_block(std::uint64_t bi) const {
        return m_rank_hints[bi * 2];
    }

    // Packed ranks in the bi-th block
    inline std::uint64_t ranks_in_block(std::uint64_t bi) const {
        return m_rank_hints[bi * 2 + 1];
    }

    // Absolute rank until the wi-th word
    inline std::uint64_t rank_for_word(std::uint64_t wi) const {
        const auto [bi, bj] = decompose<block_size>(wi);
        return rank_for_block(bi) + rank_in_block(bi, bj);
    }

    // Relative rank in the bi-th block. For bj == 0 the shift leaves only the unused top bit of
    // the packed word, so the result is 0.
    inline std::uint64_t rank_in_block(std::uint64_t bi, std::uint64_t bj) const {
        return ranks_in_block(bi) >> ((7 - bj) * 9) & 0x1FF;
    }

    // Finds the block containing the n-th 1 by binary search inside the range given by the
    // select hints.
    inline std::uint64_t select_for_block(std::uint64_t n) const {
        auto [a, b] = select_with_hint(n);
        while (b - a > 1) {
            const std::uint64_t lb = a + (b - a) / 2;
            if (rank_for_block(lb) <= n) {
                a = lb;
            } else {
                b = lb;
            }
        }
        return a;
    }

    // Returns the half-open range of blocks [first, last) to search for the n-th 1.
    inline std::tuple<std::uint64_t, std::uint64_t> select_with_hint(std::uint64_t n) const {
        const std::uint64_t i = n / selects_per_hint;
        return {i != 0 ? m_select_hints[i - 1] : 0, m_select_hints[i] + 1};
    }

    void build_rank_hints() {
        std::uint64_t curr_num_ones = 0;
        std::uint64_t curr_num_ones_in_block = 0;
        std::uint64_t curr_ranks_in_block = 0;

        const std::uint64_t num_words = m_bits.size();
        std::vector<std::uint64_t> rank_hints = {curr_num_ones};

        for (std::uint64_t wi = 0; wi < num_words; wi++) {
            const std::uint64_t bi = wi % block_size;  // Relative position in the block
            const std::uint64_t num_ones_in_word = bit_tools::popcount(m_bits[wi]);

            // Record the number of ones that precede this word inside its block.
            if (bi != 0) {
                curr_ranks_in_block <<= 9;
                curr_ranks_in_block |= curr_num_ones_in_block;
            }

            curr_num_ones += num_ones_in_word;
            curr_num_ones_in_block += num_ones_in_word;

            // Block completed: emit its packed ranks and the absolute rank of the next block.
            if (bi == block_size - 1) {
                rank_hints.push_back(curr_ranks_in_block);
                rank_hints.push_back(curr_num_ones);
                curr_num_ones_in_block = 0;
                curr_ranks_in_block = 0;
            }
        }

        // Padding the remaining hints
        const std::uint64_t remain = block_size - (num_words % block_size);
        for (std::uint64_t wi = 0; wi < remain; wi++) {
            curr_ranks_in_block <<= 9;
            curr_ranks_in_block |= curr_num_ones_in_block;
        }
        rank_hints.push_back(curr_ranks_in_block);

        // Sentinel: the absolute rank after the last (partial) block is the total number of
        // ones. It is read through rank_for_block(num_blocks()) by build_select_hints() and
        // select_for_block(), so it must not be the packed relative ranks.
        if (num_words % block_size != 0) {
            rank_hints.push_back(curr_num_ones);
            rank_hints.push_back(0);
        }

        // Release
        m_rank_hints.build(rank_hints);
    }

    // Records, for every multiple of selects_per_hint, the first block after which more than
    // that many ones have been seen; the number of blocks is appended as the last hint.
    void build_select_hints() {
        std::vector<std::uint64_t> select_hints;
        std::uint64_t threshold = selects_per_hint;
        for (std::uint64_t bi = 0; bi < num_blocks(); ++bi) {
            if (rank_for_block(bi + 1) > threshold) {
                select_hints.push_back(bi);
                threshold += selects_per_hint;
            }
        }
        select_hints.push_back(num_blocks());
        m_select_hints.build(select_hints);
    }
};
} // namespace xcdat

View file

@ -0,0 +1,111 @@
#pragma once
#include <array>
#include <string_view>
#include "immutable_vector.hpp"
namespace xcdat {
// Bijection between the 256 byte values and 8-bit codes.
//
// m_table[0..255] maps a byte to its code and m_table[256..511] maps a code back to its byte.
// Codes are assigned in order of decreasing byte frequency over the keys given at construction.
// The bytes that occur at least once are kept, in increasing byte order, in m_alphabet.
class code_table {
  private:
    // Length of the longest key seen at construction.
    std::uint64_t m_max_length = 0;
    // Zero-initialized so that a default-constructed table never holds indeterminate bytes.
    std::array<std::uint8_t, 512> m_table = {};
    immutable_vector<std::uint8_t> m_alphabet;

    struct counter_type {
        std::uint8_t ch;
        std::uint64_t freq;
    };

  public:
    code_table() = default;
    virtual ~code_table() = default;

    code_table(const code_table&) = delete;
    code_table& operator=(const code_table&) = delete;

    code_table(code_table&&) noexcept = default;
    code_table& operator=(code_table&&) noexcept = default;

    // Builds the table from `keys`, a range of strings (each iterable as bytes, with length()).
    template <class Strings>
    code_table(const Strings& keys) {
        std::array<counter_type, 256> counter;
        for (std::uint32_t ch = 0; ch < 256; ++ch) {
            counter[ch] = {static_cast<std::uint8_t>(ch), 0};
        }

        m_max_length = 0;
        for (const auto& key : keys) {
            for (std::uint8_t ch : key) {
                counter[ch].freq += 1;
            }
            m_max_length = std::max<std::uint64_t>(m_max_length, key.length());
        }

        {
            // Collected before sorting, so the alphabet is in increasing byte order.
            std::vector<std::uint8_t> alphabet;
            for (const auto& cf : counter) {
                if (cf.freq != 0) {
                    alphabet.push_back(cf.ch);
                }
            }
            m_alphabet.build(alphabet);
        }

        // The most frequent byte gets code 0, the next one code 1, and so on.
        std::sort(counter.begin(), counter.end(),
                  [](const counter_type& a, const counter_type& b) { return a.freq > b.freq; });

        for (std::uint32_t ch = 0; ch < 256; ++ch) {
            m_table[counter[ch].ch] = static_cast<std::uint8_t>(ch);
        }
        // Second half: inverse mapping from code to byte.
        for (std::uint32_t ch = 0; ch < 256; ++ch) {
            m_table[m_table[ch] + 256] = static_cast<std::uint8_t>(ch);
        }
    }

    // Number of distinct bytes that occur in the keys.
    inline std::uint64_t alphabet_size() const {
        return m_alphabet.size();
    }

    inline std::uint64_t max_length() const {
        return m_max_length;
    }

    // Code assigned to byte `ch`.
    inline std::uint8_t get_code(char ch) const {
        return m_table[static_cast<std::uint8_t>(ch)];
    }

    // Byte to which code `cd` is assigned.
    inline char get_char(std::uint8_t cd) const {
        return static_cast<char>(m_table[cd + 256]);
    }

    // Whether the NUL byte occurs in the keys. The alphabet is sorted, so NUL can only be its
    // first element; an empty alphabet is checked first because begin() must not be
    // dereferenced then.
    inline bool has_null() const {
        return m_alphabet.size() != 0 && *m_alphabet.begin() == '\0';
    }

    inline auto begin() const {
        return m_alphabet.begin();
    }
    inline auto end() const {
        return m_alphabet.end();
    }
    inline auto rbegin() const {
        return m_alphabet.rbegin();
    }
    inline auto rend() const {
        return m_alphabet.rend();
    }

    // Passes every member to `visitor`, in a fixed order.
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_max_length);
        visitor.visit(m_table);
        visitor.visit(m_alphabet);
    }
};
} // namespace xcdat

View file

@ -0,0 +1,89 @@
#pragma once
#include "bit_tools.hpp"
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
// Immutable array of unsigned integers in which every element occupies the same number of bits,
// namely the number of bits needed by the largest element.
class compact_vector {
  private:
    std::uint64_t m_size = 0;
    // Width of one element in bits.
    std::uint64_t m_bits = 0;
    // Mask with the low m_bits bits set.
    std::uint64_t m_mask = 0;
    immutable_vector<std::uint64_t> m_chunks;

  public:
    compact_vector() = default;
    virtual ~compact_vector() = default;

    compact_vector(const compact_vector&) = delete;
    compact_vector& operator=(const compact_vector&) = delete;

    compact_vector(compact_vector&&) noexcept = default;
    compact_vector& operator=(compact_vector&&) noexcept = default;

    // Packs the elements of `vec`. Throws xcdat::exception (via XCDAT_THROW_IF) if `vec` is empty.
    template <class Vec>
    compact_vector(const Vec& vec) {
        XCDAT_THROW_IF(vec.size() == 0, "The input vector is empty.");

        m_size = vec.size();
        m_bits = needed_bits(*std::max_element(vec.begin(), vec.end()));
        // Shifting a 64-bit value by 64 is undefined, so the full-width mask is spelled out.
        m_mask = (m_bits < 64) ? ((1ULL << m_bits) - 1) : ~0ULL;

        std::vector<std::uint64_t> chunks(words_for(m_size * m_bits));

        for (std::uint64_t i = 0; i < m_size; i++) {
            const auto [quo, mod] = decompose(i * m_bits);
            chunks[quo] &= ~(m_mask << mod);
            chunks[quo] |= (vec[i] & m_mask) << mod;
            // The element straddles two words: write its upper part to the next word.
            if (64 < mod + m_bits) {
                const std::uint64_t diff = 64ULL - mod;
                chunks[quo + 1] &= ~(m_mask >> diff);
                chunks[quo + 1] |= (vec[i] & m_mask) >> diff;
            }
        }
        m_chunks.build(chunks);
    }

    // Returns the i-th element.
    inline std::uint64_t operator[](std::uint64_t i) const {
        assert(i < m_size);
        const auto [quo, mod] = decompose(i * m_bits);
        if (mod + m_bits <= 64) {
            return (m_chunks[quo] >> mod) & m_mask;
        } else {
            // Here mod is at least 1, so the shift by (64 - mod) is well defined.
            return ((m_chunks[quo] >> mod) | (m_chunks[quo + 1] << (64 - mod))) & m_mask;
        }
    }

    inline std::uint64_t size() const {
        return m_size;
    }

    inline std::uint64_t bits() const {
        return m_bits;
    }

    // Passes every member to `visitor`, in a fixed order.
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_size);
        visitor.visit(m_bits);
        visitor.visit(m_mask);
        visitor.visit(m_chunks);
    }

  private:
    // Number of bits needed to represent x (1 for x == 0).
    static std::uint64_t needed_bits(std::uint64_t x) {
        return bit_tools::msb(x) + 1;
    }

    // Splits a bit position into {word index, bit offset inside the word}.
    static std::tuple<std::uint64_t, std::uint64_t> decompose(std::uint64_t x) {
        return {x / 64, x % 64};
    }

    // Number of 64-bit words needed for `nbits` bits.
    static std::uint64_t words_for(std::uint64_t nbits) {
        return (nbits + 63) / 64;
    }
};
} // namespace xcdat

View file

@ -0,0 +1,25 @@
#pragma once
#include <exception>
namespace xcdat {
// Exception type thrown by the library.
//
// Only the pointer to the message is stored, not a copy, so `msg` must stay valid for as long
// as what() may be called (the XCDAT_THROW macro passes a string literal).
class exception : public std::exception {
  public:
    explicit exception(const char* msg) noexcept : msg_{msg} {}
    // `noexcept` replaces the dynamic exception specification `throw()`, which is deprecated in
    // C++17 and no longer valid in C++20.
    ~exception() noexcept override = default;

    // Returns the message given at construction.
    const char* what() const noexcept override {
        return msg_;
    }

  private:
    const char* msg_;
};
// Two-step stringification, so that a macro argument such as __LINE__ is expanded before it is
// turned into a string literal.
#define XCDAT_TO_STR_(n) #n
#define XCDAT_TO_STR(n) XCDAT_TO_STR_(n)
// Throws xcdat::exception with the message "<file>:<line>:<msg>". `msg` must be a string
// literal because the message is built by literal concatenation.
#define XCDAT_THROW(msg) throw xcdat::exception(__FILE__ ":" XCDAT_TO_STR(__LINE__) ":" msg)
// Throws as XCDAT_THROW(msg) when `cond` is true. Written as a single expression, so it can be
// used wherever an expression statement is allowed.
#define XCDAT_THROW_IF(cond, msg) (void)((!(cond)) || (XCDAT_THROW(msg), 0))
} // namespace xcdat

View file

@ -0,0 +1,107 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iterator>
#include <memory>
#include <string>
#include <utility>
namespace xcdat {
// Read-only array of T that either owns its elements (after build() or load()) or refers to
// elements stored in an external buffer (after mmap()).
template <class T>
class immutable_vector {
  private:
    // Owned storage; empty when the vector is empty or refers to an external buffer.
    std::unique_ptr<T[]> m_allocator;
    std::uint64_t m_size = 0;
    // First element: m_allocator.get() when owning, otherwise a pointer into the mapped buffer.
    const T* m_data = nullptr;

  public:
    immutable_vector() = default;
    virtual ~immutable_vector() = default;

    immutable_vector(const immutable_vector&) = delete;
    immutable_vector& operator=(const immutable_vector&) = delete;

    // Moving transfers the elements and leaves `other` empty. The defaulted operations would
    // leave the size and data pointer of `other` untouched, i.e. referring to storage that it
    // no longer owns.
    immutable_vector(immutable_vector&& other) noexcept
        : m_allocator(std::move(other.m_allocator)),
          m_size(std::exchange(other.m_size, 0)),
          m_data(std::exchange(other.m_data, nullptr)) {}

    immutable_vector& operator=(immutable_vector&& other) noexcept {
        if (this != &other) {
            m_allocator = std::move(other.m_allocator);
            m_size = std::exchange(other.m_size, 0);
            m_data = std::exchange(other.m_data, nullptr);
        }
        return *this;
    }

    // Releases owned storage and makes the vector empty.
    void clear() {
        m_allocator.reset();
        m_size = 0;
        m_data = nullptr;
    }

    // Builds the vector as a copy of `vec` (see build()).
    template <class Vector>
    immutable_vector(const Vector& vec) {
        build(vec);
    }

    // Replaces the content with a copy of the elements of `vec`, which must provide size() and
    // data().
    template <class Vector>
    void build(const Vector& vec) {
        clear();
        if (vec.size() != 0) {
            m_allocator = std::make_unique<T[]>(vec.size());
            std::copy_n(vec.data(), vec.size(), m_allocator.get());
            m_size = vec.size();
            m_data = m_allocator.get();
        }
    }

    // Makes the vector refer to the serialized form found at `address`: a 64-bit element count
    // followed by the elements. Nothing is copied, the vector only keeps a pointer into the
    // buffer. Returns the number of bytes occupied by the serialized form.
    std::uint64_t mmap(const char* address) {
        clear();
        // The count is copied out with memcpy because `address` need not be aligned for a
        // 64-bit load.
        std::memcpy(&m_size, address, sizeof(m_size));
        // NOTE(review): the elements are still read in place, so this assumes that
        // address + 8 is suitably aligned for T - confirm against the file layout.
        m_data = reinterpret_cast<const T*>(address + sizeof(std::uint64_t));
        return sizeof(std::uint64_t) + m_size * sizeof(T);
    }

    // Reads the serialized form (64-bit element count, then the elements) from `ifs` into
    // owned storage.
    void load(std::ifstream& ifs) {
        clear();
        ifs.read(reinterpret_cast<char*>(&m_size), sizeof(m_size));
        if (m_size != 0) {
            m_allocator = std::make_unique<T[]>(m_size);
            ifs.read(reinterpret_cast<char*>(m_allocator.get()), sizeof(T) * m_size);
            m_data = m_allocator.get();
        }
    }

    // Writes the serialized form (64-bit element count, then the elements) to `ofs`.
    void save(std::ofstream& ofs) const {
        ofs.write(reinterpret_cast<const char*>(&m_size), sizeof(m_size));
        ofs.write(reinterpret_cast<const char*>(m_data), sizeof(T) * m_size);
    }

    // Size of the serialized form in bytes.
    inline std::uint64_t memory_in_bytes() const {
        return sizeof(m_size) + sizeof(T) * m_size;
    }

    inline std::uint64_t size() const {
        return m_size;
    }

    inline const T* begin() const {
        return m_data;
    }
    inline const T* end() const {
        return m_data + m_size;
    }

    inline auto rbegin() const {
        return std::make_reverse_iterator(end());
    }
    inline auto rend() const {
        return std::make_reverse_iterator(begin());
    }

    inline const T& operator[](std::uint64_t i) const {
        assert(i < m_size);
        return m_data[i];
    }

    inline const T* data() const {
        return m_data;
    }
};
} // namespace xcdat

View file

@ -0,0 +1,43 @@
#pragma once
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
// Visitor that fills the visited objects with data read sequentially from a binary file.
class load_visitor {
  private:
    std::ifstream m_ifs;

  public:
    // Opens `filepath` for binary reading. Throws xcdat::exception (via XCDAT_THROW_IF) when
    // the stream is not good after opening.
    load_visitor(std::string_view filepath)
        // std::ifstream cannot portably be constructed from a std::string_view, and a view is
        // not guaranteed to be null-terminated, so the path is copied into a std::string.
        : m_ifs(std::string(filepath), std::ios::binary) {
        XCDAT_THROW_IF(!m_ifs.good(), "Cannot open the input file");
    }

    virtual ~load_visitor() {
        m_ifs.close();
    }

    // Loads a vector from the stream.
    template <class T>
    void visit(immutable_vector<T>& vec) {
        vec.load(m_ifs);
    }

    // POD objects are read as raw bytes; any other object is asked to visit its own members.
    template <class T>
    void visit(T& obj) {
        if constexpr (std::is_pod_v<T>) {
            m_ifs.read(reinterpret_cast<char*>(&obj), sizeof(T));
        } else {
            obj.visit(*this);
        }
    }

    // Current read position of the stream.
    std::uint64_t bytes() {
        return m_ifs.tellg();
    }
};
} // namespace xcdat

View file

@ -0,0 +1,39 @@
#pragma once
#include <type_traits>
#include "immutable_vector.hpp"
namespace xcdat {
// Visitor that binds the visited objects to a serialized image held in memory, advancing a
// cursor through the image as members are visited.
class mmap_visitor {
  private:
    const char* m_base = nullptr;
    const char* m_cur = nullptr;

  public:
    // `base` is the start of the serialized image.
    mmap_visitor(const char* base) : m_base(base), m_cur(base) {}

    virtual ~mmap_visitor() = default;

    // Lets the vector refer to its data inside the image and skips over it.
    template <typename T>
    void visit(immutable_vector<T>& vec) {
        m_cur += vec.mmap(m_cur);
    }

    // POD objects are copied out of the image; any other object is asked to visit its own
    // members.
    template <typename T>
    void visit(T& obj) {
        if constexpr (std::is_pod_v<T>) {
            // The cursor can end up at any byte offset (for instance after a vector with an
            // odd number of bytes), so the object is copied with memcpy instead of being
            // read through a possibly misaligned T pointer.
            std::memcpy(&obj, m_cur, sizeof(T));
            m_cur += sizeof(T);
        } else {
            obj.visit(*this);
        }
    }

    // Number of bytes consumed so far.
    std::uint64_t bytes() const {
        return std::distance(m_base, m_cur);
    }
};
} // namespace xcdat

View file

@ -0,0 +1,43 @@
#pragma once
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
// Visitor that writes the visited objects sequentially to a binary file.
class save_visitor {
  private:
    std::ofstream m_ofs;

  public:
    // Opens `filepath` for binary writing. Throws xcdat::exception (via XCDAT_THROW_IF) when
    // the stream is not good after opening.
    save_visitor(std::string_view filepath)
        // std::ofstream cannot portably be constructed from a std::string_view, and a view is
        // not guaranteed to be null-terminated, so the path is copied into a std::string.
        : m_ofs(std::string(filepath), std::ios::binary) {
        // The message used to say "input file" although this stream is the output.
        XCDAT_THROW_IF(!m_ofs.good(), "Cannot open the output file");
    }

    virtual ~save_visitor() {
        m_ofs.close();
    }

    // Saves a vector to the stream.
    template <typename T>
    void visit(const immutable_vector<T>& vec) {
        vec.save(m_ofs);
    }

    // POD objects are written as raw bytes; any other object is asked to visit its own members.
    template <typename T>
    void visit(const T& obj) {
        if constexpr (std::is_pod_v<T>) {
            m_ofs.write(reinterpret_cast<const char*>(&obj), sizeof(T));
        } else {
            // visit() of the library classes is not const-qualified; no member is modified
            // by this visitor.
            const_cast<T&>(obj).visit(*this);
        }
    }

    // Current write position of the stream.
    std::uint64_t bytes() {
        return m_ofs.tellp();
    }
};
} // namespace xcdat

View file

@ -0,0 +1,39 @@
#pragma once
#include <string_view>
#include <type_traits>
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! A visitor that only adds up how many bytes the visited members occupy; nothing is read or written.
class size_visitor {
  private:
    std::uint64_t m_bytes = 0;  // Running total

  public:
    size_visitor() = default;
    virtual ~size_visitor() = default;

    //! Add the footprint the vector reports through memory_in_bytes().
    template <typename T>
    void visit(const immutable_vector<T>& vec) {
        const auto vec_bytes = vec.memory_in_bytes();
        m_bytes += vec_bytes;
    }

    //! Add sizeof(T) for a POD value; any other type is asked to visit its own members.
    template <typename T>
    void visit(const T& obj) {
        if constexpr (!std::is_pod_v<T>) {
            // The member-wise visit() of the visited types is declared non-const, hence the cast.
            T& mutable_obj = const_cast<T&>(obj);
            mutable_obj.visit(*this);
        } else {
            m_bytes += sizeof(T);
        }
    }

    //! Get the number of bytes accumulated so far.
    std::uint64_t bytes() {
        return m_bytes;
    }
};
} // namespace xcdat

View file

@ -0,0 +1,222 @@
#pragma once
#include <algorithm>
#include <functional>
#include <string>
#include <string_view>
#include <vector>
#include "bit_vector.hpp"
#include "exception.hpp"
#include "immutable_vector.hpp"
namespace xcdat {
//! A storage for the key suffixes (TAILs) that are cut off from the trie.
//! All suffixes are concatenated into one character array. Position 0 is a dummy entry standing for the
//! empty suffix. A suffix ends either at a NULL character (normal mode) or at a set bit of 'm_terms'
//! (binary mode).
class tail_vector {
  public:
    //! A suffix waiting to be stored, paired with the position of the node it belongs to.
    struct suffix_type {
        std::string_view str;
        std::uint64_t npos;

        //! Access the i-th character counted from the BACK of the suffix.
        inline char operator[](std::uint64_t i) const {
            return str[size() - i - 1];
        }
        inline std::uint64_t size() const {
            return str.size();
        }
        inline const char* begin() const {
            return str.data();
        }
        inline const char* end() const {
            return str.data() + str.size();
        }
        inline std::reverse_iterator<const char*> rbegin() const {
            return std::make_reverse_iterator(str.data() + str.size());
        }
        inline std::reverse_iterator<const char*> rend() const {
            return std::make_reverse_iterator(str.data());
        }
    };

    //! Collects suffixes and lays them out in the character array.
    class builder {
      private:
        // Buffer
        std::vector<suffix_type> m_suffixes;

        // Released
        std::vector<char> m_chars;
        bit_vector::builder m_terms;

      public:
        builder() = default;
        virtual ~builder() = default;

        builder(const builder&) = delete;
        builder& operator=(const builder&) = delete;

        builder(builder&&) noexcept = default;
        builder& operator=(builder&&) noexcept = default;

        //! Register a non-empty suffix for the node at 'npos'.
        //! Only the view is stored, so the characters must stay alive until complete() has run.
        void set_suffix(std::string_view str, std::uint64_t npos) {
            XCDAT_THROW_IF(str.size() == 0, "The given suffix is empty.");
            m_suffixes.push_back({str, npos});
        }

        //! Lay out all the registered suffixes and report their positions.
        //! setter(npos, tpos): Set units[npos].base = tpos.
        void complete(bool bin_mode, const std::function<void(std::uint64_t, std::uint64_t)>& setter) {
            // Order the suffixes by their reversed strings so that a suffix which is the ending of another one
            // sits right before it.
            std::sort(m_suffixes.begin(), m_suffixes.end(), [](const suffix_type& a, const suffix_type& b) {
                return std::lexicographical_compare(std::rbegin(a), std::rend(a), std::rbegin(b), std::rend(b));
            });

            // Dummy for an empty suffix
            m_chars.emplace_back('\0');
            if (bin_mode) {
                m_terms.push_back(false);
            }

            const suffix_type dmmy_suffix = {{nullptr, 0}, 0};
            const suffix_type* prev_suffix = &dmmy_suffix;

            std::uint64_t prev_tpos = 0;

            // Walk from the largest to the smallest suffix in the sorted order.
            for (std::uint64_t i = m_suffixes.size(); i > 0; --i) {
                const suffix_type& curr_suffix = m_suffixes[i - 1];
                XCDAT_THROW_IF(curr_suffix.size() == 0, "A suffix is empty.");

                // Length of the common ending of the previous and the current suffix
                // (operator[] of suffix_type indexes from the back).
                std::uint64_t match = 0;
                while ((match < curr_suffix.size()) && (match < prev_suffix->size()) &&
                       ((*prev_suffix)[match] == curr_suffix[match])) {
                    ++match;
                }

                if ((match == curr_suffix.size()) && (prev_suffix->size() != 0)) {  // sharable
                    // The current suffix is the ending of the previous one: point into its storage.
                    setter(curr_suffix.npos, prev_tpos + (prev_suffix->size() - match));
                    prev_tpos += prev_suffix->size() - match;
                } else {  // append
                    setter(curr_suffix.npos, m_chars.size());
                    prev_tpos = m_chars.size();
                    std::copy(curr_suffix.begin(), curr_suffix.end(), std::back_inserter(m_chars));
                    if (bin_mode) {
                        // Only the last character of the suffix gets a terminator bit.
                        for (std::uint64_t j = 1; j < curr_suffix.size(); ++j) {
                            m_terms.push_back(false);
                        }
                        m_terms.push_back(true);
                    } else {
                        m_chars.emplace_back('\0');
                    }
                }
                prev_suffix = &curr_suffix;
            }
        }

        friend class tail_vector;
    };

  private:
    immutable_vector<char> m_chars;
    bit_vector m_terms;

  public:
    tail_vector() = default;
    virtual ~tail_vector() = default;

    tail_vector(const tail_vector&) = delete;
    tail_vector& operator=(const tail_vector&) = delete;

    tail_vector(tail_vector&&) noexcept = default;
    tail_vector& operator=(tail_vector&&) noexcept = default;

    //! Build the vector from a completed builder.
    explicit tail_vector(builder&& b) : m_chars(b.m_chars), m_terms(b.m_terms) {}

    //! Check if terminator bits (binary mode) are used instead of NULL characters.
    inline bool bin_mode() const {
        return m_terms.size() != 0;
    }

    //! Check if 'key' is exactly the suffix stored at 'tpos'.
    inline bool match(std::string_view key, std::uint64_t tpos) const {
        // Position 0 is the dummy for the empty suffix, so only an empty key can match it. Without this
        // guard, in binary mode a key beginning with a NULL character would be compared across the dummy
        // into the first real suffix.
        if (key.size() == 0 || tpos == 0) {
            return key.size() == 0 && tpos == 0;
        }

        std::uint64_t kpos = 0;

        if (bin_mode()) {
            do {
                if (key[kpos] != m_chars[tpos]) {
                    return false;
                }
                kpos += 1;
                if (m_terms[tpos]) {
                    // The suffix ends here; the key must end here as well.
                    return kpos == key.size();
                }
                tpos += 1;
            } while (kpos < key.size());
            // The key is exhausted before the suffix.
            return false;
        } else {
            do {
                if (!m_chars[tpos] || key[kpos] != m_chars[tpos]) {
                    return false;
                }
                kpos += 1;
                tpos += 1;
            } while (kpos < key.size());
            // The suffix must end where the key ends.
            return !m_chars[tpos];
        }
    }

    //! Check if the non-empty 'key' is a prefix of (or equal to) the suffix stored at 'tpos'.
    inline bool prefix_match(std::string_view key, std::uint64_t tpos) const {
        assert(key.size() != 0);

        // The empty suffix (dummy at position 0) has no non-empty prefix.
        if (tpos == 0) {
            return false;
        }

        std::uint64_t kpos = 0;

        if (bin_mode()) {
            do {
                if (key[kpos] != m_chars[tpos]) {
                    return false;
                }
                kpos += 1;
                if (m_terms[tpos]) {
                    return kpos == key.size();
                }
                tpos += 1;
            } while (kpos < key.size());
            return true;
        } else {
            do {
                if (!m_chars[tpos] || key[kpos] != m_chars[tpos]) {
                    return false;
                }
                kpos += 1;
                tpos += 1;
            } while (kpos < key.size());
            return true;
        }
    }

    //! Pass the characters of the suffix stored at 'tpos' to 'fn', front to back.
    //! Nothing is passed for the empty suffix (tpos = 0).
    inline void decode(std::uint64_t tpos, const std::function<void(char)>& fn) const {
        if (bin_mode()) {
            if (tpos != 0) {
                do {
                    fn(m_chars[tpos]);
                } while (!m_terms[tpos++]);
            }
        } else {
            while (m_chars[tpos]) {
                fn(m_chars[tpos++]);
            }
        }
    }

    //! Get the length of the character array (including the dummy and the terminators).
    inline std::uint64_t size() const {
        return m_chars.size();
    }

    //! Visit the members (commonly used for I/O).
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_chars);
        visitor.visit(m_terms);
    }
};
} // namespace xcdat

468
include/xcdat/trie.hpp Normal file
View file

@ -0,0 +1,468 @@
#pragma once
#include <functional>
#include <optional>
#include <string>
#include "trie_builder.hpp"
namespace xcdat {
//! A compressed string dictionary based on an improved double-array trie.
//! 'BcVector' is the data type of Base and Check vectors.
template <class BcVector>
class trie {
  public:
    using trie_type = trie<BcVector>;
    using bc_vector_type = BcVector;

    static constexpr auto l1_bits = bc_vector_type::l1_bits;

  private:
    std::uint64_t m_num_keys = 0;
    code_table m_table;
    bit_vector m_terms;
    bc_vector_type m_bcvec;
    tail_vector m_tvec;

  public:
    //! Default constructor
    trie() = default;

    //! Default destructor
    virtual ~trie() = default;

    //! Copy constructor (deleted)
    trie(const trie&) = delete;

    //! Copy assignment (deleted)
    trie& operator=(const trie&) = delete;

    //! Move constructor
    trie(trie&&) noexcept = default;

    //! Move assignment
    trie& operator=(trie&&) noexcept = default;

    //! Build the trie from the input keywords, which are lexicographically sorted and unique.
    //!
    //! If bin_mode = false, the NULL character is used for the termination of a keyword.
    //! If bin_mode = true, bit flags are used instead, and the keywords can contain NULL characters.
    //! If the input keywords contain NULL characters, bin_mode will be forced to be set to true.
    //!
    //! The type 'Strings' and 'Strings::value_type' should be a random iterable container such as std::vector.
    //! Precisely, they should support the following operations:
    //!  - size() returns the container size.
    //!  - operator[](i) accesses the i-th element.
    //!  - begin() returns the iterator to the beginning.
    //!  - end() returns the iterator to the end.
    //! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
    template <class Strings>
    trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
        static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
    }

    //! Check if the binary mode.
    inline bool bin_mode() const {
        return m_tvec.bin_mode();
    }

    //! Get the number of stored keywords.
    inline std::uint64_t num_keys() const {
        return m_num_keys;
    }

    //! Get the alphabet size.
    inline std::uint64_t alphabet_size() const {
        return m_table.alphabet_size();
    }

    //! Get the maximum length of keywords.
    inline std::uint64_t max_length() const {
        return m_table.max_length();
    }

    //! Get the number of trie nodes.
    inline std::uint64_t num_nodes() const {
        return m_bcvec.num_nodes();
    }

    //! Get the number of DA units.
    inline std::uint64_t num_units() const {
        return m_bcvec.num_units();
    }

    //! Get the number of unused DA units.
    inline std::uint64_t num_free_units() const {
        return m_bcvec.num_free_units();
    }

    //! Get the length of TAIL vector.
    inline std::uint64_t tail_length() const {
        return m_tvec.size();
    }

    //! Lookup the ID of the keyword.
    //! Return std::nullopt if the keyword is not stored.
    inline std::optional<std::uint64_t> lookup(std::string_view key) const {
        std::uint64_t kpos = 0, npos = 0;
        while (!m_bcvec.is_leaf(npos)) {
            if (kpos == key.size()) {
                // The keyword ends at an inner node: it is stored only if the node is a terminal.
                if (!m_terms[npos]) {
                    return std::nullopt;
                }
                return npos_to_id(npos);
            }
            const std::uint64_t cpos = m_bcvec.base(npos) ^ m_table.get_code(key[kpos++]);
            if (m_bcvec.check(cpos) != npos) {
                return std::nullopt;
            }
            npos = cpos;
        }
        // At a leaf, the rest of the keyword must equal the suffix in the TAIL vector.
        const std::uint64_t tpos = m_bcvec.link(npos);
        if (!m_tvec.match(get_suffix(key, kpos), tpos)) {
            return std::nullopt;
        }
        return npos_to_id(npos);
    }

    //! Decode the keyword associated with the ID.
    //! An empty string is returned if the ID is out of range.
    inline std::string decode(std::uint64_t id) const {
        std::string decoded;
        decoded.reserve(max_length());
        decode(id, decoded);
        return decoded;
    }

    //! Decode the keyword associated with the ID and store it in 'decoded'.
    //! It can avoid reallocation of memory to store the result.
    //! 'decoded' is left empty if the ID is out of range.
    inline void decode(std::uint64_t id, std::string& decoded) const {
        decoded.clear();

        if (num_keys() <= id) {
            return;
        }

        std::uint64_t npos = id_to_npos(id);
        // UINT64_MAX marks "no TAIL suffix" for a keyword that ends at an inner node.
        std::uint64_t tpos = m_bcvec.is_leaf(npos) ? m_bcvec.link(npos) : UINT64_MAX;

        // Climb to the root, collecting the edge labels in reverse order.
        while (npos != 0) {
            const std::uint64_t ppos = m_bcvec.check(npos);
            decoded.push_back(m_table.get_char(m_bcvec.base(ppos) ^ npos));
            npos = ppos;
        }

        std::reverse(decoded.begin(), decoded.end());

        if (tpos != 0 && tpos != UINT64_MAX) {
            m_tvec.decode(tpos, [&](char c) { decoded.push_back(c); });
        }
    }

    //! An iterator class for common prefix search.
    //! It enumerates all the keywords contained as prefixes of a given string.
    //! It should be instantiated via the function 'make_prefix_iterator'.
    class prefix_iterator {
      private:
        const trie_type* m_obj = nullptr;
        std::string_view m_key;
        std::uint64_t m_id = 0;
        std::uint64_t m_kpos = 0;
        std::uint64_t m_npos = 0;
        bool is_beg = true;
        bool is_end = false;

      public:
        prefix_iterator() = default;

        //! Increment the iterator.
        //! Return false if the iteration is terminated.
        inline bool next() {
            return m_obj != nullptr && m_obj->next_prefix(this);
        }

        //! Get the result ID.
        inline std::uint64_t id() const {
            return m_id;
        }

        //! Get the result keyword.
        inline std::string decoded() const {
            return std::string(m_key.data(), m_kpos);
        }

        //! Get the reference to the result keyword.
        //! Note that the referenced data will be changed in the next iteration.
        inline std::string_view decoded_view() const {
            return std::string_view(m_key.data(), m_kpos);
        }

      private:
        prefix_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}

        friend class trie;
    };

    //! Make the common prefix searcher for the given keyword.
    inline prefix_iterator make_prefix_iterator(std::string_view key) const {
        return prefix_iterator(this, key);
    }

    //! Perform common prefix search for the keyword.
    inline void prefix_search(std::string_view key,
                              const std::function<void(std::uint64_t, std::string_view)>& fn) const {
        auto itr = make_prefix_iterator(key);
        while (itr.next()) {
            fn(itr.id(), itr.decoded_view());
        }
    }

    //! An iterator class for predictive search.
    //! It enumerates all the keywords starting with a given string.
    //! It should be instantiated via the function 'make_predictive_iterator'.
    class predictive_iterator {
      public:
        //! A pending node of the depth-first traversal.
        struct cursor_type {
            char label;  // Label of the edge leading to the node
            std::uint64_t kpos;  // Length of the decoded string at the node
            std::uint64_t npos;  // Node position
        };

      private:
        const trie_type* m_obj = nullptr;
        std::string_view m_key;
        std::uint64_t m_id = 0;
        std::string m_decoded;
        std::vector<cursor_type> m_stack;
        bool is_beg = true;
        bool is_end = false;

      public:
        predictive_iterator() = default;

        //! Increment the iterator.
        //! Return false if the iteration is terminated.
        inline bool next() {
            return m_obj != nullptr && m_obj->next_predictive(this);
        }

        //! Get the result ID.
        inline std::uint64_t id() const {
            return m_id;
        }

        //! Get the result keyword.
        inline std::string decoded() const {
            return m_decoded;
        }

        //! Get the reference to the result keyword.
        //! Note that the referenced data will be changed in the next iteration.
        inline std::string_view decoded_view() const {
            return m_decoded;
        }

      private:
        predictive_iterator(const trie_type* obj, std::string_view key) : m_obj(obj), m_key(key) {}

        friend class trie;
    };

    //! Make the predictive searcher for the keyword.
    inline predictive_iterator make_predictive_iterator(std::string_view key) const {
        return predictive_iterator(this, key);
    }

    //! Perform predictive search for the keyword.
    inline void predictive_search(std::string_view key,
                                  const std::function<void(std::uint64_t, std::string_view)>& fn) const {
        auto itr = make_predictive_iterator(key);
        while (itr.next()) {
            fn(itr.id(), itr.decoded_view());
        }
    }

    //! An iterator class for enumeration.
    //! It enumerates all the keywords stored in the trie.
    //! It should be instantiated via the function 'make_enumerative_iterator'.
    using enumerative_iterator = predictive_iterator;

    //! Make the enumerator.
    inline enumerative_iterator make_enumerative_iterator() const {
        return enumerative_iterator(this, "");
    }

    //! Enumerate all the keywords and their IDs stored in the trie.
    inline void enumerate(const std::function<void(std::uint64_t, std::string_view)>& fn) const {
        auto itr = make_enumerative_iterator();
        while (itr.next()) {
            fn(itr.id(), itr.decoded_view());
        }
    }

    //! Visit the members (commonly used for I/O).
    template <class Visitor>
    void visit(Visitor& visitor) {
        visitor.visit(m_num_keys);
        visitor.visit(m_table);
        visitor.visit(m_terms);
        visitor.visit(m_bcvec);
        visitor.visit(m_tvec);
    }

  private:
    //! Take over the structures prepared by the builder.
    template <class Strings>
    explicit trie(trie_builder<Strings>&& b)
        : m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
          m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}

    //! Get the part of 's' that starts at position 'i'.
    template <class String>
    static constexpr String get_suffix(const String& s, std::uint64_t i) {
        assert(i <= s.size());
        return s.substr(i, s.size() - i);
    }

    //! Convert a terminal node position into the keyword ID (rank over the terminal flags).
    inline std::uint64_t npos_to_id(std::uint64_t npos) const {
        return m_terms.rank(npos);
    }

    //! Convert a keyword ID into the terminal node position (select over the terminal flags).
    inline std::uint64_t id_to_npos(std::uint64_t id) const {
        return m_terms.select(id);
    }

    //! Advance the common prefix search by one result.
    //! On success the iterator holds the ID in 'm_id' and the length of the found prefix in 'm_kpos'.
    inline bool next_prefix(prefix_iterator* itr) const {
        if (itr->is_end) {
            return false;
        }

        if (itr->is_beg) {
            itr->is_beg = false;
            // The empty keyword is stored only when the root is an inner terminal. A leaf root is also
            // flagged as a terminal, but its keyword includes the TAIL suffix and is judged below.
            if (!m_bcvec.is_leaf(itr->m_npos) && m_terms[itr->m_npos]) {
                itr->m_id = npos_to_id(itr->m_npos);
                return true;
            }
        }

        while (!m_bcvec.is_leaf(itr->m_npos)) {
            if (itr->m_kpos == itr->m_key.size()) {
                // The query is exhausted at an inner node. A terminal here has already been reported on
                // arrival, and reading m_key[m_kpos] would run past the end of the view.
                itr->is_end = true;
                itr->m_id = num_keys();
                return false;
            }

            const std::uint64_t cpos = m_bcvec.base(itr->m_npos) ^ m_table.get_code(itr->m_key[itr->m_kpos++]);

            if (m_bcvec.check(cpos) != itr->m_npos) {
                itr->is_end = true;
                itr->m_id = num_keys();
                return false;
            }

            itr->m_npos = cpos;
            if (!m_bcvec.is_leaf(itr->m_npos) && m_terms[itr->m_npos]) {
                itr->m_id = npos_to_id(itr->m_npos);
                return true;
            }
        }

        // A leaf is the last candidate on the path.
        itr->is_end = true;

        // The keyword of the leaf is a prefix of the query iff its TAIL suffix is a prefix of the rest of the
        // query (an exact match of the rest would miss keywords shorter than the query).
        const std::uint64_t tpos = m_bcvec.link(itr->m_npos);
        std::uint64_t kpos = itr->m_kpos;
        bool matched = true;
        m_tvec.decode(tpos, [&](char c) {
            if (matched && kpos < itr->m_key.size() && itr->m_key[kpos] == c) {
                ++kpos;
            } else {
                matched = false;
            }
        });

        if (!matched) {
            itr->m_id = num_keys();
            return false;
        }

        itr->m_kpos = kpos;
        itr->m_id = npos_to_id(itr->m_npos);
        return true;
    }

    //! Advance the predictive search by one result.
    //! On success the iterator holds the ID in 'm_id' and the keyword in 'm_decoded'.
    inline bool next_predictive(predictive_iterator* itr) const {
        if (itr->is_end) {
            return false;
        }

        if (itr->is_beg) {
            itr->is_beg = false;

            // First, descend along the query.
            std::uint64_t kpos = 0;
            std::uint64_t npos = 0;

            for (; kpos < itr->m_key.size(); ++kpos) {
                if (m_bcvec.is_leaf(npos)) {
                    // A leaf is reached before the query is exhausted: it is the only candidate.
                    itr->is_end = true;

                    const std::uint64_t tpos = m_bcvec.link(npos);
                    if (tpos == 0) {
                        return false;
                    }

                    if (!m_tvec.prefix_match(get_suffix(itr->m_key, kpos), tpos)) {
                        return false;
                    }

                    itr->m_id = npos_to_id(npos);
                    m_tvec.decode(tpos, [&](char c) { itr->m_decoded.push_back(c); });

                    return true;
                }

                const std::uint64_t cpos = m_bcvec.base(npos) ^ m_table.get_code(itr->m_key[kpos]);

                if (m_bcvec.check(cpos) != npos) {
                    itr->is_end = true;
                    return false;
                }

                npos = cpos;
                itr->m_decoded.push_back(itr->m_key[kpos]);
            }

            // The node reached by the whole query is the root of the subtrie to be enumerated.
            if (!itr->m_decoded.empty()) {
                itr->m_stack.push_back({itr->m_decoded.back(), kpos, npos});
            } else {
                itr->m_stack.push_back({'\0', kpos, npos});
            }
        }

        // Then, continue the depth-first traversal of the subtrie.
        while (!itr->m_stack.empty()) {
            const char label = itr->m_stack.back().label;
            const std::uint64_t kpos = itr->m_stack.back().kpos;
            const std::uint64_t npos = itr->m_stack.back().npos;

            itr->m_stack.pop_back();

            if (0 < kpos) {
                // Cut the decoded string back to the depth of this node and put its edge label.
                itr->m_decoded.resize(kpos);
                itr->m_decoded.back() = label;
            }

            if (m_bcvec.is_leaf(npos)) {
                itr->m_id = npos_to_id(npos);
                m_tvec.decode(m_bcvec.link(npos), [&](char c) { itr->m_decoded.push_back(c); });
                return true;
            }

            // Push the children while walking the code table backwards, so that they are popped in the
            // forward order of the table.
            const std::uint64_t base = m_bcvec.base(npos);

            for (auto cit = m_table.rbegin(); cit != m_table.rend(); ++cit) {
                const std::uint64_t cpos = base ^ m_table.get_code(*cit);
                if (m_bcvec.check(cpos) == npos) {
                    itr->m_stack.push_back({static_cast<char>(*cit), kpos + 1, cpos});
                }
            }

            if (m_terms[npos]) {
                itr->m_id = npos_to_id(npos);
                return true;
            }
        }

        itr->is_end = true;
        return false;
    }
};
} // namespace xcdat

View file

@ -0,0 +1,265 @@
#pragma once
#include <algorithm>
#include <iostream>
#include <string_view>
// #include "bc_vector.hpp"
#include "code_table.hpp"
#include "exception.hpp"
#include "tail_vector.hpp"
namespace xcdat {
//! A builder of the double-array trie.
//! The whole construction runs in the constructor; the class 'trie' then takes over the members.
//! Unused units are chained in a circular doubly linked list that reuses 'base' as the next link and
//! 'check' as the previous link.
template <class Strings>
class trie_builder {
    template <class>
    friend class trie;

  public:
    struct unit_type {
        std::uint64_t base;
        std::uint64_t check;
    };

  private:
    static constexpr std::uint64_t taboo_npos = 1;
    static constexpr std::uint64_t free_blocks = 16;

    const Strings& m_keys;
    const std::uint32_t m_l1_bits;  // # of bits for L1 layer of DACs
    const std::uint64_t m_l1_size;

    bool m_bin_mode = false;

    code_table m_table;
    std::vector<unit_type> m_units;
    bit_vector::builder m_leaves;
    bit_vector::builder m_terms;
    bit_vector::builder m_useds;
    std::vector<std::uint64_t> m_heads;  // for L1 blocks
    std::vector<std::uint8_t> m_edges;
    tail_vector::builder m_suffixes;

  public:
    //! Build the trie structures from 'keys', which must be lexicographically sorted, unique and not empty.
    //! 'keys' is held by reference and must outlive the builder.
    explicit trie_builder(const Strings& keys, std::uint32_t l1_bits, bool bin_mode)
        : m_keys(keys), m_l1_bits(std::min(l1_bits, 8U)), m_l1_size(1ULL << m_l1_bits), m_bin_mode(bin_mode) {
        XCDAT_THROW_IF(m_keys.size() == 0, "The input dataset is empty.");

        // Reserve
        {
            std::uint64_t init_capa = 1;
            while (init_capa < m_keys.size()) {
                init_capa <<= 1;
            }
            m_units.reserve(init_capa);
            m_leaves.reserve(init_capa);
            m_terms.reserve(init_capa);
            m_useds.reserve(init_capa);
            m_heads.reserve(init_capa >> m_l1_bits);
            m_edges.reserve(256);
        }

        // Initialize an empty list.
        for (std::uint64_t npos = 0; npos < 256; ++npos) {
            m_units.push_back(unit_type{npos + 1, npos - 1});
            m_leaves.push_back(false);
            m_terms.push_back(false);
            m_useds.push_back(false);
        }
        // Close the ring: the last unit links to the first one and vice versa.
        m_units[255].base = 0;
        m_units[0].check = 255;

        for (std::uint64_t npos = 0; npos < 256; npos += m_l1_size) {
            m_heads.push_back(npos);
        }

        // Fix the root
        use_unit(0);
        m_units[0].check = taboo_npos;
        // The taboo unit is flagged as used but stays in the list, where it serves as the entry point.
        m_useds.set_bit(taboo_npos, true);
        m_heads[taboo_npos >> m_l1_bits] = m_units[taboo_npos].base;

        // Build the code table
        m_table = code_table(keys);
        m_bin_mode |= m_table.has_null();

        // Build the BC units
        arrange(0, m_keys.size(), 0, 0);

        // Finish
        finish();

        // Build the TAIL vector
        m_suffixes.complete(m_bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { m_units[npos].base = tpos; });
    }

    virtual ~trie_builder() = default;

    trie_builder(const trie_builder&) = delete;
    trie_builder& operator=(const trie_builder&) = delete;

    trie_builder(trie_builder&&) noexcept = default;
    trie_builder& operator=(trie_builder&&) noexcept = default;

  private:
    //! Flag the unit as used and unlink it from the list of unused units.
    inline void use_unit(std::uint64_t npos) {
        m_useds.set_bit(npos);

        const auto next = m_units[npos].base;
        const auto prev = m_units[npos].check;
        m_units[prev].base = next;
        m_units[next].check = prev;

        // Keep the head of the L1 block up to date: move it to the next unused unit of the same block,
        // or mark the block as having none.
        const auto lpos = npos >> m_l1_bits;
        if (m_heads[lpos] == npos) {
            m_heads[lpos] = (lpos != next >> m_l1_bits) ? taboo_npos : next;
        }
    }

    //! Remove all the unused units of the 256-unit block 'bpos' from the list.
    inline void close_block(std::uint64_t bpos) {
        const auto beg_npos = bpos * 256;
        const auto end_npos = beg_npos + 256;

        for (auto npos = beg_npos; npos < end_npos; ++npos) {
            if (!m_useds[npos]) {
                // Unlink the unit, but keep it flagged as unused and make it refer to itself.
                use_unit(npos);
                m_useds.set_bit(npos, false);
                m_units[npos].base = npos;
                m_units[npos].check = npos;
            }
        }

        for (auto npos = beg_npos; npos < end_npos; npos += m_l1_size) {
            m_heads[npos >> m_l1_bits] = taboo_npos;
        }
    }

    //! Append a new block of 256 unused units.
    void expand() {
        const auto old_size = static_cast<std::uint64_t>(m_units.size());
        const auto new_size = old_size + 256;

        for (auto npos = old_size; npos < new_size; ++npos) {
            m_units.push_back({npos + 1, npos - 1});
            m_leaves.push_back(false);
            m_terms.push_back(false);
            m_useds.push_back(false);
        }

        // Splice the new units between the current last unit of the list and the taboo unit.
        {
            const auto last_npos = m_units[taboo_npos].check;
            m_units[old_size].check = last_npos;
            m_units[last_npos].base = old_size;
            m_units[new_size - 1].base = taboo_npos;
            m_units[taboo_npos].check = new_size - 1;
        }

        for (auto npos = old_size; npos < new_size; npos += m_l1_size) {
            m_heads.push_back(npos);
        }

        // Only the latest 'free_blocks' blocks are kept open for searching.
        const auto bpos = old_size / 256;
        if (free_blocks <= bpos) {
            close_block(bpos - free_blocks);
        }
    }

    //! Close every block until the list holds no unit other than the taboo one.
    void finish() {
        while (m_units[taboo_npos].base != taboo_npos) {
            auto bpos = m_units[taboo_npos].base / 256;
            close_block(bpos);
        }
    }

    //! Build the subtrie of the node 'npos' for the keys in [beg, end), which share the first 'kpos' characters.
    void arrange(std::uint64_t beg, std::uint64_t end, std::uint64_t kpos, std::uint64_t npos) {
        if (m_keys[beg].size() == kpos) {
            m_terms.set_bit(npos, true);
            if (++beg == end) {  // without link?
                m_units[npos].base = 0;  // with an empty suffix
                m_leaves.set_bit(npos, true);
                return;
            }
        } else if (beg + 1 == end) {  // leaf?
            XCDAT_THROW_IF(m_keys[beg].size() <= kpos, "The input keys are not unique.");
            m_terms.set_bit(npos, true);
            m_leaves.set_bit(npos, true);
            m_suffixes.set_suffix({m_keys[beg].data() + kpos, m_keys[beg].size() - kpos}, npos);
            return;
        }

        // fetching edges
        {
            m_edges.clear();
            // Every key left in the range must have a character at 'kpos'. A key that ends here is either a
            // duplicate of the one handled above or out of order; detect it before indexing past its end.
            XCDAT_THROW_IF(m_keys[beg].size() <= kpos, "The input keys are not unique.");
            auto ch = static_cast<std::uint8_t>(m_keys[beg][kpos]);
            for (auto i = beg + 1; i < end; ++i) {
                XCDAT_THROW_IF(m_keys[i].size() <= kpos, "The input keys are not in lexicographical order.");
                const auto next_ch = static_cast<std::uint8_t>(m_keys[i][kpos]);
                if (ch != next_ch) {
                    XCDAT_THROW_IF(next_ch < ch, "The input keys are not in lexicographical order.");
                    m_edges.push_back(ch);
                    ch = next_ch;
                }
            }
            m_edges.push_back(ch);
        }

        const auto base = xcheck(npos >> m_l1_bits);
        if (m_units.size() <= base) {
            expand();
        }

        // defining new edges
        m_units[npos].base = base;
        for (const auto ch : m_edges) {
            const auto child_id = base ^ m_table.get_code(ch);
            use_unit(child_id);
            m_units[child_id].check = npos;
        }

        // following the children
        auto i = beg;
        auto ch = static_cast<uint8_t>(m_keys[beg][kpos]);
        for (auto j = beg + 1; j < end; ++j) {
            const auto next_ch = static_cast<uint8_t>(m_keys[j][kpos]);
            if (ch != next_ch) {
                arrange(i, j, kpos + 1, base ^ m_table.get_code(ch));
                ch = next_ch;
                i = j;
            }
        }
        arrange(i, end, kpos + 1, base ^ m_table.get_code(ch));
    }

    //! Find a base value with which all the children for 'm_edges' fall on unused units.
    //! 'lpos' is the L1 block of the parent node; it is searched first.
    //! A value not less than m_units.size() is returned when no unit fits (the caller then expands).
    inline std::uint64_t xcheck(std::uint64_t lpos) const {
        if (m_units[taboo_npos].base == taboo_npos) {  // Full?
            return m_units.size() ^ m_table.get_code(m_edges[0]);
        }

        // First, search in the same L1 block
        for (auto i = m_heads[lpos]; i != taboo_npos && i >> m_l1_bits == lpos; i = m_units[i].base) {
            const auto base = i ^ m_table.get_code(m_edges[0]);
            if (is_target(base)) {
                return base;  // base / block_size_ == lpos
            }
        }

        // Second, search in the other blocks
        for (auto i = m_units[taboo_npos].base; i != taboo_npos; i = m_units[i].base) {
            const auto base = i ^ m_table.get_code(m_edges[0]);
            if (is_target(base)) {
                return base;  // base / block_size_ != lpos
            }
        }

        return m_units.size() ^ m_table.get_code(m_edges[0]);
    }

    //! Check if all the children for 'm_edges' fall on unused units with the given base value.
    inline bool is_target(std::uint64_t base) const {
        for (const auto ch : m_edges) {
            if (m_useds[base ^ m_table.get_code(ch)]) {
                return false;
            }
        }
        return true;
    }
};
} // namespace xcdat

1
sample/CMakeLists.txt Normal file
View file

@ -0,0 +1 @@
add_executable(sample sample.cpp)

92
sample/sample.cpp Normal file
View file

@ -0,0 +1,92 @@
#include <iostream>
#include <string>
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
// A tour of the library: build a dictionary, save it, map it back and run every kind of query.
// NOTE(review): this file uses std::sort/std::unique (<algorithm>), std::vector (<vector>) and
// std::remove(const char*) (<cstdio>) without including those headers directly.
int main() {
    // Dataset of keywords
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Air", "MacBook_Pro",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
    };

    // The input keys must be sorted and unique (although they have already satisfied in this case).
    std::sort(keys.begin(), keys.end());
    keys.erase(std::unique(keys.begin(), keys.end()), keys.end());

    // The trie dictionary type
    using trie_type = xcdat::trie_8_type;

    // The dictionary filename
    const char* tmp_filename = "dic.bin";

    // Build and save the trie dictionary.
    {
        const trie_type trie(keys);
        xcdat::save(trie, tmp_filename);
    }

    // Everything that reads the mapped dictionary lives in this scope, so that the mapping object is
    // destroyed before the file is deleted below (deleting a file that is still mapped fails on some
    // platforms).
    {
        // Memory-map the trie dictionary.
        const mm::file_source<char> fin(tmp_filename, mm::advice::sequential);
        const auto trie = xcdat::mmap<trie_type>(fin.data());

        // Or, load the trie dictionary on memory.
        // const auto trie = xcdat::load<trie_type>(tmp_filename);

        // Basic statistics
        std::cout << "Number of keys: " << trie.num_keys() << '\n';
        std::cout << "Number of trie nodes: " << trie.num_nodes() << '\n';
        std::cout << "Number of DA units: " << trie.num_units() << '\n';
        std::cout << "Memory usage in bytes: " << xcdat::memory_in_bytes(trie) << '\n';

        // Lookup the ID for a query key.
        {
            const auto id = trie.lookup("Mac_Pro");
            std::cout << "Lookup(Mac_Pro) = " << id.value_or(UINT64_MAX) << '\n';
        }
        {
            const auto id = trie.lookup("Google_Pixel");
            std::cout << "Lookup(Google_Pixel) = " << id.value_or(UINT64_MAX) << '\n';
        }

        // Decode the key for a query ID.
        {
            const auto dec = trie.decode(4);
            std::cout << "Decode(4) = " << dec << '\n';
        }

        // Common prefix search
        {
            std::cout << "CommonPrefixSearch(MacBook_Air) = {" << '\n';
            auto itr = trie.make_prefix_iterator("MacBook_Air");
            while (itr.next()) {
                std::cout << "   (" << itr.decoded_view() << ", " << itr.id() << ")," << '\n';
            }
            std::cout << "}" << '\n';
        }

        // Predictive search
        {
            std::cout << "PredictiveSearch(Mac) = {" << '\n';
            auto itr = trie.make_predictive_iterator("Mac");
            while (itr.next()) {
                std::cout << "   (" << itr.decoded_view() << ", " << itr.id() << ")," << '\n';
            }
            std::cout << "}" << '\n';
        }

        // Enumerate all the keys (in lex order).
        {
            std::cout << "Enumerate() = {" << '\n';
            auto itr = trie.make_enumerative_iterator();
            while (itr.next()) {
                std::cout << "   (" << itr.decoded_view() << ", " << itr.id() << ")," << '\n';
            }
            std::cout << "}" << '\n';
        }
    }

    // Clean up the temporary dictionary file.
    std::remove(tmp_filename);

    return 0;
}

24
tests/CMakeLists.txt Normal file
View file

@ -0,0 +1,24 @@
# Each test is a stand-alone executable registered with CTest.
add_executable(test_bit_vector test_bit_vector.cpp)
add_test(test_bit_vector test_bit_vector)
add_executable(test_compact_vector test_compact_vector.cpp)
add_test(test_compact_vector test_compact_vector)
add_executable(test_tail_vector test_tail_vector.cpp)
add_test(test_tail_vector test_tail_vector)
# The remaining sources are compiled once per option; the option is passed to the source as the
# compile definition BC_VECTOR_<option> or TRIE_<option>.
set(BC_OPTIONS "7" "8")
# test_bc_vector.cpp -> test_bc_vector_7 and test_bc_vector_8
foreach(BC_OPTION ${BC_OPTIONS})
set(TEST_SRC_NAME test_bc_vector_${BC_OPTION})
add_executable(${TEST_SRC_NAME} test_bc_vector.cpp)
set_target_properties(${TEST_SRC_NAME} PROPERTIES COMPILE_DEFINITIONS BC_VECTOR_${BC_OPTION})
add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME})
endforeach(BC_OPTION)
# test_trie.cpp -> test_trie_7 and test_trie_8
foreach(BC_OPTION ${BC_OPTIONS})
set(TEST_SRC_NAME test_trie_${BC_OPTION})
add_executable(${TEST_SRC_NAME} test_trie.cpp)
set_target_properties(${TEST_SRC_NAME} PROPERTIES COMPILE_DEFINITIONS TRIE_${BC_OPTION})
add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME})
endforeach(BC_OPTION)

6260
tests/doctest/doctest.h Normal file

File diff suppressed because it is too large Load diff

9976
tests/keys.txt Normal file

File diff suppressed because it is too large Load diff

75
tests/test_bc_vector.cpp Normal file
View file

@ -0,0 +1,75 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <random>
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/bc_vector_7.hpp"
#include "xcdat/bc_vector_8.hpp"
// Select the implementation under test from the compile definition given by the build script.
// 'defined(...)' is used for both branches so that a definition without a value is accepted as well.
#if defined(BC_VECTOR_7)
using bc_vector_type = xcdat::bc_vector_7;
#elif defined(BC_VECTOR_8)
using bc_vector_type = xcdat::bc_vector_8;
#endif
// A plain BASE/CHECK pair; a vector of these is the input handed to the bc_vector under test.
struct bc_unit {
std::uint64_t base;
std::uint64_t check;
};
std::vector<bc_unit> make_random_units(std::uint64_t n, std::uint64_t maxv, std::uint64_t seed = 13) {
std::mt19937_64 engine(seed);
std::uniform_int_distribution<std::uint64_t> dist(0, maxv);
std::vector<bc_unit> bc_units(n);
for (std::uint64_t i = 0; i < n; i++) {
bc_units[i].base = dist(engine);
bc_units[i].check = dist(engine);
}
return bc_units;
}
// Copy the bits into a bit_vector builder of the same length.
xcdat::bit_vector::builder to_bit_vector_builder(const std::vector<bool>& bits) {
    xcdat::bit_vector::builder bvb(bits.size());
    std::uint64_t pos = 0;
    for (const bool bit : bits) {
        bvb.set_bit(pos, bit);
        ++pos;
    }
    return bvb;
}
// Count the set bits.
// std::count comes from <algorithm>, which this file includes; std::accumulate would need <numeric>.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
void test_bc_vector(const std::vector<bc_unit>& bc_units, const std::vector<bool>& leaves) {
bc_vector_type bc(bc_units, to_bit_vector_builder(leaves));
REQUIRE_EQ(bc.num_units(), bc_units.size());
REQUIRE_EQ(bc.num_leaves(), get_num_ones(leaves));
for (std::uint64_t i = 0; i < bc.num_units(); i++) {
REQUIRE_EQ(bc.is_leaf(i), leaves[i]);
if (leaves[i]) {
REQUIRE_EQ(bc.link(i), bc_units[i].base);
} else {
REQUIRE_EQ(bc.base(i), bc_units[i].base);
}
REQUIRE_EQ(bc.check(i), bc_units[i].check);
}
}
// 10,000 units with values below the number of units; the leaf flags are set with density 0.2.
TEST_CASE("Test bc_vector 10K in [0,10K)") {
const std::uint64_t size = 10000;
auto bc_units = make_random_units(size, size - 1);
auto leaves = xcdat::test::make_random_bits(size, 0.2);
test_bc_vector(bc_units, leaves);
}
// 10,000 units with values over the whole 64-bit range.
// Note that make_random_units draws from a closed range, so UINT64_MAX itself can occur.
TEST_CASE("Test bc_vector 10K in [0,UINT64_MAX)") {
const std::uint64_t size = 10000;
auto bc_units = make_random_units(size, UINT64_MAX);
auto leaves = xcdat::test::make_random_bits(size, 0.2);
test_bc_vector(bc_units, leaves);
}

113
tests/test_bit_vector.cpp Normal file
View file

@ -0,0 +1,113 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <random>
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/bit_vector.hpp"
// Count the set bits.
// std::count comes from <algorithm>, which this file includes; std::accumulate would need <numeric>.
std::uint64_t get_num_ones(const std::vector<bool>& bits) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.end(), true));
}
// Reference implementation of rank: count the set bits in bits[0, i).
// 'i' must not exceed bits.size().
// std::count comes from <algorithm>, which this file includes; std::accumulate would need <numeric>.
std::uint64_t rank_naive(const std::vector<bool>& bits, std::uint64_t i) {
    return static_cast<std::uint64_t>(std::count(bits.begin(), bits.begin() + i, true));
}
// Reference implementation of select: the position of the n-th set bit (n counts from 0).
// bits.size() is returned when there are not that many set bits.
std::uint64_t select_naive(const std::vector<bool>& bits, std::uint64_t n) {
    std::uint64_t remaining = n;
    for (std::uint64_t pos = 0; pos < bits.size(); ++pos) {
        if (!bits[pos]) {
            continue;
        }
        if (remaining == 0) {
            return pos;
        }
        --remaining;
    }
    return bits.size();
}
// Build a bit_vector from 'bits' and compare its access, rank and select results with the naive
// reference implementations. Rank and select are probed at 100 random arguments each.
void test_rank_select(const std::vector<bool>& bits) {
    xcdat::bit_vector bv;
    {
        xcdat::bit_vector::builder bvb(bits.size());
        std::uint64_t pos = 0;
        for (const bool bit : bits) {
            bvb.set_bit(pos, bit);
            ++pos;
        }
        bv = xcdat::bit_vector(bvb, true, true);
    }

    REQUIRE_EQ(bv.size(), bits.size());
    REQUIRE_EQ(bv.num_ones(), get_num_ones(bits));

    for (std::uint64_t pos = 0; pos != bits.size(); ++pos) {
        REQUIRE_EQ(bv[pos], bits[pos]);
    }

    static constexpr std::uint64_t seed = 17;
    std::mt19937_64 engine(seed);

    // Rank is probed at every position up to and including the size.
    {
        std::uniform_int_distribution<std::uint64_t> pick_pos(0, bv.size());
        for (std::uint64_t round = 0; round != 100; ++round) {
            const std::uint64_t i = pick_pos(engine);
            REQUIRE_EQ(bv.rank(i), rank_naive(bits, i));
        }
    }

    // Select is defined only when there is at least one set bit.
    if (bv.num_ones() == 0) {
        return;
    }
    std::uniform_int_distribution<std::uint64_t> pick_nth(0, bv.num_ones() - 1);
    for (std::uint64_t round = 0; round != 100; ++round) {
        const std::uint64_t n = pick_nth(engine);
        REQUIRE_EQ(bv.select(n), select_naive(bits, n));
    }
}
// The builder is sized up front with resize(), filled with set_bit() and read back bit by bit.
TEST_CASE("Test bit_vector::builder with resize") {
const auto bits = xcdat::test::make_random_bits(10000);
xcdat::bit_vector::builder bvb;
bvb.resize(bits.size());
REQUIRE_EQ(bvb.size(), bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
bvb.set_bit(i, bits[i]);
}
for (std::uint64_t i = 0; i < bits.size(); i++) {
REQUIRE_EQ(bvb[i], bits[i]);
}
}
// The builder grows one bit at a time with push_back() and is read back bit by bit.
TEST_CASE("Test bit_vector::builder with push_back") {
const auto bits = xcdat::test::make_random_bits(10000);
xcdat::bit_vector::builder bvb;
bvb.reserve(bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
bvb.push_back(bits[i]);
}
REQUIRE_EQ(bvb.size(), bits.size());
for (std::uint64_t i = 0; i < bits.size(); i++) {
REQUIRE_EQ(bvb[i], bits[i]);
}
}
// Rank/select on bits generated with the default density.
TEST_CASE("Test rank/select operations") {
const auto bits = xcdat::test::make_random_bits(10000);
test_rank_select(bits);
}
// Density 0.0: no bit is set, so the select part of test_rank_select is skipped.
TEST_CASE("Test rank/select operations (all zeros)") {
const auto bits = xcdat::test::make_random_bits(10000, 0.0);
test_rank_select(bits);
}
// A density above 1.0 is passed so that every bit is set.
TEST_CASE("Test rank/select operations (all ones)") {
const auto bits = xcdat::test::make_random_bits(10000, 1.1);
test_rank_select(bits);
}

84
tests/test_common.hpp Normal file
View file

@ -0,0 +1,84 @@
#pragma once
#include <algorithm>
#include <iostream>
#include <random>
#include <string>
#include <vector>
namespace xcdat::test {
// Sorts `vec` in ascending order, removes duplicate elements and returns the result,
// reusing the storage of the argument.
template <class T>
std::vector<T> to_unique_vec(std::vector<T>&& vec) {
    std::sort(std::begin(vec), std::end(vec));
    const auto dup_begin = std::unique(std::begin(vec), std::end(vec));
    vec.erase(dup_begin, std::end(vec));
    // `vec` is a named rvalue reference; the explicit move avoids a copy under C++17 rules.
    return std::move(vec);
}
// Returns the length of the longest string in `keys`, or 0 when `keys` is empty.
// Declared `inline` because it is defined in a header: without it, including this header from
// two translation units of the same binary produces duplicate definitions.
inline std::uint64_t max_length(const std::vector<std::string>& keys) {
    std::uint64_t longest = 0;
    for (const auto& key : keys) {
        longest = std::max<std::uint64_t>(longest, key.size());
    }
    return longest;
}
// Generates `n` pseudo-random bits from a generator seeded with `seed`.
// Each bit is set when a sample drawn from uniform_real_distribution(0.0, 1.0) is below `dens`,
// so dens <= 0.0 gives all zeros and dens > 1.0 gives all ones.
// Declared `inline` because it is defined in a header (avoids duplicate definitions across
// translation units).
inline std::vector<bool> make_random_bits(std::uint64_t n, double dens = 0.5, std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_real_distribution<double> dist(0.0, 1.0);

    std::vector<bool> bits(n);
    for (std::uint64_t i = 0; i < n; i++) {
        bits[i] = dist(engine) < dens;
    }
    return bits;
}
// Generates `n` pseudo-random integers uniformly drawn from the closed range [min, max],
// using a generator seeded with `seed`.
// Declared `inline` because it is defined in a header (avoids duplicate definitions across
// translation units).
inline std::vector<std::uint64_t> make_random_ints(std::uint64_t n, std::uint64_t min, std::uint64_t max,
                                                   std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_int_distribution<std::uint64_t> dist(min, max);

    std::vector<std::uint64_t> ints(n);
    for (std::uint64_t i = 0; i < n; i++) {
        ints[i] = dist(engine);
    }
    return ints;
}
// Generates `n` pseudo-random strings using a generator seeded with `seed`.
// Each string has a length drawn uniformly from [min_m, max_m] and characters drawn uniformly
// from [min_c, max_c] (compared as `char` values, so min_c must not exceed max_c).
//
// The character distribution is instantiated with `int`: `char` is not among the integer types
// the standard permits for std::uniform_int_distribution, and some standard libraries reject it.
// Declared `inline` because it is defined in a header (avoids duplicate definitions across
// translation units).
inline std::vector<std::string> make_random_keys(std::uint64_t n, std::uint64_t min_m, std::uint64_t max_m,  //
                                                 char min_c = 'A', char max_c = 'Z', std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_int_distribution<std::uint64_t> dist_m(min_m, max_m);
    std::uniform_int_distribution<int> dist_c(static_cast<int>(min_c), static_cast<int>(max_c));

    std::vector<std::string> keys(n);
    for (std::uint64_t i = 0; i < n; i++) {
        // The length is drawn first, then one draw per character, in order.
        keys[i].resize(dist_m(engine));
        for (std::uint64_t j = 0; j < keys[i].size(); j++) {
            keys[i][j] = static_cast<char>(dist_c(engine));
        }
    }
    return keys;
}
// Randomly splits `keys` in two, preserving relative order in both parts.
// For every key a sample is drawn from uniform_real_distribution(0.0, 1.0): the key stays in
// `keys` when `ratio` is below the sample, otherwise it is removed and returned.
// On return `keys` holds the kept keys and the result holds the extracted ones.
//
// Strings are moved rather than copied, since the original contents of `keys` are replaced anyway.
// Declared `inline` because it is defined in a header (avoids duplicate definitions across
// translation units).
inline std::vector<std::string> extract_keys(std::vector<std::string>& keys, double ratio = 0.1,
                                             std::uint64_t seed = 13) {
    std::mt19937_64 engine(seed);
    std::uniform_real_distribution<double> dist(0.0, 1.0);

    std::vector<std::string> kept;
    std::vector<std::string> extracted;
    for (std::uint64_t i = 0; i < keys.size(); ++i) {
        if (ratio < dist(engine)) {
            kept.push_back(std::move(keys[i]));
        } else {
            extracted.push_back(std::move(keys[i]));
        }
    }
    keys = std::move(kept);
    return extracted;
}
} // namespace xcdat::test

View file

@ -0,0 +1,41 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <random>
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/compact_vector.hpp"
// A compact_vector built from all-zero input must report the same size and elements.
TEST_CASE("Test compact_vector (zero)") {
    std::vector<std::uint64_t> ints = {0, 0, 0, 0, 0};
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());
    for (std::uint64_t pos = 0; pos != ints.size(); ++pos) {
        REQUIRE_EQ(cv[pos], ints[pos]);
    }
}
// A compact_vector built from a small hand-written input must round-trip every element.
TEST_CASE("Test compact_vector (tiny)") {
    std::vector<std::uint64_t> ints = {2, 0, 14, 456, 32, 5544, 23};
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());
    for (std::uint64_t pos = 0; pos != ints.size(); ++pos) {
        REQUIRE_EQ(cv[pos], ints[pos]);
    }
}
// A compact_vector built from 10000 random values in [0, UINT16_MAX] must round-trip every element.
TEST_CASE("Test compact_vector (random)") {
    std::vector<std::uint64_t> ints = xcdat::test::make_random_ints(10000, 0, UINT16_MAX);
    xcdat::compact_vector cv(ints);

    REQUIRE_EQ(cv.size(), ints.size());
    for (std::uint64_t pos = 0; pos != ints.size(); ++pos) {
        REQUIRE_EQ(cv[pos], ints[pos]);
    }
}

View file

@ -0,0 +1,51 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <random>
#include "doctest/doctest.h"
#include "test_common.hpp"
#include "xcdat/tail_vector.hpp"
void test_tail_vector(const std::vector<std::string>& sufs, bool bin_mode = false) {
xcdat::tail_vector tvec;
std::vector<std::uint64_t> idxs(sufs.size());
{
xcdat::tail_vector::builder tvb;
for (std::uint64_t i = 0; i < sufs.size(); i++) {
tvb.set_suffix(sufs[i], i);
}
tvb.complete(bin_mode, [&](std::uint64_t npos, std::uint64_t tpos) { idxs[npos] = tpos; });
tvec = xcdat::tail_vector(std::move(tvb));
}
for (std::uint64_t i = 0; i < sufs.size(); i++) {
REQUIRE(tvec.match(sufs[i], idxs[i]));
}
for (std::uint64_t i = 0; i < sufs.size(); i++) {
std::string decoded;
tvec.decode(idxs[i], [&](char c) { decoded.push_back(c); });
REQUIRE_EQ(sufs[i], decoded);
}
}
// Small hand-written suffix set, including repeated entries.
TEST_CASE("Test xcdat::tail_vector (tiny)") {
    const std::vector<std::string> sufs = {"ML", "STATS", "A", "M", "L", "AKDD", "M", "R", "DD", "OD"};
    test_tail_vector(sufs);
}
// 10000 random suffixes of length 1..30 over the alphabet 'A'..'B'.
TEST_CASE("Test xcdat::tail_vector (random, A--B)") {
    test_tail_vector(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B'));
}
// 10000 random suffixes of length 1..30 over the alphabet 'A'..'Z'.
TEST_CASE("Test xcdat::tail_vector (random, A--Z)") {
    test_tail_vector(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z'));
}
// 10000 random suffixes of length 1..30 with characters in INT8_MIN..INT8_MAX, stored in binary mode.
TEST_CASE("Test xcdat::tail_vector (random, 0x00--0xFF)") {
    test_tail_vector(xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX), true);
}

297
tests/test_trie.cpp Normal file
View file

@ -0,0 +1,297 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <iostream>
#include <random>
#include <string>
#include "doctest/doctest.h"
#include "mm_file/mm_file.hpp"
#include "test_common.hpp"
#include "xcdat.hpp"
#ifdef TRIE_7
using trie_type = xcdat::trie_7_type;
#elif TRIE_8
using trie_type = xcdat::trie_8_type;
#endif
// Checks num_keys/max_length, that every key in `keys` is found and decodes back to itself,
// and that no key in `others` is found.
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
                           const std::vector<std::string>& others) {
    REQUIRE_EQ(trie.num_keys(), keys.size());
    REQUIRE_EQ(trie.max_length(), xcdat::test::max_length(keys));

    for (const auto& key : keys) {
        auto id = trie.lookup(key);
        REQUIRE(id.has_value());
        REQUIRE_LT(id.value(), keys.size());
        auto decoded = trie.decode(id.value());
        REQUIRE_EQ(key, decoded);
    }

    for (const auto& other : others) {
        auto id = trie.lookup(other);
        REQUIRE_FALSE(id.has_value());
    }
}
// Runs the prefix iterator for every query and checks that each reported entry is consistent
// with lookup() and decode().
// For registered keys at least one and at most key.size() results are expected; for `others`
// every result must be strictly shorter than the query.
void test_prefix_search(const trie_type& trie, const std::vector<std::string>& keys,
                        const std::vector<std::string>& others) {
    for (const auto& query : keys) {
        size_t hits = 0;
        for (auto itr = trie.make_prefix_iterator(query); itr.next(); ++hits) {
            const auto id = itr.id();
            const auto decoded = itr.decoded_view();
            REQUIRE_LE(decoded.size(), query.size());
            REQUIRE_EQ(id, trie.lookup(decoded));
            REQUIRE_EQ(decoded, trie.decode(id));
        }
        REQUIRE_LE(1, hits);
        REQUIRE_LE(hits, query.size());
    }

    for (const auto& query : others) {
        size_t hits = 0;
        for (auto itr = trie.make_prefix_iterator(query); itr.next(); ++hits) {
            const auto id = itr.id();
            const auto decoded = itr.decoded_view();
            REQUIRE_LT(decoded.size(), query.size());
            REQUIRE_EQ(id, trie.lookup(decoded));
            REQUIRE_EQ(decoded, trie.decode(id));
        }
        REQUIRE_LT(hits, query.size());
    }
}
// Runs the predictive iterator for every query and checks that each reported entry is
// consistent with lookup() and decode().
// For registered keys every result is at least as long as the query and at least one result is
// expected; for `others` every result must be strictly longer than the query.
void test_predictive_search(const trie_type& trie, const std::vector<std::string>& keys,
                            const std::vector<std::string>& others) {
    for (const auto& query : keys) {
        size_t hits = 0;
        for (auto itr = trie.make_predictive_iterator(query); itr.next(); ++hits) {
            const auto id = itr.id();
            const auto decoded = itr.decoded_view();
            REQUIRE_LE(query.size(), decoded.size());
            REQUIRE_EQ(id, trie.lookup(decoded));
            REQUIRE_EQ(decoded, trie.decode(id));
        }
        REQUIRE_LE(1, hits);
    }

    for (const auto& query : others) {
        for (auto itr = trie.make_predictive_iterator(query); itr.next();) {
            const auto id = itr.id();
            const auto decoded = itr.decoded_view();
            REQUIRE_LT(query.size(), decoded.size());
            REQUIRE_EQ(id, trie.lookup(decoded));
            REQUIRE_EQ(decoded, trie.decode(id));
        }
    }
}
// Checks that the enumerative iterator yields exactly the entries of `keys`, in the same order,
// each with the id reported by lookup(), and then stops.
void test_enumerate(const trie_type& trie, const std::vector<std::string>& keys) {
    auto itr = trie.make_enumerative_iterator();
    for (std::uint64_t pos = 0; pos != keys.size(); ++pos) {
        const auto& expected = keys[pos];
        REQUIRE(itr.next());
        REQUIRE_EQ(itr.decoded_view(), expected);
        REQUIRE_EQ(itr.id(), trie.lookup(expected));
    }
    REQUIRE_FALSE(itr.next());
}
void test_io(const trie_type& trie, const std::vector<std::string>& keys, const std::vector<std::string>& others) {
const char* tmp_filepath = "tmp.idx";
const std::uint64_t memory = xcdat::memory_in_bytes(trie);
REQUIRE_EQ(memory, xcdat::save(trie, tmp_filepath));
{
const auto loaded = xcdat::load<trie_type>(tmp_filepath);
REQUIRE_EQ(trie.bin_mode(), loaded.bin_mode());
REQUIRE_EQ(trie.num_keys(), loaded.num_keys());
REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size());
REQUIRE_EQ(trie.max_length(), loaded.max_length());
REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded));
test_basic_operations(loaded, keys, others);
}
{
mm::file_source<char> fin(tmp_filepath, mm::advice::sequential);
const auto mapped = xcdat::mmap<trie_type>(fin.data());
REQUIRE_EQ(trie.bin_mode(), mapped.bin_mode());
REQUIRE_EQ(trie.num_keys(), mapped.num_keys());
REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size());
REQUIRE_EQ(trie.max_length(), mapped.max_length());
REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped));
test_basic_operations(mapped, keys, others);
}
std::remove(tmp_filepath);
}
// Small hand-written dictionary: checks the basic operations, the exact output order of the
// prefix, predictive and enumerative iterators, and the save/load round trip.
TEST_CASE("Test trie_type (tiny)") {
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Air", "MacBook_Pro",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
    };
    std::vector<std::string> others = {
        "Google_Pixel", "iPad_mini", "iPadOS", "iPod", "ThinkPad",
    };

    trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);

    // Consumes `itr` and requires it to yield exactly `expected`, in order, then stop.
    const auto expect_sequence = [&trie](auto&& itr, const std::vector<std::string>& expected) {
        for (const auto& exp : expected) {
            REQUIRE(itr.next());
            REQUIRE_EQ(itr.decoded(), exp);
            REQUIRE_EQ(itr.id(), trie.lookup(exp));
        }
        REQUIRE_FALSE(itr.next());
    };

    expect_sequence(trie.make_prefix_iterator("MacBook_Pro"), {"Mac", "MacBook", "MacBook_Pro"});
    expect_sequence(trie.make_predictive_iterator("MacBook"), {"MacBook", "MacBook_Air", "MacBook_Pro"});
    expect_sequence(trie.make_enumerative_iterator(), keys);

    test_io(trie, keys, others);
}
// Keys loaded from "keys.txt": part of them is extracted as unregistered queries, the rest is
// indexed and run through every check.
TEST_CASE("Test trie_type (real)") {
    auto loaded = xcdat::load_strings("keys.txt");
    auto keys = xcdat::test::to_unique_vec(std::move(loaded));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
// 10000 random keys of length 1..30 over 'A'..'B' (deduplicated); expects non-binary mode.
TEST_CASE("Test trie_type (random 10K, A--B)") {
    auto generated = xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B');
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
// 10000 random keys of length 1..30 over 'A'..'Z' (deduplicated); expects non-binary mode.
TEST_CASE("Test trie_type (random 10K, A--Z)") {
    auto generated = xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z');
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
// 10000 random keys of length 1..30 with characters in INT8_MIN..INT8_MAX (deduplicated);
// expects the trie to report binary mode.
TEST_CASE("Test trie_type (random 10K, 0x00--0xFF)") {
    auto generated = xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX);
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
#ifdef NDEBUG
// 100000 random keys of length 1..30 over 'A'..'B' (deduplicated); expects non-binary mode.
TEST_CASE("Test trie_type (random 100K, A--B)") {
    auto generated = xcdat::test::make_random_keys(100000, 1, 30, 'A', 'B');
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
// 100000 random keys of length 1..30 over 'A'..'Z' (deduplicated); expects non-binary mode.
TEST_CASE("Test trie_type (random 100K, A--Z)") {
    auto generated = xcdat::test::make_random_keys(100000, 1, 30, 'A', 'Z');
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
// 100000 random keys of length 1..30 with characters in INT8_MIN..INT8_MAX (deduplicated);
// expects the trie to report binary mode.
TEST_CASE("Test trie_type (random 100K, 0x00--0xFF)") {
    auto generated = xcdat::test::make_random_keys(100000, 1, 30, INT8_MIN, INT8_MAX);
    auto keys = xcdat::test::to_unique_vec(std::move(generated));
    auto others = xcdat::test::extract_keys(keys);  // removes the extracted keys from `keys`

    const trie_type trie(keys);
    REQUIRE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, others);
    test_predictive_search(trie, keys, others);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
#endif

14
tools/CMakeLists.txt Normal file
View file

@ -0,0 +1,14 @@
# Command-line tools: each entry names both the executable target and its <name>.cpp source.
set(XCDAT_FILES
    xcdat_build
    xcdat_lookup
    xcdat_decode
    xcdat_prefix_search
    xcdat_predictive_search
    xcdat_enumerate
    xcdat_benchmark
)

# Build every tool and install its binary into <prefix>/bin.
foreach(TOOL_NAME IN LISTS XCDAT_FILES)
    add_executable(${TOOL_NAME} ${TOOL_NAME}.cpp)
    install(TARGETS ${TOOL_NAME} RUNTIME DESTINATION bin)
endforeach()

View file

@ -0,0 +1,158 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
namespace cmd_line_parser {

// Minimal command-line parser.
//
// Arguments registered with the two-parameter add() are required and positional, in
// registration order. Arguments registered with a shorthand are optional: boolean ones are
// flags (present => "true"), non-boolean ones consume the following token as their value.
// Values are stored as strings and converted on demand by get<T>() / parse<T>().
struct parser {
    inline static const std::string empty = "";

    parser(int argc, char** argv) : m_argc(argc), m_argv(argv), m_required(0) {}

    struct cmd {
        std::string shorthand, value, descr;
        bool is_boolean;
    };

    // Parses the argument vector given to the constructor.
    // Returns false (after printing the help text) when too few arguments are given, when
    // -h/--help is present, when an optional shorthand is unknown, or when a non-boolean
    // option is missing its value.
    bool parse() {
        if (size_t(m_argc - 1) < m_required) return abort();
        size_t k = 0;
        for (int i = 1; i != m_argc; ++i, ++k) {
            std::string parsed(m_argv[i]);
            if (parsed == "-h" or parsed == "--help") return abort();
            // The first m_required tokens are positional; everything after is looked up by shorthand.
            size_t id = k;
            bool is_optional = id >= m_required;
            if (is_optional) {
                auto it = m_shorthands.find(parsed);
                if (it == m_shorthands.end()) {
                    std::cerr << "== error: shorthand '" + parsed + "' not found" << std::endl;
                    return abort();
                }
                id = (*it).second;
            }
            assert(id < m_names.size());
            auto const& name = m_names[id];
            auto& c = m_cmds[name];
            if (is_optional) {
                if (c.is_boolean) {
                    parsed = "true";
                } else {
                    // A non-boolean option takes the next token as its value.
                    ++i;
                    if (i == m_argc) return abort();
                    parsed = m_argv[i];
                }
            }
            c.value = parsed;
        }
        return true;
    }

    // Prints the usage line followed by one description per registered argument to std::cerr.
    void help() const {
        std::cerr << "Usage: \e[1m" << m_argv[0] << "\e[0m [-h,--help]";
        auto print = [this](bool with_description) {
            for (size_t i = 0; i != m_names.size(); ++i) {
                auto const& c = m_cmds.at(m_names[i]);
                bool is_optional = i >= m_required;
                if (is_optional) std::cerr << " [\e[1m" << c.shorthand << "\e[0m";
                if (!c.is_boolean) std::cerr << " \e[4m" << m_names[i] << "\e[0m";
                if (is_optional) std::cerr << "]";
                if (with_description) std::cerr << "\n\t" << c.descr << "\n";
            }
        };
        print(false);
        std::cerr << "\n\n";
        print(true);
        std::cerr << " [-h,--help]\n\tPrint this help text and silently exits." << std::endl;
    }

    // Registers a required positional argument. Returns false if `name` is already registered.
    bool add(std::string const& name, std::string const& descr) {
        bool ret = m_cmds.emplace(name, cmd{empty, empty, descr, false}).second;
        if (ret) {
            m_names.push_back(name);
            m_required += 1;
        }
        return ret;
    }

    // Registers an optional argument selected by `shorthand`. Boolean options start as "false";
    // non-boolean ones start empty. Returns false if `name` is already registered.
    bool add(std::string const& name, std::string const& descr, std::string const& shorthand, bool is_boolean = true) {
        bool ret = m_cmds.emplace(name, cmd{shorthand, is_boolean ? "false" : empty, descr, is_boolean}).second;
        if (ret) {
            m_names.push_back(name);
            m_shorthands.emplace(shorthand, m_names.size() - 1);
        }
        return ret;
    }

    // Returns the value of `name` converted to T.
    // Throws std::runtime_error if `name` was never registered.
    template <typename T>
    T get(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end()) {
            throw std::runtime_error("error: '" + name + "' not found");
        }
        auto const& value = (*it).second.value;
        return parse<T>(value);
    }

    // added by Kampersanda
    // Returns the value of `name` converted to T, or `default_value` when `name` is unknown or
    // has an empty value.
    template <typename T>
    T get(std::string const& name, const T& default_value) const {
        return parsed(name) ? get<T>(name) : default_value;
    }

    // True when `name` is registered and currently holds a non-empty value.
    bool parsed(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end() or (*it).second.value == empty) return false;
        return true;
    }

    // Converts `value` to T. Supported: std::string, character types, the standard integer
    // types, floating-point types and bool ("true"/"false" or a number).
    template <typename T>
    T parse(std::string const& value) const {
        if constexpr (std::is_same<T, std::string>::value) {
            return value;
        } else if constexpr (std::is_same<T, char>::value or std::is_same<T, signed char>::value or
                             std::is_same<T, unsigned char>::value) {
            // front() on an empty string is undefined; an empty value maps to the null character.
            if (value.empty()) return T{};
            return static_cast<T>(value.front());
        } else if constexpr (std::is_same<T, unsigned int>::value or std::is_same<T, unsigned short int>::value or
                             std::is_same<T, unsigned long int>::value or
                             std::is_same<T, unsigned long long int>::value) {
            // Unsigned targets are parsed as unsigned so that values above the signed maximum
            // (e.g. a full 64-bit count) are not lost.
            return static_cast<T>(std::strtoull(value.c_str(), nullptr, 10));
        } else if constexpr (std::is_same<T, int>::value or std::is_same<T, short int>::value) {
            return static_cast<T>(std::atoi(value.c_str()));
        } else if constexpr (std::is_same<T, long int>::value or std::is_same<T, long long int>::value) {
            return static_cast<T>(std::atoll(value.c_str()));
        } else if constexpr (std::is_same<T, float>::value or std::is_same<T, double>::value or
                             std::is_same<T, long double>::value) {
            return std::atof(value.c_str());
        } else if constexpr (std::is_same<T, bool>::value) {
            std::istringstream stream(value);
            // Initialised because extraction from an empty stream leaves the target untouched.
            bool ret = false;
            if (value == "true" or value == "false") {
                stream >> std::boolalpha >> ret;
            } else {
                stream >> std::noboolalpha >> ret;
            }
            return ret;
        }
        assert(false);
        __builtin_unreachable();
    }

  private:
    int m_argc;
    char** m_argv;
    size_t m_required;  // number of required (positional) arguments
    std::unordered_map<std::string, cmd> m_cmds;        // argument name -> definition and value
    std::unordered_map<std::string, int> m_shorthands;  // shorthand -> index into m_names
    std::vector<std::string> m_names;                   // argument names in registration order

    // Prints the help text and reports failure.
    bool abort() const {
        help();
        return false;
    }
};

}  // namespace cmd_line_parser

File diff suppressed because it is too large Load diff

148
tools/xcdat_benchmark.cpp Normal file
View file

@ -0,0 +1,148 @@
#include <chrono>
#include <random>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Number of timed passes over the query set in each search benchmark.
static constexpr int num_trials = 10;
// Declares the benchmark's command line: one required input path plus three optional,
// value-taking options (-n, -s, -b).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);

    // Required positional argument.
    cli.add("input_keys", "Input filepath of keywords");

    // Optional arguments; `false` marks them as taking a value rather than being flags.
    cli.add("num_samples", "Number of sample keys for searches (default=1000)", "-n", false);
    cli.add("random_seed", "Random seed for sampling (default=13)", "-s", false);
    cli.add("binary_mode", "Is binary mode? (default=0)", "-b", false);

    return cli;
}
// Draws `num_samples` keys uniformly at random (with replacement) from `keys`, using a generator
// seeded with `random_seed`. The returned views refer to the strings owned by `keys`.
// `keys` must not be empty.
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
                                          std::uint64_t random_seed) {
    std::mt19937_64 engine(random_seed);
    std::uniform_int_distribution<std::uint64_t> index_dist(0, keys.size() - 1);

    std::vector<std::string_view> sampled_keys;
    sampled_keys.reserve(num_samples);
    for (std::uint64_t drawn = 0; drawn != num_samples; ++drawn) {
        const std::string& picked = keys[index_dist(engine)];
        sampled_keys.emplace_back(picked);
    }
    return sampled_keys;
}
// Looks up every key in `trie` and returns the resulting ids in the same order.
// Every key is expected to be present: value() is called on each lookup result.
template <class Trie>
std::vector<std::uint64_t> extract_ids(const Trie& trie, const std::vector<std::string_view>& keys) {
    std::vector<std::uint64_t> sampled_ids;
    sampled_ids.reserve(keys.size());
    for (const auto& key : keys) {
        sampled_ids.push_back(trie.lookup(key).value());
    }
    return sampled_ids;
}
// Builds a Trie from `keys`, prints the key count, memory usage and construction time, and
// returns the trie.
template <class Trie>
Trie benchmark_build(const std::vector<std::string>& keys, bool binary_mode) {
    const auto start_tp = std::chrono::high_resolution_clock::now();
    Trie trie(keys, binary_mode);
    const auto stop_tp = std::chrono::high_resolution_clock::now();

    const auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_tp - start_tp);
    const double time_in_sec = dur_ms.count() / 1000.0;

    // Kept in the integer type returned by memory_in_bytes(): passing a double to "%d" makes
    // tinyformat stream it as a floating-point value, so large byte counts were printed in
    // scientific notation.
    const auto memory_in_bytes = xcdat::memory_in_bytes(trie);

    tfm::printfln("Number of keys: %d", trie.num_keys());
    tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
    tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));
    tfm::printfln("Construction time in seconds: %g", time_in_sec);

    return trie;
}
// Measures the average lookup time per query: one untimed warm-up pass over `queries`,
// then `num_trials` timed passes. Every query is expected to be present in `trie`.
template <class Trie>
void benchmark_lookup(const Trie& trie, const std::vector<std::string_view>& queries) {
    // Results are accumulated into a volatile so the lookups cannot be optimised away.
    volatile std::uint64_t tmp = 0;
    const auto run_pass = [&]() {
        for (const auto& query : queries) {
            tmp += trie.lookup(query).value();
        }
    };

    // Warmup
    run_pass();

    // Measure
    using clock = std::chrono::high_resolution_clock;
    const auto start_tp = clock::now();
    for (int trial = 0; trial < num_trials; trial++) {
        run_pass();
    }
    const auto stop_tp = clock::now();

    const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
    const auto elapsed_us = static_cast<double>(dur_us.count());
    tfm::printfln("Lookup time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
}
// Measures the average decode time per query: one untimed warm-up pass over `queries`,
// then `num_trials` timed passes.
template <class Trie>
void benchmark_decode(const Trie& trie, const std::vector<std::uint64_t>& queries) {
    // Decoded lengths are accumulated into a volatile so the decodes cannot be optimised away.
    volatile std::uint64_t tmp = 0;
    const auto run_pass = [&]() {
        for (const std::uint64_t query : queries) {
            tmp += trie.decode(query).size();
        }
    };

    // Warmup
    run_pass();

    // Measure
    using clock = std::chrono::high_resolution_clock;
    const auto start_tp = clock::now();
    for (int trial = 0; trial < num_trials; trial++) {
        run_pass();
    }
    const auto stop_tp = clock::now();

    const auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_tp - start_tp);
    const auto elapsed_us = static_cast<double>(dur_us.count());
    tfm::printfln("Decode time in microsec/query: %g", elapsed_us / (num_trials * queries.size()));
}
// Runs the full benchmark for one trie type: construction, then lookup and decode over the
// sampled queries.
// `keys` is taken by const reference: it is only read, and passing it by value deep-copied the
// whole key set on every call.
template <class Trie>
void benchmark(const std::vector<std::string>& keys, const std::vector<std::string_view>& query_keys,
               bool binary_mode) {
    const auto trie = benchmark_build<Trie>(keys, binary_mode);
    // Decode queries are the ids of the same sampled keys used for lookup.
    const auto query_ids = extract_ids(trie, query_keys);

    benchmark_lookup(trie, query_keys);
    benchmark_decode(trie, query_ids);
}
// Entry point of the benchmark tool: loads and deduplicates the keys, samples query keys, and
// benchmarks both trie types. Returns 1 on a command-line error or an empty dataset.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_keys = cli.get<std::string>("input_keys");
    const auto num_samples = cli.get<std::uint64_t>("num_samples", 1000);
    const auto random_seed = cli.get<std::uint64_t>("random_seed", 13);
    const auto binary_mode = cli.get<bool>("binary_mode", false);

    auto keys = xcdat::load_strings(input_keys);
    if (keys.empty()) {
        tfm::errorfln("Error: The input dataset is empty.");
        return 1;
    }

    // The keys are sorted and deduplicated before they are passed to the trie constructors.
    std::sort(keys.begin(), keys.end());
    const auto dup_begin = std::unique(keys.begin(), keys.end());
    keys.erase(dup_begin, keys.end());

    // The sampled views point into `keys`, which stays alive until the end of main.
    const auto query_keys = sample_keys(keys, num_samples, random_seed);

    tfm::printfln("** xcdat::trie_7_type **");
    benchmark<xcdat::trie_7_type>(keys, query_keys, binary_mode);

    tfm::printfln("** xcdat::trie_8_type **");
    benchmark<xcdat::trie_8_type>(keys, query_keys, binary_mode);

    return 0;
}

67
tools/xcdat_build.cpp Normal file
View file

@ -0,0 +1,67 @@
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the build tool's command line: input and output paths (required) plus two optional,
// value-taking options (-t, -b).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);

    // Required positional arguments.
    cli.add("input_keys", "Input filepath of keywords");
    cli.add("output_dic", "Output filepath of trie dictionary");

    // Optional arguments; `false` marks them as taking a value rather than being flags.
    cli.add("trie_type", "Trie type: [7|8] (default=7)", "-t", false);
    cli.add("binary_mode", "Is binary mode? (default=0)", "-b", false);

    return cli;
}
// Builds a Trie from the keys in "input_keys", prints its statistics and saves it to
// "output_dic". Returns 0 on success and 1 when the input dataset is empty.
template <class Trie>
int build(const cmd_line_parser::parser& p) {
    const auto input_keys = p.get<std::string>("input_keys");
    const auto output_dic = p.get<std::string>("output_dic");
    const auto binary_mode = p.get<bool>("binary_mode", false);

    auto keys = xcdat::load_strings(input_keys);
    if (keys.empty()) {
        tfm::errorfln("Error: The input dataset is empty.");
        // Stop here instead of building and saving a dictionary from no keys.
        return 1;
    }

    // The keys are sorted and deduplicated before they are passed to the trie constructor.
    std::sort(keys.begin(), keys.end());
    keys.erase(std::unique(keys.begin(), keys.end()), keys.end());

    const Trie trie(keys, binary_mode);

    // Kept in the integer type returned by memory_in_bytes(): passing a double to "%d" makes
    // tinyformat stream it as a floating-point value, so large byte counts were printed in
    // scientific notation.
    const auto memory_in_bytes = xcdat::memory_in_bytes(trie);

    tfm::printfln("Number of keys: %d", trie.num_keys());
    tfm::printfln("Number of trie nodes: %d", trie.num_nodes());
    tfm::printfln("Number of DA units: %d", trie.num_units());
    tfm::printfln("Memory usage in bytes: %d", memory_in_bytes);
    tfm::printfln("Memory usage in MiB: %g", memory_in_bytes / (1024.0 * 1024.0));

    xcdat::save(trie, output_dic);
    return 0;
}
// Entry point of the build tool: dispatches to build<> for the requested trie type (7 or 8).
// Any other type prints the help text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto trie_type = cli.get<int>("trie_type", 7);
    if (trie_type == 7) {
        return build<xcdat::trie_7_type>(cli);
    }
    if (trie_type == 8) {
        return build<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}

53
tools/xcdat_decode.cpp Normal file
View file

@ -0,0 +1,53 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the decode tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
// Memory-maps the dictionary given by "input_dic" and, for every id read from standard input,
// prints "<id>\t<decoded key>".
template <class Trie>
int decode(const cmd_line_parser::parser& p) {
    const auto input_dic = p.get<std::string>("input_dic");

    const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
    const auto trie = xcdat::mmap<Trie>(fin.data());

    std::uint64_t id;
    while (std::cin >> id) {
        const auto dec = trie.decode(id);
        tfm::printfln("%d\t%s", id, dec);
    }
    return 0;
}
// Entry point of the decode tool: reads the flag stored in the dictionary file and dispatches
// to decode<> for the matching trie type. An unknown flag prints the help text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_dic = cli.get<std::string>("input_dic");
    const auto flag = xcdat::get_flag(input_dic);
    if (flag == 7) {
        return decode<xcdat::trie_7_type>(cli);
    }
    if (flag == 8) {
        return decode<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}

50
tools/xcdat_enumerate.cpp Normal file
View file

@ -0,0 +1,50 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the enumerate tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
// Memory-maps the dictionary given by "input_dic" and prints "<id>\t<key>" for every entry
// reported by the trie's enumerate().
template <class Trie>
int enumerate(const cmd_line_parser::parser& p) {
    const auto input_dic = p.get<std::string>("input_dic");

    const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
    const auto trie = xcdat::mmap<Trie>(fin.data());

    const auto print_entry = [&](std::uint64_t id, std::string_view str) {
        tfm::printfln("%d\t%s", id, str);
    };
    trie.enumerate(print_entry);

    return 0;
}
// Entry point of the enumerate tool: reads the flag stored in the dictionary file and dispatches
// to enumerate<> for the matching trie type. An unknown flag prints the help text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_dic = cli.get<std::string>("input_dic");
    const auto flag = xcdat::get_flag(input_dic);
    if (flag == 7) {
        return enumerate<xcdat::trie_7_type>(cli);
    }
    if (flag == 8) {
        return enumerate<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}

57
tools/xcdat_lookup.cpp Normal file
View file

@ -0,0 +1,57 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the lookup tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
// Memory-maps the dictionary given by "input_dic" and, for every line read from standard input,
// prints "<id>\t<line>" when the line is a registered key and "-1\t<line>" otherwise.
template <class Trie>
int lookup(const cmd_line_parser::parser& p) {
    const auto input_dic = p.get<std::string>("input_dic");

    const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
    const auto trie = xcdat::mmap<Trie>(fin.data());

    std::string line;
    while (std::getline(std::cin, line)) {
        const auto id = trie.lookup(line);
        if (!id.has_value()) {
            tfm::printfln("-1\t%s", line);
            continue;
        }
        tfm::printfln("%d\t%s", id.value(), line);
    }
    return 0;
}
// Entry point of the lookup tool: reads the flag stored in the dictionary file and dispatches
// to lookup<> for the matching trie type. An unknown flag prints the help text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_dic = cli.get<std::string>("input_dic");
    const auto flag = xcdat::get_flag(input_dic);
    if (flag == 7) {
        return lookup<xcdat::trie_7_type>(cli);
    }
    if (flag == 8) {
        return lookup<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}

View file

@ -0,0 +1,70 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the predictive-search tool's command line: a required dictionary path plus an
// optional, value-taking result limit (-n).
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    cli.add("max_num_results", "The max number of results (default=10)", "-n", false);
    return cli;
}
// Memory-maps the dictionary given by "input_dic" and, for every line read from standard input,
// runs a predictive search, prints "<count> found" and then at most "max_num_results" lines of
// "<id>\t<key>".
//
// Only the results that will be printed are copied and kept; the remaining matches are merely
// counted, so memory use no longer grows with the total number of matches.
template <class Trie>
int predictive_search(const cmd_line_parser::parser& p) {
    const auto input_dic = p.get<std::string>("input_dic");
    const auto max_num_results = p.get<std::uint64_t>("max_num_results", 10);

    const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
    const auto trie = xcdat::mmap<Trie>(fin.data());

    struct result_type {
        std::uint64_t id;
        std::string str;
    };

    std::vector<result_type> results;
    results.reserve(std::min<std::uint64_t>(max_num_results, 1ULL << 10));

    for (std::string key; std::getline(std::cin, key);) {
        results.clear();
        std::uint64_t num_found = 0;

        trie.predictive_search(key, [&](std::uint64_t id, std::string_view str) {
            // The view is copied because the results are printed after the search has finished.
            if (results.size() < max_num_results) {
                results.push_back({id, std::string(str)});
            }
            num_found += 1;
        });

        tfm::printfln("%d found", num_found);
        for (const auto& r : results) {
            tfm::printfln("%d\t%s", r.id, r.str);
        }
    }
    return 0;
}
// Entry point of the predictive-search tool: reads the flag stored in the dictionary file and
// dispatches to predictive_search<> for the matching trie type. An unknown flag prints the help
// text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_dic = cli.get<std::string>("input_dic");
    const auto flag = xcdat::get_flag(input_dic);
    if (flag == 7) {
        return predictive_search<xcdat::trie_7_type>(cli);
    }
    if (flag == 8) {
        return predictive_search<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}

View file

@ -0,0 +1,66 @@
#include <mm_file/mm_file.hpp>
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
// Declares the prefix-search tool's command line: a single required dictionary path.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);
    cli.add("input_dic", "Input filepath of trie dictionary");
    return cli;
}
// Memory-maps the dictionary given by "input_dic" and, for every line read from standard input,
// runs a prefix search, prints "<count> found" and then one "<id>\t<key>" line per result.
//
// Results own their strings: they are printed only after the search has finished, and the
// lifetime of the view handed to the callback is not visible here.
// NOTE(review): if xcdat guarantees the view stays valid until the query string changes, the
// copy is merely defensive — confirm against the prefix_search contract.
template <class Trie>
int prefix_search(const cmd_line_parser::parser& p) {
    const auto input_dic = p.get<std::string>("input_dic");

    const mm::file_source<char> fin(input_dic.c_str(), mm::advice::sequential);
    const auto trie = xcdat::mmap<Trie>(fin.data());

    struct result_type {
        std::uint64_t id;
        std::string str;
    };

    std::vector<result_type> results;
    results.reserve(trie.max_length());

    for (std::string key; std::getline(std::cin, key);) {
        results.clear();

        trie.prefix_search(key, [&](std::uint64_t id, std::string_view str) {
            results.push_back({id, std::string(str)});
        });

        tfm::printfln("%d found", results.size());
        for (const auto& r : results) {
            tfm::printfln("%d\t%s", r.id, r.str);
        }
    }
    return 0;
}
// Entry point of the prefix-search tool: reads the flag stored in the dictionary file and
// dispatches to prefix_search<> for the matching trie type. An unknown flag prints the help
// text and returns 1.
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif
    std::ios::sync_with_stdio(false);

    auto cli = make_parser(argc, argv);
    if (!cli.parse()) {
        return 1;
    }

    const auto input_dic = cli.get<std::string>("input_dic");
    const auto flag = xcdat::get_flag(input_dic);
    if (flag == 7) {
        return prefix_search<xcdat::trie_7_type>(cli);
    }
    if (flag == 8) {
        return prefix_search<xcdat::trie_8_type>(cli);
    }

    cli.help();
    return 1;
}