diff --git a/.gitignore b/.gitignore index 4581ef2..7636d73 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,9 @@ *.exe *.out *.app + +# My Definition +build/ +cmake-build-debug/ +.idea/ +.DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..07893e3 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 2.8) +project(XCDAT) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=c++11") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +message(STATUS "BUILD_TYPE is ${CMAKE_BUILD_TYPE}") + +enable_testing() + +add_subdirectory(src) diff --git a/README.md b/README.md index 8eaa1c0..2698666 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,41 @@ -# xcdat -Xor-Compressed Double-Array Trie +# Xcdat: xor-compressed double-array trie + +Xcdat is a C++ library that implements a static dictionary structure based on a double-array trie. The double-array is known as the fastest trie representation and has been used in many trie libraries. On the other hand, it has a problem for space efficiency because of a pointer-based data structure. Xcdat solves the problem by applying the XOR-compressed double-array (XCDA) described on + +- S. Kanda, K. Morita, and M. Fuketa, "Compressed double-array tries for string dictionaries supporting fast lookup", _Knowledge and Information Systems_, Online first. [[pdf](https://sites.google.com/site/shnskknd/kais2016.pdf)] + +Therefore, Xcdat can implement a trie dictionary in smaller space than the other double-array libraries. +In addition, the lookup speed is fairly fast in compressed data structures from the double-array advantage. + +## Features + +- **Compressed data structure**. Xcdat practically compresses double-array elements representing node pointers by using XCDA methods. 
While the original double-array uses 8 bytes per node, Xcdat uses about 3 to 4 bytes per node (depending on the dataset). In addition, the dictionary is implemented by using a minimal-prefix trie unifying suffix strings. The structure is effective for long strings in time and space. +- **Two compression versions**. XCDA includes two versions for compressing elements: using byte-oriented directly addressable codes (DACs) and using pointer-based ones. For characterless strings such as natural language keywords, the former will be slightly smaller and the latter will be slightly faster. For long strings such as URLs, the latter will outperform the former. Xcdat implements the two versions by using static polymorphism with C++ templates to avoid the overhead of virtual functions. +- **Dictionary coding**. Xcdat supports mapping N different strings to unique IDs in [0,N). That is to say, it supports two basic dictionary operations: Lookup returns the ID corresponding to a given string and Access (also called ReverseLookup) returns the string corresponding to a given ID. Therefore, Xcdat is very useful in many applications for string processing and indexing. +- **Prefix-based lookup operations**. As with other trie libraries, Xcdat also provides prefix-based lookup operations, which are useful in natural language processing and so on. +- **Fast operations**. The lookup speed of Xcdat is faster than that of other compressed trie libraries because it is based on the double-array trie. On the other hand, compared to the existing double-array libraries, the speed is slower because of the compression. + +## Build instructions + +You can download and compile the library with the following commands: + +``` +$ git clone https://github.com/kamp78/xcdat.git +$ cd xcdat +$ mkdir build +$ cd build +$ cmake .. +$ make +``` + +## Todo + +- Creating API documents +- Showing benchmarks +- Supporting faster operations +- Supporting 64-bit integers + +## Acknowledgements + +I would like to thank Dr. 
Yata, a creator of sophisticated software such as Darts-clone and Marisa-trie, for kindly giving useful comments to a previous version of the library. + diff --git a/src/BitVector.cpp b/src/BitVector.cpp new file mode 100644 index 0000000..7882b6d --- /dev/null +++ b/src/BitVector.cpp @@ -0,0 +1,319 @@ +#include "BitVector.hpp" + +namespace xcdat { + +// inspired by marisa-trie +constexpr uint8_t kSelectTable[9][256] = { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + 8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 5, 1, 2, 
1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1 + }, + { + 8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2, + 8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, + 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2, + 7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, + 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 8, 8, 8, 2, 8, 3, 3, 2, 8, 4, 4, 2, 4, 3, 3, 2, + 8, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 8, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, + 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 8, 7, 7, 2, 7, 3, 3, 2, 7, 4, 4, 2, 4, 3, 3, 2, + 7, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2, + 7, 6, 6, 2, 6, 3, 3, 2, 6, 4, 4, 2, 4, 3, 3, 2, + 6, 5, 5, 2, 5, 3, 3, 2, 5, 4, 4, 2, 4, 3, 3, 2 + }, + { + 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3, + 8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3, + 8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3, + 8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3, + 8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3, + 7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 4, 8, 4, 4, 3, + 8, 8, 8, 5, 8, 5, 5, 3, 8, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 6, 8, 6, 6, 3, 8, 6, 6, 4, 6, 4, 4, 3, + 8, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 7, 8, 7, 7, 3, 8, 7, 7, 4, 7, 4, 4, 3, + 8, 7, 7, 5, 7, 5, 5, 3, 7, 5, 5, 4, 5, 4, 4, 3, + 8, 7, 7, 6, 7, 6, 6, 3, 7, 6, 6, 4, 6, 4, 4, 3, + 7, 6, 6, 5, 6, 5, 5, 3, 6, 5, 5, 4, 5, 4, 4, 3 + }, + { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, + 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, + 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, + 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, + 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, + 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 
6, 7, 6, 6, 4, + 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, + 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, + 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, + 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, + 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, + 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, + 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4 + }, + { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, + 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, + 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5 + }, + { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 
// Returns the number of 1 bits in `bits` (0..32).
//
// Classic parallel bit count: adjacent fields are summed pairwise, doubling
// the field width at every step (1 -> 2 -> 4 -> 8 bits), then the four byte
// sums are folded together.
uint32_t pop_count(uint32_t bits) {
  // 16 two-bit fields, each holding the count of one bit pair.
  const uint32_t pairs = (bits & 0x55555555U) + ((bits >> 1) & 0x55555555U);
  // 8 four-bit fields, each holding the count of one nibble.
  const uint32_t nibbles = (pairs & 0x33333333U) + ((pairs >> 2) & 0x33333333U);
  // 4 byte-wide fields; a nibble count is at most 4, so the sum cannot carry.
  const uint32_t bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0FU;
  // Fold the byte sums into the low byte; upper bytes now hold garbage.
  const uint32_t halves = bytes + (bytes >> 8);
  const uint32_t total = halves + (halves >> 16);
  // The result is at most 32, so six bits are enough.
  return total & 0x3FU;
}
+BitVector::BitVector(BitVectorBuilder& builder, bool select_flag) { + if (!builder.size()) { + return; + } + + builder.bits_.shrink_to_fit(); + bits_ = std::move(builder.bits_); + + size_ = builder.size_; + rank_tips_.resize(size_ / kBitsInR1 + 1); + + num_1s_ = 0; + + // builds rank_tips_ + for (uint32_t i = 0; i < rank_tips_.size(); ++i) { + auto& tip = rank_tips_[i]; + tip.first = static_cast(num_1s_); + + for (uint32_t offset = 0; offset < kR1PerR2; ++offset) { + tip.second[offset] = static_cast(num_1s_ - tip.first); + auto pos_in_bits = i * kR1PerR2 + offset; + + if (pos_in_bits < bits_.size()) { + num_1s_ += pop_count(bits_[pos_in_bits]); + } + } + } + + if (select_flag) { + return; + } + + // builds select_tips_ + select_tips_.push_back(0); + auto count = kNum1sPerTip; + + for (uint32_t i = 0; i < rank_tips_.size(); ++i) { + if (count < rank_tips_[i].first) { + select_tips_.push_back(i - 1); + count += kNum1sPerTip; + } + } + select_tips_.push_back(static_cast(rank_tips_.size() - 1)); + select_tips_.shrink_to_fit(); +} + +uint32_t BitVector::rank(uint32_t i) const { + auto& hint = rank_tips_[i / kBitsInR1]; + return hint.first + hint.second[i / kBitsInR2 % kR1PerR2] + + pop_count(bits_[i / 32] & ((1U << (i % 32)) - 1)); +} + +uint32_t BitVector::select(uint32_t i) const { + uint32_t left = 0, right = static_cast(rank_tips_.size()); + + if (!select_tips_.empty()) { + auto select_tip_id = i / kNum1sPerTip; + left = select_tips_[select_tip_id]; + right = select_tips_[select_tip_id + 1] + 1; + } + + while (left + 1 < right) { + const auto center = (left + right) / 2; + if (i < rank_tips_[center].first) { + right = center; + } else { + left = center; + } + } + + i += 1; // for i+1 th + i -= rank_tips_[left].first; + + uint32_t offset = 1; + for (; offset < kR1PerR2; ++offset) { + if (i <= rank_tips_[left].second[offset]) { + break; + } + } + i -= rank_tips_[left].second[--offset]; + + auto ret = (left * kBitsInR1) + (offset * kBitsInR2); + auto bits = bits_[ret 
/ 32]; + + { + auto _count = pop_count(bits % 65536); + if (_count < i) { + bits >>= 16; + ret += 16; + i -= _count; + } + } + { + auto _count = pop_count(bits % 256); + if (_count < i) { + bits >>= 8; + ret += 8; + i -= _count; + } + } + + ret += kSelectTable[i][bits % 256]; + return ret - 1; +} + +size_t BitVector::size_in_bytes() const { + size_t ret = 0; + ret += util::size_in_bytes(bits_); + ret += util::size_in_bytes(rank_tips_); + ret += util::size_in_bytes(select_tips_); + ret += sizeof(size_); + ret += sizeof(num_1s_); + return ret; +} + +void BitVector::write(std::ostream& os) const { + util::write_vector(bits_, os); + util::write_vector(rank_tips_, os); + util::write_vector(select_tips_, os); + util::write_value(size_, os); + util::write_value(num_1s_, os); +} + +void BitVector::read(std::istream& is) { + util::read_vector(bits_, is); + util::read_vector(rank_tips_, is); + util::read_vector(select_tips_, is); + util::read_value(size_, is); + util::read_value(num_1s_, is); +} + +void BitVector::swap(BitVector& rhs) { + bits_.swap(rhs.bits_); + rank_tips_.swap(rhs.rank_tips_); + select_tips_.swap(rhs.select_tips_); + std::swap(size_, rhs.size_); + std::swap(num_1s_, rhs.num_1s_); +} + +} //namespace - xcdat diff --git a/src/BitVector.hpp b/src/BitVector.hpp new file mode 100644 index 0000000..7a29008 --- /dev/null +++ b/src/BitVector.hpp @@ -0,0 +1,57 @@ +#ifndef XCDAT_BIT_VECTOR_HPP_ +#define XCDAT_BIT_VECTOR_HPP_ + +#include + +#include "BitVectorBuilder.hpp" + +namespace xcdat { + +class BitVector { +public: + BitVector() {} + BitVector(BitVectorBuilder& builder, bool select_flag = false); // builder.bits_ is stolen. 
+ + ~BitVector() {} + + bool operator[](uint32_t i) const { + return (bits_[i / 32] & (1U << (i % 32))) != 0; + } + + // # of 1s in B[0,i) + uint32_t rank(uint32_t i) const; + // position of the i+1 th occurrence + uint32_t select(uint32_t i) const; + + size_t num_1s() const { return num_1s_; } + size_t num_0s() const { return size_ - num_1s_; } + + size_t size() const { return size_; } // # of bits + size_t size_in_bytes() const; + + void write(std::ostream &os) const; + void read(std::istream &is); + + void swap(BitVector& rhs); + + BitVector(const BitVector&) = delete; + BitVector& operator=(const BitVector&) = delete; + +private: + static constexpr uint32_t kBitsInR1 = 256; + static constexpr uint32_t kBitsInR2 = 32; + static constexpr uint32_t kR1PerR2 = kBitsInR1 / kBitsInR2; + static constexpr uint32_t kNum1sPerTip = 512; + + using RankTip = std::pair>; + + std::vector bits_; + std::vector rank_tips_; + std::vector select_tips_; + size_t size_ = 0; + size_t num_1s_ = 0; +}; + +} //namespace - xcdat + +#endif //XCDAT_BIT_VECTOR_HPP_ diff --git a/src/BitVectorBuilder.hpp b/src/BitVectorBuilder.hpp new file mode 100644 index 0000000..1e859f8 --- /dev/null +++ b/src/BitVectorBuilder.hpp @@ -0,0 +1,62 @@ +// +// Created by Kampersanda on 2016/11/22. 
// Growable bit sequence used to collect bits before freezing them into a
// BitVector. Bits are packed into 32-bit words, least significant bit first.
// BitVector is a friend because its constructor takes over bits_.
class BitVectorBuilder {
public:
  friend class BitVector;

  BitVectorBuilder() {}
  // Creates a builder holding `size` bits, all 0.
  BitVectorBuilder(size_t size) { resize(size); }

  ~BitVectorBuilder() {}

  // Appends one bit at position size().
  void push_back(bool bit) {
    // Grow only when the word for the new bit does not exist yet. resize()
    // keeps one word past size / 32, so testing `size_ % 32 == 0` alone would
    // append a redundant word after resize().
    if (bits_.size() <= size_ / 32) {
      bits_.emplace_back(0);
    }
    if (bit) {
      set_bit(size_, true);
    }
    ++size_;
  }

  // Sets bit `i` to `bit`. No bounds check is performed.
  void set_bit(size_t i, bool bit) {
    if (bit) {
      bits_[i / 32] |= (1U << (i % 32));
    } else {
      bits_[i / 32] &= (~(1U << (i % 32)));
    }
  }

  // Returns bit `i`. No bounds check is performed.
  bool get_bit(size_t i) const {
    return (bits_[i / 32] & (1U << (i % 32))) != 0;
  }

  // Changes the number of bits to `size`. Newly exposed bits are 0.
  void resize(size_t size) {
    bits_.resize(size / 32 + 1, 0);
    // Clear everything at or above position `size` in the last word. After a
    // shrink those bits may still be set from the old contents, and
    // push_back(false) would never reset them.
    bits_[size / 32] &= (1U << (size % 32)) - 1U;
    size_ = size;
  }

  // Reserves storage for `capacity` bits without changing size().
  void reserve(size_t capacity) {
    bits_.reserve(capacity / 32 + 1);
  }

  // Number of bits currently held.
  size_t size() const {
    return size_;
  }

  BitVectorBuilder(const BitVectorBuilder&) = delete;
  BitVectorBuilder& operator=(const BitVectorBuilder&) = delete;

private:
  std::vector<uint32_t> bits_;
  size_t size_ = 0;
};
a/src/DacBc.cpp b/src/DacBc.cpp new file mode 100644 index 0000000..08535f5 --- /dev/null +++ b/src/DacBc.cpp @@ -0,0 +1,151 @@ +#include "DacBc.hpp" + +namespace xcdat { + +DacBc::DacBc(const std::vector& bc) { + if (bc.empty()) { + return; + } + + std::array flags; + BitVectorBuilder leaves(bc.size()); + std::vector links; + + values_[0].reserve(bc.size() * 2); + flags[0].reserve(bc.size() * 2); + links.reserve(bc.size()); + + max_level_ = 0; + + auto append = [&](uint32_t value, bool is_leaf) { + if (is_leaf) { + links.push_back(value >> 8); + values_[0].push_back(static_cast(value & 0xFF)); + flags[0].push_back(false); + return; + } + + uint8_t level = 0; + values_[level].push_back(static_cast(value & 0xFF)); + flags[level].push_back(true); + value >>= 8; + + while (value) { + ++level; + values_[level].push_back(static_cast(value & 0xFF)); + flags[level].push_back(true); + value >>= 8; + } + flags[level].set_bit(flags[level].size() - 1, false); + max_level_ = std::max(max_level_, level); + }; + + for (uint32_t i = 0; i < bc.size(); ++i) { + if (!bc[i].is_used) { + append(0, false); + append(0, false); + ++num_free_nodes_; + } else { + const auto is_leaf = bc[i].is_leaf; + leaves.set_bit(i, is_leaf); + !is_leaf ? 
append(bc[i].base ^ i, false) : append(bc[i].base, true); + append(bc[i].check ^ i, false); + } + } + + // release + for (uint8_t i = 0; i < max_level_; ++i) { + values_[i].shrink_to_fit(); + BitVector{flags[i]}.swap(flags_[i]); + } + values_[max_level_].shrink_to_fit(); + BitVector{leaves}.swap(leaves_); + SmallVector{links}.swap(links_); +} + +size_t DacBc::size_in_bytes() const { + size_t ret = 0; + for (auto& values : values_) { + ret += util::size_in_bytes(values); + } + for (auto& flags : flags_) { + ret += flags.size_in_bytes(); + } + ret += leaves_.size_in_bytes(); + ret += links_.size_in_bytes(); + ret += sizeof(max_level_); + ret += sizeof(num_free_nodes_); + return ret; +} + +void DacBc::show_stat(std::ostream& os) const { + const auto total_size = size_in_bytes(); + os << "basic statistics of xcdat::DacBc" << std::endl; + util::show_stat("\tnum links: ", links_.size(), os); + util::show_stat("\tbytes per node:", double(total_size) / num_nodes(), os); + os << "member size statistics of xcdat::DacBc" << std::endl; + util::show_stat("\tvalues_[0]:", util::size_in_bytes(values_[0]), total_size, os); + util::show_stat("\tvalues_[1]:", util::size_in_bytes(values_[1]), total_size, os); + util::show_stat("\tvalues_[2]:", util::size_in_bytes(values_[2]), total_size, os); + util::show_stat("\tvalues_[3]:", util::size_in_bytes(values_[3]), total_size, os); + util::show_stat("\tflags_[0]: ", flags_[0].size_in_bytes(), total_size, os); + util::show_stat("\tflags_[1]: ", flags_[1].size_in_bytes(), total_size, os); + util::show_stat("\tflags_[2]: ", flags_[2].size_in_bytes(), total_size, os); + util::show_stat("\tleaves_: ", leaves_.size_in_bytes(), total_size, os); + util::show_stat("\tlinks_: ", links_.size_in_bytes(), total_size, os); +} + +void DacBc::write(std::ostream& os) const { + for (auto& values : values_) { + util::write_vector(values, os); + } + for (auto& flags : flags_) { + flags.write(os); + } + leaves_.write(os); + links_.write(os); + 
util::write_value(max_level_, os); + util::write_value(num_free_nodes_, os); +} + +void DacBc::read(std::istream& is) { + for (auto& values : values_) { + util::read_vector(values, is); + } + for (auto& flags : flags_) { + flags.read(is); + } + leaves_.read(is); + links_.read(is); + util::read_value(max_level_, is); + util::read_value(num_free_nodes_, is); +} + +void DacBc::swap(DacBc& rhs) { + for (uint32_t i = 0; i < values_.size(); ++i) { + values_[i].swap(rhs.values_[i]); + } + for (uint32_t i = 0; i < flags_.size(); ++i) { + flags_[i].swap(rhs.flags_[i]); + } + leaves_.swap(rhs.leaves_); + links_.swap(rhs.links_); + std::swap(max_level_, rhs.max_level_); + std::swap(num_free_nodes_, rhs.num_free_nodes_); +} + +uint32_t DacBc::access_(uint32_t i) const { + uint8_t level = 0; + uint32_t value = values_[level][i]; + while (level < max_level_) { + if (!flags_[level][i]) { + break; + } + i = flags_[level].rank(i); + ++level; + value |= static_cast(values_[level][i]) << (level * 8); + } + return value; +} + +} //namespace - xcdat diff --git a/src/DacBc.hpp b/src/DacBc.hpp new file mode 100644 index 0000000..28bb1d2 --- /dev/null +++ b/src/DacBc.hpp @@ -0,0 +1,73 @@ +#ifndef XCDAT_DAC_BC_HPP_ +#define XCDAT_DAC_BC_HPP_ + +#include + +#include "BitVector.hpp" +#include "SmallVector.hpp" + +namespace xcdat { + +// BASE/CHECK arrays using byte-oriented DACs. 
+class DacBc { +public: + static constexpr uint32_t kFirstBits = 8; + + DacBc() {} + DacBc(const std::vector& bc); + + ~DacBc() {} + + uint32_t base(uint32_t i) const { + return access_(i * 2) ^ i; + } + uint32_t link(uint32_t i) const { + return values_[0][i * 2] | (links_[leaves_.rank(i)] << 8); + } + uint32_t check(uint32_t i) const { + return access_(i * 2 + 1) ^ i; + } + + bool is_leaf(uint32_t i) const { + return leaves_[i]; + } + bool is_used(uint32_t i) const { + return check(i) != i; + } + + size_t num_nodes() const { + return values_[0].size() / 2; + } + size_t num_used_nodes() const { + return num_nodes() - num_free_nodes_; + } + size_t num_free_nodes() const { + return num_free_nodes_; + } + + size_t size_in_bytes() const; + + void show_stat(std::ostream &os) const; + + void write(std::ostream &os) const; + void read(std::istream &is); + + void swap(DacBc& rhs); + + DacBc(const DacBc&) = delete; + DacBc& operator=(const DacBc&) = delete; + +private: + std::array, 4> values_; + std::array flags_; + BitVector leaves_; + SmallVector links_; + uint8_t max_level_ = 0; + size_t num_free_nodes_ = 0; + + uint32_t access_(uint32_t i) const; +}; + +} //namespace - xcdat + +#endif //XCDAT_DAC_BC_HPP_ diff --git a/src/FastDacBc.cpp b/src/FastDacBc.cpp new file mode 100644 index 0000000..7976ba9 --- /dev/null +++ b/src/FastDacBc.cpp @@ -0,0 +1,153 @@ +#include "FastDacBc.hpp" + +namespace xcdat { + +FastDacBc::FastDacBc(const std::vector& bc) { + if (bc.empty()) { + return; + } + + BitVectorBuilder leaves(bc.size()); + std::vector links; + + std::get<0>(values_).reserve(bc.size() * 2); + ranks_[0].reserve((bc.size() * 2) / 128); + + auto append = [&](uint32_t value, bool is_leaf) { + if ((std::get<0>(values_).size() % 128) == 0) { + ranks_[0].push_back(static_cast(std::get<1>(values_).size())); + } + + if (is_leaf) { + std::get<0>(values_).push_back(static_cast(value & 0xFF)); + links.push_back(value >> 8); + return; + } + + if ((value / 128) == 0) { + 
std::get<0>(values_).push_back(static_cast(0 | (value << 1))); + return; + } else { + auto pos = std::get<1>(values_).size() - ranks_[0].back(); + std::get<0>(values_).push_back(static_cast(1 | (pos << 1))); + } + + if ((std::get<1>(values_).size() % 32768) == 0) { + ranks_[1].push_back(static_cast(std::get<2>(values_).size())); + } + + if ((value / 32768) == 0) { + std::get<1>(values_).push_back(static_cast(0 | (value << 1))); + return; + } else { + auto pos = std::get<2>(values_).size() - ranks_[1].back(); + std::get<1>(values_).push_back(static_cast(1 | (pos << 1))); + } + + std::get<2>(values_).push_back(value); + }; + + + for (uint32_t i = 0; i < bc.size(); ++i) { + if (!bc[i].is_used) { + append(0, false); + append(0, false); + ++num_free_nodes_; + } else { + const auto is_leaf = bc[i].is_leaf; + leaves.set_bit(i, is_leaf); + !is_leaf ? append(bc[i].base ^ i, false) : append(bc[i].base, true); + append(bc[i].check ^ i, false); + } + } + + // release + std::get<0>(values_).shrink_to_fit(); + std::get<1>(values_).shrink_to_fit(); + std::get<2>(values_).shrink_to_fit(); + ranks_[0].shrink_to_fit(); + ranks_[1].shrink_to_fit(); + BitVector{leaves}.swap(leaves_); + SmallVector{links}.swap(links_); +} + +size_t FastDacBc::size_in_bytes() const { + size_t ret = 0; + ret += util::size_in_bytes(std::get<0>(values_)); + ret += util::size_in_bytes(std::get<1>(values_)); + ret += util::size_in_bytes(std::get<2>(values_)); + for (auto& ranks : ranks_) { + ret += util::size_in_bytes(ranks); + } + ret += leaves_.size_in_bytes(); + ret += links_.size_in_bytes(); + ret += sizeof(num_free_nodes_); + return ret; +} + +void FastDacBc::show_stat(std::ostream& os) const { + const auto total_size = size_in_bytes(); + os << "basic statistics of xcdat::FastDacBc" << std::endl; + util::show_stat("\tnum links: ", links_.size(), os); + util::show_stat("\tbytes per node:", double(total_size) / num_nodes(), os); + os << "member size statistics of xcdat::FastDacBc" << std::endl; + 
util::show_stat("\tvalues_[0]:", util::size_in_bytes(std::get<0>(values_)), total_size, os); + util::show_stat("\tvalues_[1]:", util::size_in_bytes(std::get<1>(values_)), total_size, os); + util::show_stat("\tvalues_[2]:", util::size_in_bytes(std::get<2>(values_)), total_size, os); + util::show_stat("\tranks_[0]: ", util::size_in_bytes(ranks_[0]), total_size, os); + util::show_stat("\tranks_[1]: ", util::size_in_bytes(ranks_[1]), total_size, os); + util::show_stat("\tleaves_: ", leaves_.size_in_bytes(), total_size, os); + util::show_stat("\tlinks_: ", links_.size_in_bytes(), total_size, os); +} + +void FastDacBc::write(std::ostream& os) const { + util::write_vector(std::get<0>(values_), os); + util::write_vector(std::get<1>(values_), os); + util::write_vector(std::get<2>(values_), os); + for (auto& ranks : ranks_) { + util::write_vector(ranks, os); + } + leaves_.write(os); + links_.write(os); + util::write_value(num_free_nodes_, os); +} + +void FastDacBc::read(std::istream& is) { + util::read_vector(std::get<0>(values_), is); + util::read_vector(std::get<1>(values_), is); + util::read_vector(std::get<2>(values_), is); + for (auto& ranks : ranks_) { + util::read_vector(ranks, is); + } + leaves_.read(is); + links_.read(is); + util::read_value(num_free_nodes_, is); +} + +void FastDacBc::swap(FastDacBc& rhs) { + std::get<0>(values_).swap(std::get<0>(rhs.values_)); + std::get<1>(values_).swap(std::get<1>(rhs.values_)); + std::get<2>(values_).swap(std::get<2>(rhs.values_)); + for (uint32_t i = 0; i < ranks_.size(); ++i) { + ranks_[i].swap(rhs.ranks_[i]); + } + leaves_.swap(rhs.leaves_); + links_.swap(rhs.links_); + std::swap(num_free_nodes_, rhs.num_free_nodes_); +} + +uint32_t FastDacBc::access_(uint32_t i) const { + uint32_t value = std::get<0>(values_)[i] >> 1; + if ((std::get<0>(values_)[i] & 1U) == 0) { + return value; + } + i = ranks_[0][i / 128] + value; + value = std::get<1>(values_)[i] >> 1; + if ((std::get<1>(values_)[i] & 1U) == 0) { + return value; + } + i = 
ranks_[1][i / 32768] + value; + return std::get<2>(values_)[i]; +} + +} //namespace - xcdat diff --git a/src/FastDacBc.hpp b/src/FastDacBc.hpp new file mode 100644 index 0000000..f1fc2b6 --- /dev/null +++ b/src/FastDacBc.hpp @@ -0,0 +1,77 @@ +#ifndef XCDAT_FAST_DAC_BC_HPP_ +#define XCDAT_FAST_DAC_BC_HPP_ + +#include +#include + +#include "BitVector.hpp" +#include "SmallVector.hpp" + +namespace xcdat { + +// BASE/CHECK arrays using pointer-based DACs. +class FastDacBc { +public: + static constexpr uint32_t kFirstBits = 7; + + FastDacBc() {} + FastDacBc(const std::vector& bc); + + ~FastDacBc() {} + + uint32_t base(uint32_t i) const { + return access_(i * 2) ^ i; + } + uint32_t link(uint32_t i) const { + return std::get<0>(values_)[i * 2] | (links_[leaves_.rank(i)] << 8); + } + uint32_t check(uint32_t i) const { + return access_(i * 2 + 1) ^ i; + } + + bool is_leaf(uint32_t i) const { + return leaves_[i]; + } + bool is_used(uint32_t i) const { + return check(i) != i; + } + + size_t num_nodes() const { + return std::get<0>(values_).size() / 2; + } + size_t num_used_nodes() const { + return num_nodes() - num_free_nodes_; + } + size_t num_free_nodes() const { + return num_free_nodes_; + } + + size_t size_in_bytes() const; + + void show_stat(std::ostream& os) const; + + void write(std::ostream& os) const; + void read(std::istream& is); + + void swap(FastDacBc& rhs); + + FastDacBc(const FastDacBc&) = delete; + FastDacBc& operator=(const FastDacBc&) = delete; + +private: + std::tuple< + std::vector, + std::vector, + std::vector + > values_; + std::array, 2> ranks_; + BitVector leaves_; + SmallVector links_; + size_t num_free_nodes_ = 0; + + uint32_t access_(uint32_t i) const; +}; + +} //namespace - xcdat + +#endif //XCDAT_FAST_DAC_BC_HPP_ diff --git a/src/SmallVector.cpp b/src/SmallVector.cpp new file mode 100644 index 0000000..65cb7f2 --- /dev/null +++ b/src/SmallVector.cpp @@ -0,0 +1,65 @@ +#include "SmallVector.hpp" + +namespace xcdat { + +SmallVector::SmallVector(const 
std::vector& integers) { + if (integers.empty()) { + return; + } + + bits_ = 0; + auto max_value = *std::max_element(std::begin(integers), std::end(integers)); + do { + ++bits_; + max_value >>= 1; + } while (max_value); + + size_ = integers.size(); + chunks_.resize(size_ * bits_ / 32 + 1, 0); + mask_ = static_cast((1 << bits_) - 1); + + for (uint32_t i = 0; i < size_; ++i) { + const auto chunk_pos = i * bits_ / 32; + const auto offset = i * bits_ % 32; + + chunks_[chunk_pos] &= ~(mask_ << offset); + chunks_[chunk_pos] |= (integers[i] & mask_) << offset; + + if (32 < offset + bits_) { + chunks_[chunk_pos + 1] &= ~(mask_ >> (32 - offset)); + chunks_[chunk_pos + 1] |= (integers[i] & mask_) >> (32 - offset); + } + } +} + +size_t SmallVector::size_in_bytes() const { + size_t ret = 0; + ret += util::size_in_bytes(chunks_); + ret += sizeof(size_); + ret += sizeof(bits_); + ret += sizeof(mask_); + return ret; +} + +void SmallVector::write(std::ostream& os) const { + util::write_vector(chunks_, os); + util::write_value(size_, os); + util::write_value(bits_, os); + util::write_value(mask_, os); +} + +void SmallVector::read(std::istream& is) { + util::read_vector(chunks_, is); + util::read_value(size_, is); + util::read_value(bits_, is); + util::read_value(mask_, is); +} + +void SmallVector::swap(SmallVector& rhs) { + chunks_.swap(rhs.chunks_); + std::swap(size_, rhs.size_); + std::swap(bits_, rhs.bits_); + std::swap(mask_, rhs.mask_); +} + +} //namespace - xcdat diff --git a/src/SmallVector.hpp b/src/SmallVector.hpp new file mode 100644 index 0000000..823eb38 --- /dev/null +++ b/src/SmallVector.hpp @@ -0,0 +1,46 @@ +#ifndef XCDAT_SMALL_VECTOR_HPP_ +#define XCDAT_SMALL_VECTOR_HPP_ + +#include "xcdatBasics.hpp" + +namespace xcdat { + +class SmallVector { +public: + SmallVector() {} + SmallVector(const std::vector& integers); + + ~SmallVector() {} + + uint32_t operator[](uint32_t i) const { + auto chunk_pos = i * bits_ / 32; + auto offset = i * bits_ % 32; + if (offset + bits_ 
<= 32) { + return (chunks_[chunk_pos] >> offset) & mask_; + } else { + return ((chunks_[chunk_pos] >> offset) + | (chunks_[chunk_pos + 1] << (32 - offset))) & mask_; + } + } + + size_t size() const { return size_; } + size_t size_in_bytes() const; + + void write(std::ostream &os) const; + void read(std::istream &is); + + void swap(SmallVector& rhs); + + SmallVector(const SmallVector&) = delete; + SmallVector& operator=(const SmallVector&) = delete; + +private: + std::vector chunks_; + size_t size_ = 0; + uint32_t bits_ = 0; + uint32_t mask_ = 0; +}; + +} //namespace - xcdat + +#endif //XCDAT_SMALL_VECTOR_HPP_ diff --git a/src/Trie.hpp b/src/Trie.hpp new file mode 100644 index 0000000..da1b9df --- /dev/null +++ b/src/Trie.hpp @@ -0,0 +1,299 @@ +#ifndef XCDAT_TRIE_HPP_ +#define XCDAT_TRIE_HPP_ + +#include "TrieBuilder.hpp" +#include "DacBc.hpp" +#include "FastDacBc.hpp" + +namespace xcdat { + +// A static compressed string dictionary based on an improved double-array trie. +template +class Trie { +public: + using Type = Trie; + using BcType = typename std::conditional::type; + + static constexpr auto kDefaultLimit = static_cast(-1); + + Trie() {} + + // Builds a trie dictionary from a set of strings in lexicographical order. + Trie(const std::vector& strings) { + TrieBuilder builder{strings, BcType::kFirstBits}; + + BcType{builder.bc_}.swap(bc_); + BitVector{builder.terms_, true}.swap(terms_); + tail_ = std::move(builder.tail_); + alphabet_ = std::move(builder.alphabet_); + table_ = builder.table_; + + num_strings_ = strings.size(); + max_length_ = builder.max_length_; + } + + ~Trie() {} + + // Returns the string ID if stored, otherwise 'kNotFound'. + uint32_t lookup(CharRange string) const { + auto node_id = kRootId; + + while (!bc_.is_leaf(node_id)) { + if (string.begin == string.end) { + return terms_[node_id] ? 
to_string_id_(node_id) : kNotFound;
+      }
+
+      const auto child_id = bc_.base(node_id) ^ table_[*string.begin++];
+
+      if (bc_.check(child_id) != node_id) {
+        return kNotFound;
+      }
+
+      node_id = child_id;
+    }
+
+    // A leaf was reached: the unread part of the query has to be exactly the
+    // suffix stored in TAIL for the string to be registered.
+    if (match_(string, bc_.link(node_id))) {
+      return to_string_id_(node_id);
+    }
+    return kNotFound;
+  }
+
+  // Returns the string registered under 'id', i.e. the inverse of lookup().
+  // An empty string is returned when 'id' is not smaller than the number of
+  // stored strings.
+  std::string access(uint32_t id) const {
+    if (num_strings_ <= id) {
+      return {};
+    }
+
+    std::string ret;
+    ret.reserve(max_length_);
+
+    auto node_id = to_node_id_(id);
+    // Remember the TAIL position before climbing; kNotFound marks "no TAIL part".
+    const auto link = bc_.is_leaf(node_id) ? bc_.link(node_id) : kNotFound;
+
+    // Edge labels are gathered from the node up to the root, so they come out
+    // in reverse order and are flipped afterwards.
+    while (node_id != kRootId) {
+      const auto parent_id = bc_.check(node_id);
+      ret += edge_(parent_id, node_id);
+      node_id = parent_id;
+    }
+
+    std::reverse(std::begin(ret), std::end(ret));
+    if (link != kNotFound) {
+      // The rest of the string is kept NUL-terminated in TAIL.
+      ret += reinterpret_cast(tail_.data()) + link;
+    }
+
+    return ret; // expecting move semantics
+  }
+
+  // Appends to 'ids' the IDs of the stored strings that are prefixes of
+  // 'string' (shortest first) and returns how many IDs were appended. At most
+  // 'limit' IDs are collected; 'ids' is not cleared beforehand.
+  size_t common_prefix_lookup(CharRange string, std::vector& ids,
+                              size_t limit = kDefaultLimit) const {
+    if (limit == 0) {
+      return 0;
+    }
+
+    auto node_id = kRootId;
+    size_t num_ids = 0;
+
+    while (!bc_.is_leaf(node_id)) {
+      if (terms_[node_id]) {
+        ids.push_back(to_string_id_(node_id));
+        ++num_ids;
+        if (num_ids == limit) {
+          return num_ids;
+        }
+      }
+      if (string.begin == string.end) {
+        return num_ids;
+      }
+
+      const auto child_id = bc_.base(node_id) ^ table_[*string.begin++];
+
+      if (bc_.check(child_id) != node_id) {
+        return num_ids;
+      }
+
+      node_id = child_id;
+    }
+
+    // A leaf was reached. Its string is a prefix of the query when the whole
+    // NUL-terminated TAIL suffix is matched by the beginning of the unread
+    // query; the query may continue past it. (An exact match_() here would
+    // drop such strings whenever the query is longer than the stored one.)
+    auto tail = tail_.data() + bc_.link(node_id);
+    for (auto it = string.begin; *tail != '\0'; ++it, ++tail) {
+      if (it == string.end || *it != *tail) {
+        return num_ids;
+      }
+    }
+
+    ids.push_back(to_string_id_(node_id));
+    ++num_ids;
+    return num_ids;
+  }
+
+  // Enumerates the IDs of the stored strings that start with the given string.
+  // The matching IDs are appended to 'ids' (which is never cleared) and the
+  // number of appended IDs is returned. At most 'limit' IDs are collected; a
+  // 'limit' of 0 yields nothing, as in common_prefix_lookup().
+  size_t predictive_lookup(CharRange string, std::vector& ids,
+                           size_t limit = kDefaultLimit) const {
+    // Without this guard the leaf branch below would still report one ID and
+    // enumerate_ids_() would never observe 'num_ids == limit'.
+    if (limit == 0) {
+      return 0;
+    }
+
+    auto node_id = kRootId;
+
+    for (; string.begin != string.end; ++string.begin) {
+      if (bc_.is_leaf(node_id)) {
+        // Only one string lives below a leaf: it qualifies when the unread
+        // query is a prefix of the suffix kept in TAIL.
+        if (prefix_match_(string, bc_.link(node_id))) {
+          ids.push_back(to_string_id_(node_id));
+          return 1;
+        }
+        return 0;
+      }
+
+      const auto child_id = bc_.base(node_id) ^ table_[*string.begin];
+
+      if (bc_.check(child_id) != node_id) {
+        return 0;
+      }
+      node_id = child_id;
+    }
+
+    // The whole query was consumed: every string below 'node_id' matches.
+    size_t num_ids = 0;
+    enumerate_ids_(node_id, ids, num_ids, limit);
+    return num_ids;
+  }
+
+  // Number of strings stored in the dictionary.
+  size_t num_strings() const {
+    return num_strings_;
+  }
+  // Number of entries in 'alphabet_'.
+  size_t alphabet_size() const {
+    return alphabet_.size();
+  }
+
+  // Node counters, forwarded from the BASE/CHECK representation.
+  size_t num_nodes() const {
+    return bc_.num_nodes();
+  }
+  size_t num_used_nodes() const {
+    return bc_.num_used_nodes();
+  }
+  size_t num_free_nodes() const {
+    return bc_.num_free_nodes();
+  }
+
+  // Returns the number of bytes, summed over the same members that write()
+  // serializes.
+  size_t size_in_bytes() const {
+    size_t ret = 0;
+    ret += bc_.size_in_bytes();
+    ret += terms_.size_in_bytes();
+    ret += util::size_in_bytes(tail_);
+    ret += util::size_in_bytes(alphabet_);
+    ret += sizeof(table_);
+    ret += sizeof(num_strings_);
+    ret += sizeof(max_length_);
+    return ret;
+  }
+
+  // Dumps statistics of the dictionary.
+ void show_stat(std::ostream& os) const { + const auto total_size = size_in_bytes(); + os << "basic statistics of xcdat::Trie" << std::endl; + util::show_stat("\tnum strings: ", num_strings(), os); + util::show_stat("\talphabet size: ", alphabet_size(), os); + util::show_stat("\tnum nodes: ", num_nodes(), os); + util::show_stat("\tnum used_nodes:", num_used_nodes(), os); + util::show_stat("\tnum free_nodes:", num_free_nodes(), os); + util::show_stat("\tsize in bytes: ", size_in_bytes(), os); + os << "member size statistics of xcdat::Trie" << std::endl; + util::show_stat("\tbc_: ", bc_.size_in_bytes(), total_size, os); + util::show_stat("\tterms_: ", terms_.size_in_bytes(), total_size, os); + util::show_stat("\ttail_: ", util::size_in_bytes(tail_), total_size, os); + util::show_stat("\talphabet_:", util::size_in_bytes(alphabet_), total_size, os); + util::show_stat("\ttable_: ", sizeof(table_), total_size, os); + bc_.show_stat(os); + } + + void write(std::ostream& os) const { + bc_.write(os); + terms_.write(os); + util::write_vector(tail_, os); + util::write_vector(alphabet_, os); + util::write_value(table_, os); + util::write_value(num_strings_, os); + util::write_value(max_length_, os); + } + + void read(std::istream& is) { + bc_.read(is); + terms_.read(is); + util::read_vector(tail_, is); + util::read_vector(alphabet_, is); + util::read_value(table_, is); + util::read_value(num_strings_, is); + util::read_value(max_length_, is); + } + + void swap(Type& rhs) { + bc_.swap(rhs.bc_); + terms_.swap(rhs.terms_); + tail_.swap(rhs.tail_); + alphabet_.swap(rhs.alphabet_); + table_.swap(rhs.table_); + std::swap(num_strings_, rhs.num_strings_); + std::swap(max_length_, rhs.max_length_); + } + + Trie(const Trie&) = delete; + Trie& operator=(const Trie&) = delete; + +private: + BcType bc_; + BitVector terms_; + std::vector tail_; + std::vector alphabet_; + std::array table_; // table[table[c] + 256] = c + + size_t num_strings_ = 0; + size_t max_length_ = 0; + + uint32_t 
to_string_id_(uint32_t node_id) const { + return terms_.rank(node_id); + }; + uint32_t to_node_id_(uint32_t string_id) const { + return terms_.select(string_id); + }; + uint8_t edge_(uint32_t node_id, uint32_t child_id) const { + return table_[static_cast(bc_.base(node_id) ^ child_id) + 256]; + } + + bool match_(CharRange string, uint32_t link) const { + auto tail = tail_.data() + link; + for (auto it = string.begin; it != string.end; ++it, ++tail) { + if (*tail == '\0' || *it != *tail) { + return false; + } + } + return *tail == '\0'; + } + + bool prefix_match_(CharRange string, uint32_t link) const { + auto tail = tail_.data() + link; + for (auto it = string.begin; it != string.end; ++it, ++tail) { + if (*tail == '\0' || *it != *tail) { + return false; + } + } + return true; + } + + void enumerate_ids_(uint32_t node_id, std::vector& ids, + size_t& num_ids, size_t limit) const { + if (terms_[node_id]) { + ids.push_back(to_string_id_(node_id)); + ++num_ids; + if (bc_.is_leaf(node_id)) { + return; + } + } + const auto base = bc_.base(node_id); + for (const auto label : alphabet_) { + if (num_ids == limit) { + break; + } + const auto child_id = base ^ table_[label]; + if (bc_.check(child_id) == node_id) { + enumerate_ids_(child_id, ids, num_ids, limit); + } + } + } +}; + +} //namespace - xcdat + +#endif //XCDAT_TRIE_HPP_ diff --git a/src/TrieBuilder.cpp b/src/TrieBuilder.cpp new file mode 100644 index 0000000..bce1e80 --- /dev/null +++ b/src/TrieBuilder.cpp @@ -0,0 +1,306 @@ +#include "TrieBuilder.hpp" + +namespace xcdat { + +TrieBuilder::TrieBuilder(const std::vector& strings, + uint32_t first_bit_size) + : strings_(strings), block_size_(1U << first_bit_size) { + if (strings_.empty() || kBcUpper < strings_.size()) { + XCDAT_THROW("The number of strings is out of range."); + } + + { + size_t init_capacity = 1; + while (init_capacity < strings_.size()) { + init_capacity <<= 1; + } + bc_.reserve(init_capacity); + terms_.reserve(init_capacity); + 
heads_.reserve(init_capacity / block_size_); + } + + alphabet_.reserve(256); + edges_.reserve(256); + suffixes_.reserve(strings_.size()); + + for (uint32_t i = 0; i < 256; ++i) { + bc_.push_back({i + 1, false, i - 1, false}); + terms_.push_back(false); + } + bc_[255].base = 0; + bc_[0].check = 255; + + for (uint32_t i = 0; i < 256; i += block_size_) { + heads_.emplace_back(i); + } + + use_(kRootId); + bc_[kRootId].check = kTabooId; + bc_[kTabooId].is_used = true; + heads_[kTabooId / block_size_] = bc_[kTabooId].base; + + build_table_(); + build_bc_(0, strings_.size(), 0, kRootId); + build_tail_(); +} + +void TrieBuilder::build_table_() { + std::array, 256> table_builder; + + for (uint32_t i = 0; i < 256; ++i) { + table_builder[i] = {static_cast(i), 0}; + } + + auto char_count = [&](const CharRange& string) { + for (auto it = string.begin; it != string.end; ++it) { + ++table_builder[*it].second; + } + }; + + char_count(strings_[0]); + max_length_ = strings_[0].length(); + + for (size_t i = 1; i < strings_.size(); ++i) { + if (!(strings_[i - 1] < strings_[i])) { + XCDAT_THROW("The input strings do not consist of a set in lexicographical order."); + } + char_count(strings_[i]); + max_length_ = std::max(max_length_, strings_[i].length()); + } + + if (table_builder[0].second) { + XCDAT_THROW("The input strings include an ASCII zero character."); + } + + for (const auto& item : table_builder) { + if (item.second != 0) { + alphabet_.push_back(item.first); + } + } + alphabet_.shrink_to_fit(); + + std::sort(std::begin(table_builder), std::end(table_builder), + [](const std::pair& lhs, const std::pair& rhs) { + return lhs.second > rhs.second; + }); + + for (uint32_t i = 0; i < 256; ++i) { + table_[table_builder[i].first] = static_cast(i); + } + + for (uint32_t i = 0; i < 256; ++i) { + table_[table_[i] + 256] = static_cast(i); + } +} + +void TrieBuilder::build_bc_(size_t begin, size_t end, size_t depth, uint32_t node_id) { + if (strings_[begin].length() == depth) { + 
terms_.set_bit(node_id, true); + if (++begin == end) { // without link + bc_[node_id].base = 0; + bc_[node_id].is_leaf = true; + return; + } + } else if (begin + 1 == end) { // leaf + terms_.set_bit(node_id, true); + auto& string = strings_[begin]; + suffixes_.push_back({{string.begin + depth, string.end}, node_id}); + return; + } + + { // Fetching edges + edges_.clear(); + auto label = strings_[begin].begin[depth]; + for (auto str_id = begin + 1; str_id < end; ++str_id) { + const auto _label = strings_[str_id].begin[depth]; + if (label != _label) { + edges_.push_back(label); + label = _label; + } + } + edges_.push_back(label); + } + + const auto base = find_base_(node_id / block_size_); + if (bc_.size() <= base) { + expand_(); + } + + // Defining new edges + bc_[node_id].base = base; + for (const auto label : edges_) { + const auto child_id = base ^ table_[label]; + use_(child_id); + bc_[child_id].check = node_id; + } + + // Following children + auto _begin = begin; + auto label = strings_[begin].begin[depth]; + for (auto _end = begin + 1; _end < end; ++_end) { + const auto _label = strings_[_end].begin[depth]; + if (label != _label) { + build_bc_(_begin, _end, depth + 1, base ^ table_[label]); + label = _label; + _begin = _end; + } + } + build_bc_(_begin, end, depth + 1, base ^ table_[label]); +} + +void TrieBuilder::build_tail_() { + auto can_unify = [](const Suffix& lhs, const Suffix& rhs) { + if (lhs.string.length() > rhs.string.length()) { + return false; + } + + auto lhs_range = std::make_pair(lhs.rbegin(), lhs.rend()); + auto rhs_range = std::make_pair(rhs.rbegin(), rhs.rend()); + + while (lhs_range.first != lhs_range.second) { + if (*lhs_range.first != *rhs_range.first) { + return false; + } + ++lhs_range.first; + ++rhs_range.first; + } + return true; + }; + + std::sort(std::begin(suffixes_), std::end(suffixes_), + [](const Suffix& lhs, const Suffix& rhs) { + return std::lexicographical_compare( + lhs.rbegin(), lhs.rend(), rhs.rbegin(), rhs.rend() + ); + 
}); + + tail_.push_back('\0'); // for an empty suffix + + size_t begin = 0; + for (size_t i = 1; i < suffixes_.size(); ++i) { + const auto& lhs = suffixes_[i - 1]; + const auto& rhs = suffixes_[i]; + + if (can_unify(lhs, rhs)) { + continue; + } + + append_tail_(begin, i, lhs.string); + begin = i; + } + + append_tail_(begin, suffixes_.size(), suffixes_.back().string); + tail_.shrink_to_fit(); +} + +void TrieBuilder::expand_() { + if (kBcUpper < bc_.size() + 256) { + XCDAT_THROW("The length of BASE/CHECK is out of range."); + } + + const auto old_size = static_cast(bc_.size()); + const auto new_size = old_size + 256; + + for (auto i = old_size; i < new_size; ++i) { + bc_.push_back({i + 1, false, i - 1, false}); + terms_.push_back(false); + } + + { + const auto last = bc_[kTabooId].check; + bc_[old_size].check = last; + bc_[last].base = old_size; + bc_[new_size - 1].base = kTabooId; + bc_[kTabooId].check = new_size - 1; + } + + for (auto i = old_size; i < new_size; i += block_size_) { + heads_.push_back(i); + } + + const auto block_id = old_size / 256; + if (kFreeBlocks <= block_id) { + close_block_(block_id - kFreeBlocks); + } +} + +void TrieBuilder::use_(uint32_t node_id) { + bc_[node_id].is_used = true; + + const auto next = bc_[node_id].base; + const auto prev = bc_[node_id].check; + bc_[prev].base = next; + bc_[next].check = prev; + + const auto block_id = node_id / block_size_; + if (heads_[block_id] == node_id) { + heads_[block_id] = (block_id != next / block_size_) ? kTabooId : next; + } +} + +void TrieBuilder::close_block_(uint32_t block_id) { + const auto begin = block_id * 256; + const auto end = begin + 256; + + for (auto i = begin; i < end; ++i) { + if (!bc_[i].is_used) { + use_(i); + bc_[i].is_used = false; + } + } + + for (auto i = begin; i < end; i += block_size_) { + heads_[i / block_size_] = kTabooId; + } +} + +uint32_t TrieBuilder::find_base_(uint32_t block_id) const { + if (bc_[kTabooId].base == kTabooId) { // Full? 
+ return static_cast(bc_.size()) ^ table_[edges_[0]]; + } + + // search in the same block + for (auto i = heads_[block_id]; i != kTabooId && i / block_size_ == block_id; i = bc_[i].base) { + const auto base = i ^ table_[edges_[0]]; + if (is_target_(base)) { + return base; // base / block_size_ == block_id + } + } + + for (auto i = bc_[kTabooId].base; i != kTabooId; i = bc_[i].base) { + const auto base = i ^ table_[edges_[0]]; + if (is_target_(base)) { + return base; // base / block_size_ != block_id + } + } + + return static_cast(bc_.size()) ^ table_[edges_[0]]; +} + +bool TrieBuilder::is_target_(uint32_t base) const { + for (const auto label : edges_) { + const auto i = base ^ table_[label]; + if (bc_[i].is_used) { + return false; + } + } + return true; +} + +void TrieBuilder::append_tail_(size_t begin, size_t end, const CharRange& string) { + for (auto it = string.begin; it != string.end; ++it) { + tail_.push_back(*it); + } + while (begin < end) { + const auto& suffix = suffixes_[begin++]; + const auto tail_offset = tail_.size() - suffix.string.length(); + if (kBcUpper < tail_offset) { + XCDAT_THROW("A pointer to TAIL is out of range."); + } + bc_[suffix.node_id].base = static_cast(tail_offset); + bc_[suffix.node_id].is_leaf = true; + } + tail_.push_back('\0'); // terminator +} + +} //namespace - xcdat diff --git a/src/TrieBuilder.hpp b/src/TrieBuilder.hpp new file mode 100644 index 0000000..caf0563 --- /dev/null +++ b/src/TrieBuilder.hpp @@ -0,0 +1,68 @@ +#ifndef XCDAT_TRIE_BUILDER_HPP_ +#define XCDAT_TRIE_BUILDER_HPP_ + +#include + +#include "BitVectorBuilder.hpp" + +namespace xcdat { + +template +class Trie; // prototype + +// +class TrieBuilder { +public: + friend class Trie; + friend class Trie; + + static constexpr uint32_t kFreeBlocks = 16; // inspired by darts-clone + + TrieBuilder(const std::vector& strings, uint32_t first_bit_size); + ~TrieBuilder() {} + + TrieBuilder(const TrieBuilder&) = delete; + TrieBuilder& operator=(const TrieBuilder&) = delete; + 
+private: + struct Suffix { + CharRange string; + uint32_t node_id; + std::reverse_iterator rbegin() const { + return std::reverse_iterator(string.end); + } + std::reverse_iterator rend() const { + return std::reverse_iterator(string.begin); + } + }; + + const std::vector& strings_; + const uint32_t block_size_; + + std::vector bc_; + BitVectorBuilder terms_; + std::vector tail_; + std::vector alphabet_; + std::array table_; + + std::vector edges_; + std::vector heads_; + std::vector suffixes_; + + size_t max_length_ = 0; + + void build_table_(); + void build_bc_(size_t begin, size_t end, size_t depth, uint32_t node_id); + void build_tail_(); + + void expand_(); + void use_(uint32_t node_id); + void close_block_(uint32_t block_id); + uint32_t find_base_(uint32_t block_id) const; + bool is_target_(uint32_t base) const; + void append_tail_(size_t begin, size_t end, const CharRange& string); +}; + +} //namespace - xcdat + +#endif //XCDAT_TRIE_BUILDER_HPP_ diff --git a/src/testBc.cpp b/src/testBc.cpp new file mode 100644 index 0000000..0665cf0 --- /dev/null +++ b/src/testBc.cpp @@ -0,0 +1,73 @@ +#undef NDEBUG + +#include +#include +#include + +#include "DacBc.hpp" +#include "FastDacBc.hpp" + +using namespace xcdat; + +namespace { + +constexpr size_t kSize = (1U << 16); +constexpr uint32_t kUpper = (1U << 31) - 1; + +std::vector make_bc() { + std::random_device rnd; + std::vector ret; + + for (size_t i = 0; i < kSize; ++i) { + BcItem item; + item.base = rnd() % kUpper; + item.check = rnd() % kUpper; + switch (rnd() % 3) { + case 0: // internal node + item.is_used = true; + break; + case 1: // leaf node + item.is_leaf = true; + item.is_used = true; + break; + case 2: // free node + break; + default: + break; + } + ret.push_back(item); + } + + return ret; +} + +template +void test_bc(const std::vector& orig_bc) { + Bc bc{orig_bc}; + + assert(bc.num_nodes() == orig_bc.size()); + for (uint32_t i = 0; i < orig_bc.size(); ++i) { + assert(bc.is_used(i) == orig_bc[i].is_used); 
+ if (!bc.is_used(i)) { + continue; + } + assert(bc.check(i) == orig_bc[i].check); + assert(bc.is_leaf(i) == orig_bc[i].is_leaf); + if (!bc.is_leaf(i)) { + assert(bc.base(i) == orig_bc[i].base); + } else { + assert(bc.link(i) == orig_bc[i].base); + } + } +} + +} // namespace + +int main() { + auto orig_bc = make_bc(); + + test_bc(orig_bc); + test_bc(orig_bc); + + return 0; +} diff --git a/src/testTrie.cpp b/src/testTrie.cpp new file mode 100644 index 0000000..f995c65 --- /dev/null +++ b/src/testTrie.cpp @@ -0,0 +1,208 @@ +#undef NDEBUG + +#include +#include +#include +#include + +#include "Trie.hpp" + +using namespace xcdat; + +namespace { + +constexpr size_t kNumStrings = 1U << 10; +constexpr size_t kMaxLength = 20; + +void to_set(std::vector& strings) { + std::sort(std::begin(strings), std::end(strings)); + strings.erase(std::unique(std::begin(strings), std::end(strings)), std::end(strings)); +} + +std::string make_string() { + std::random_device rnd; + + std::string str; + size_t length = (rnd() % kMaxLength) + 1; + for (size_t j = 0; j < length; ++j) { + str += 'A' + (rnd() % 26); + } + + return str; +} + +void make_strings(std::vector& strings) { + strings.clear(); + strings.reserve(kNumStrings); + + for (size_t i = 0; i < kNumStrings; ++i) { + strings.push_back(make_string()); + } + + to_set(strings); +} + +void make_other_strings(const std::vector& strings, std::vector& others) { + others.clear(); + + for (size_t i = 0; i < kNumStrings; ++i) { + auto string = make_string(); + if (std::find(std::begin(strings), std::end(strings), string) == std::end(strings)) { + others.push_back(string); + } + } + + to_set(others); +} + +template +void test_build(Trie& trie, const std::vector& strings) { + std::cerr << "Construction -> build()" << std::endl; + + Trie{strings}.swap(trie); + assert(trie.num_strings() == strings.size()); +} + +template +void test_basic_operations(const Trie& trie, const std::vector& strings, + const std::vector& others) { + std::cerr << "Basic 
operations -> lookup() and access()" << std::endl; + + for (size_t i = 0; i < strings.size(); ++i) { + const auto id = trie.lookup(strings[i]); + assert(id != kNotFound); + assert(CharRange{trie.access(id)} == strings[i]); + } + + for (size_t i = 0; i < others.size(); ++i) { + const auto id = trie.lookup(others[i]); + assert(id == kNotFound); + } +} + +template +void test_prefix_operations(const Trie& trie, const std::vector& strings, + const std::vector& others) { + std::cerr << "Prefix operations -> common_prefix_lookup()" << std::endl; + + for (auto& string : strings) { + std::vector ids; + auto num_ids = trie.common_prefix_lookup(string, ids); + + assert(1 <= num_ids); + assert(num_ids <= kMaxLength); + assert(num_ids == ids.size()); + + for (auto id : ids) { + assert(trie.access(id).length() <= string.length()); + } + } + + for (auto& other : others) { + std::vector ids; + auto num_ids = trie.common_prefix_lookup(other, ids); + + assert(num_ids <= kMaxLength); + assert(num_ids == ids.size()); + + for (auto id : ids) { + assert(trie.access(id).length() < other.length()); + } + } +} + +template +void test_predictive_operations(const Trie& trie, const std::vector& strings, + const std::vector& others) { + std::cerr << "Predictive operations -> predictive_lookup()" << std::endl; + + for (auto& string : strings) { + std::vector ids; + auto num_ids = trie.predictive_lookup(string, ids); + + assert(1 <= num_ids); + assert(num_ids == ids.size()); + + for (auto id : ids) { + assert(string.length() <= trie.access(id).length()); + } + } + + for (auto& other : others) { + std::vector ids; + auto num_ids = trie.predictive_lookup(other, ids); + + assert(num_ids == ids.size()); + + for (auto id : ids) { + assert(other.length() < trie.access(id).length()); + } + } +} + +template +void test_io(const Trie& trie) { + std::cerr << "File I/O -> write() and read()" << std::endl; + + const char* file_name = "test.trie"; + { + std::ofstream ofs{file_name}; + trie.write(ofs); + } + { 
+ std::ifstream ifs{file_name}; + auto size = static_cast(ifs.seekg(0, std::ios::end).tellg()); + assert(size == trie.size_in_bytes()); + } + + Trie _trie; + { + std::ifstream ifs{file_name}; + _trie.read(ifs); + } + + assert(trie.num_strings() == _trie.num_strings()); + assert(trie.alphabet_size() == _trie.alphabet_size()); + assert(trie.num_nodes() == _trie.num_nodes()); + assert(trie.num_used_nodes() == _trie.num_used_nodes()); + assert(trie.num_free_nodes() == _trie.num_free_nodes()); + assert(trie.size_in_bytes() == _trie.size_in_bytes()); +} + +template +void test_trie(const std::vector& strings, const std::vector& others) { + Trie trie; + + std::cerr << "Testing xcdat::Trie<" << (Fast ? "true" : "false") << ">" << std::endl; + test_build(trie, strings); + test_basic_operations(trie, strings, others); + test_prefix_operations(trie, strings, others); + test_predictive_operations(trie, strings, others); + test_io(trie); + std::cerr << "OK!" << std::endl; +} + +} // namespace + +int main() { + std::vector strings_buffer; + make_strings(strings_buffer); + + std::vector others_buffer; + make_other_strings(strings_buffer, others_buffer); + + std::vector strings(strings_buffer.size()); + for (size_t i = 0; i < strings.size(); ++i) { + strings[i] = {strings_buffer[i]}; + } + + std::vector others(others_buffer.size()); + for (size_t i = 0; i < others.size(); ++i) { + others[i] = {others_buffer[i]}; + } + + test_trie(strings, others); + test_trie(strings, others); + + return 0; +} diff --git a/src/testVector.cpp b/src/testVector.cpp new file mode 100644 index 0000000..14298b8 --- /dev/null +++ b/src/testVector.cpp @@ -0,0 +1,73 @@ +#undef NDEBUG + +#include +#include +#include + +#include "BitVector.hpp" +#include "SmallVector.hpp" + +using namespace xcdat; + +namespace { + +constexpr size_t kSize = 1U << 16; + +void test_bit_vector() { + std::vector orig_bit_vector; + { + std::random_device rnd; + for (size_t i = 0; i < kSize; ++i) { + orig_bit_vector.push_back(rnd() 
% 2 == 0); + } + } + + BitVector bit_vector; + { + BitVectorBuilder builder; + for (size_t i = 0; i < kSize; ++i) { + builder.push_back(orig_bit_vector[i]); + } + BitVector{builder, true}.swap(bit_vector); + } + + assert(bit_vector.size() == kSize); + + uint32_t sum = 0; + for (uint32_t i = 0; i < kSize; ++i) { + assert(bit_vector[i] == orig_bit_vector[i]); + if (bit_vector[i]) { + assert(sum == bit_vector.rank(i)); + assert(i == bit_vector.select(sum)); + ++sum; + } + } + + assert(bit_vector.num_1s() == sum); + assert(bit_vector.num_0s() == kSize - sum); +} + +void test_small_vector() { + std::vector orig_vector; + { + std::random_device rnd; + for (size_t i = 0; i < kSize; ++i) { + orig_vector.push_back(rnd() & UINT16_MAX); + } + } + + SmallVector small_vector{orig_vector}; + assert(orig_vector.size() == small_vector.size()); + + for (size_t i = 0; i < kSize; ++i) { + assert(orig_vector[i] == small_vector[i]); + } +} + +} // namespace + +int main() { + test_bit_vector(); + test_small_vector(); + return 0; +} diff --git a/src/xcdat.cpp b/src/xcdat.cpp new file mode 100644 index 0000000..726b1cd --- /dev/null +++ b/src/xcdat.cpp @@ -0,0 +1,305 @@ +#include +#include +#include + +#include "Trie.hpp" + +using namespace xcdat; + +namespace { + +constexpr uint32_t kRuns = 10; + +enum class Times { + SEC, MILLI, MICRO +}; + +class StopWatch { +public: + StopWatch() : tp_(std::chrono::high_resolution_clock::now()) {} + ~StopWatch() {} + + double operator()(Times time) const { + const auto tp = std::chrono::high_resolution_clock::now() - tp_; + switch (time) { + case Times::SEC: + return std::chrono::duration(tp).count(); + case Times::MILLI: + return std::chrono::duration(tp).count(); + case Times::MICRO: + return std::chrono::duration(tp).count(); + } + return 0.0; + } + + StopWatch(const StopWatch&) = delete; + StopWatch& operator=(const StopWatch&) = delete; + +private: + std::chrono::high_resolution_clock::time_point tp_; +}; + +class StringBuffer { +public: + 
StringBuffer() {} + ~StringBuffer() {} + + bool load(const char* file_path) { + std::ifstream ifs{file_path}; + if (!ifs) { + return false; + } + + std::string line; + while (std::getline(ifs, line)) { + offsets_.push_back(buffer_.size()); + for (uint8_t c : line) { + buffer_.push_back(c); + } + } + offsets_.push_back(buffer_.size()); + + buffer_.shrink_to_fit(); + offsets_.shrink_to_fit(); + + return true; + } + + void extract(std::vector& strings) const { + strings.clear(); + strings.resize(offsets_.size() - 1); + for (size_t i = 0; i < strings.size(); ++i) { + strings[i] = {buffer_.data() + offsets_[i], buffer_.data() + offsets_[i + 1]}; + } + } + + size_t raw_size() const { // including a terminators + return buffer_.size() + offsets_.size() - 1; + } + + StringBuffer(const StringBuffer&) = delete; + StringBuffer& operator=(const StringBuffer&) = delete; + +private: + std::vector buffer_; + std::vector offsets_; +}; + +void show_usage(std::ostream& os) { + os << "xcdat build " << std::endl; + os << "\t\t'1' for DACs; '2' for FDACs." << std::endl; + os << "\t \tinput file of the set of strings." << std::endl; + os << "\t\toutput file for storing the dictionary." << std::endl; + os << "xcdat query " << std::endl; + os << "\t \t'1' for DACs; '2' for FDACs." << std::endl; + os << "\t \tinput file of the dictionary." << std::endl; + os << "\t\tlimit at lookup (default=10)." << std::endl; + os << "xcdat bench " << std::endl; + os << "\t\t'1' for DACs; '2' for FDACs." << std::endl; + os << "\t\tinput file of the dictionary." << std::endl; + os << "\t \tinput file of strings for benchmark." 
<< std::endl; +} + +template +int build(std::vector& args) { + if (args.size() != 4) { + show_usage(std::cerr); + return 1; + } + + StringBuffer buffer; + if (!buffer.load(args[2].c_str())) { + std::cerr << "open error : " << args[2] << std::endl; + return 1; + } + + std::vector strings; + buffer.extract(strings); + + Trie trie; + try { + StopWatch sw; + Trie{strings}.swap(trie); + std::cout << "constr. time: " << sw(Times::SEC) << " sec" << std::endl; + } catch (const xcdat::Exception& ex) { + std::cerr << ex.what() << " : " << ex.file_name() << " : " + << ex.line() << " : " << ex.func_name() << std::endl; + return 1; + } + + std::cout << "cmpr. ratio: " << (double) trie.size_in_bytes() / buffer.raw_size() << std::endl; + std::cout << "trie stat:" << std::endl; + trie.show_stat(std::cout); + + { + std::ofstream ofs{args[3]}; + if (!ofs) { + std::cerr << "open error : " << args[3] << std::endl; + return 1; + } + trie.write(ofs); + } + + return 0; +} + +template +int query(std::vector& args) { + if (args.size() != 3 && args.size() != 4) { + show_usage(std::cerr); + return 1; + } + + Trie trie; + { + std::ifstream ifs{args[2]}; + if (!ifs) { + std::cerr << "open error : " << args[2] << std::endl; + return 1; + } + trie.read(ifs); + } + + size_t limit = 10; + if (args.size() == 4) { + limit = std::stoull(args.back()); + } + + std::string query; + std::vector ids; + + while (true){ + putchar('>'); + getline(std::cin, query); + if (query.size() == 0){ + break; + } + + std::cout << "lookup()" << std::endl; + auto id = trie.lookup({query}); + if (id == kNotFound) { + std::cout << "not found" << std::endl; + } else { + std::cout << id << '\t' << query << std::endl; + } + + std::cout << "common_prefix_lookup()" << std::endl; + ids.clear(); + trie.common_prefix_lookup({query}, ids); + std::cout << ids.size() << " found" << std::endl; + for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { + std::cout << ids[i] << '\t' << trie.access(ids[i]) << std::endl; + } + + 
std::cout << "predictive_lookup()" << std::endl; + ids.clear(); + trie.predictive_lookup({query}, ids); + std::cout << ids.size() << " found" << std::endl; + for (size_t i = 0; i < std::min(ids.size(), limit); ++i) { + std::cout << ids[i] << '\t' << trie.access(ids[i]) << std::endl; + } + } + + return 0; +} + +template +int bench(std::vector& args) { + if (args.size() != 4) { + show_usage(std::cerr); + return 1; + } + + Trie trie; + { + std::ifstream ifs{args[2]}; + if (!ifs) { + std::cerr << "open error : " << args[2] << std::endl; + return 1; + } + trie.read(ifs); + } + + StringBuffer buffer; + if (!buffer.load(args[3].c_str())) { + std::cerr << "open error : " << args[3] << std::endl; + return 1; + } + + std::vector strings; + buffer.extract(strings); + + std::vector ids(strings.size()); + for (size_t i = 0; i < strings.size(); ++i) { + ids[i] = trie.lookup(strings[i]); + } + + { + std::cout << "Lookup benchmark on " << kRuns << " runs" << std::endl; + + StopWatch sw; + for (uint32_t r = 0; r < kRuns; ++r) { + for (size_t i = 0; i < strings.size(); ++i) { + if (trie.lookup(strings[i]) == kNotFound) { + std::cerr << "Failed to lookup " << strings[i] << std::endl; + return 1; + } + } + } + + std::cout << sw(Times::MICRO) / kRuns / strings.size() << " us per str" << std::endl; + } + + { + std::cout << "Access benchmark on " << kRuns << " runs" << std::endl; + + StopWatch sw; + for (uint32_t r = 0; r < kRuns; ++r) { + for (size_t i = 0; i < ids.size(); ++i) { + if (trie.access(ids[i]).empty()) { + std::cerr << "Failed to access " << ids[i] << std::endl; + return 1; + } + } + } + + std::cout << sw(Times::MICRO) / kRuns / ids.size() << " us per ID" << std::endl; + } + + return 0; +} + +} // namespace + +int main(int argc, const char* argv[]) { + if (argc < 3) { + show_usage(std::cerr); + return 1; + } + + std::vector args; + for (int i = 1; i < argc; ++i) { + args.push_back({argv[i]}); + } + + bool is_fast; + if (args[1].front() == '1') { + is_fast = false; + } else 
if (args[1].front() == '2') { + is_fast = true; + } else { + show_usage(std::cerr); + return 1; + } + + if (args[0] == "build") { + return is_fast ? build(args) : build(args); + } else if (args[0] == "query") { + return is_fast ? query(args) : query(args); + } else if (args[0] == "bench") { + return is_fast ? bench(args) : bench(args); + } + + show_usage(std::cerr); + return 1; +} diff --git a/src/xcdatBasics.hpp b/src/xcdatBasics.hpp new file mode 100644 index 0000000..c7da827 --- /dev/null +++ b/src/xcdatBasics.hpp @@ -0,0 +1,145 @@ +#ifndef XCDAT_BASICS_HPP_ +#define XCDAT_BASICS_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace xcdat { + +constexpr auto kNotFound = static_cast(-1); + +// For Base and Check arrays +constexpr uint32_t kRootId = 0; +constexpr uint32_t kTabooId = 1; +constexpr uint32_t kBcUpper = (1U << 31) - 1; + +// For builder +struct BcItem { + uint32_t base : 31; + bool is_leaf : 1; + uint32_t check : 31; + bool is_used : 1; +}; + +struct CharRange { + using Type = const uint8_t*; + + Type begin = nullptr; + Type end = nullptr; + + CharRange() {} + CharRange(const std::string& string) + : CharRange{string.c_str(), string.c_str() + string.length()} {} + CharRange(const char* b, const char* e) + : begin{reinterpret_cast(b)}, end{reinterpret_cast(e)} {} + CharRange(Type b, Type e) : begin{b}, end{e} {} + + size_t length() const { return static_cast(end - begin); } +}; + +inline bool operator==(const CharRange& lhs, const CharRange& rhs) { + if (lhs.length() != rhs.length()) { + return false; + } + return std::equal(lhs.begin, lhs.end, rhs.begin); +} + +inline bool operator!=(const CharRange& lhs, const CharRange& rhs) { + return !(lhs == rhs); +} + +inline bool operator<(const CharRange& lhs, const CharRange& rhs) { + return std::lexicographical_compare(lhs.begin, lhs.end, rhs.begin, rhs.end); +} + +inline std::ostream& operator<<(std::ostream& os, const CharRange& string) { + for (auto it = 
string.begin; it != string.end; ++it) { + os << char(*it); + } + return os; +} + +#define XCDAT_THROW(message) \ + throw Exception(message, __FILE__, __func__, __LINE__) + +class Exception : public std::exception { +public: + Exception(const char* message, const char* file_name, + const char* func_name, const int line) + : message_{message}, file_name_{file_name}, func_name_{func_name}, line_{line} {} + virtual ~Exception() throw() {} + + virtual const char* what() const throw() override { + return message_.c_str(); + } + + const char* file_name() const { return file_name_; } + const char* func_name() const { return func_name_; } + int line() const { return line_; } + +private: + std::string message_; + const char* file_name_ = nullptr; + const char* func_name_ = nullptr; + int line_ = 0; +}; + +namespace util { + +template +inline size_t size_in_bytes(const std::vector& vec) { + static_assert(!std::is_same::value, "no support type"); + return vec.size() * sizeof(T) + sizeof(vec.size()); +} + +inline void show_stat(const char* str, double size, std::ostream& os) { + os << str << "\t" << size << std::endl; +} + +inline void show_stat(const char* str, size_t size, std::ostream& os) { + os << str << "\t" << size << std::endl; +} + +inline void show_stat(const char* str, size_t size, size_t denom, std::ostream& os) { + os << str << "\t" << size << "\t" << (double) size / denom << std::endl; +} + +template +inline void write_value(const T val, std::ostream& os) { + os.write(reinterpret_cast(&val), sizeof(val)); +} + +template +inline void write_vector(const std::vector& vec, std::ostream& os) { + static_assert(!std::is_same::value, "no support type"); + auto size = vec.size(); + write_value(size, os); + os.write(reinterpret_cast(&vec[0]), sizeof(T) * size); +} + +template +inline void read_value(T& val, std::istream& is) { + is.read(reinterpret_cast(&val), sizeof(val)); +} + +template +inline void read_vector(std::vector& vec, std::istream& is) { + 
static_assert(!std::is_same::value, "no support type"); + vec.clear(); + size_t size = 0; + read_value(size, is); + vec.resize(size); + is.read(reinterpret_cast(&vec[0]), sizeof(T) * size); +} + +} //namespace - util + +} //namespace - xcdat + +#endif //XCDAT_BASICS_HPP_