xcdat/tests/test_trie.cpp

327 lines
11 KiB
C++
Raw Normal View History

2021-06-26 14:48:29 +00:00
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "doctest/doctest.h"
#include "mm_file/mm_file.hpp"
#include "test_common.hpp"
#include "xcdat.hpp"
2021-06-29 01:09:49 +00:00
// Selects the trie variant exercised by this translation unit.
// One of TRIE_7 / TRIE_8 / TRIE_15 / TRIE_16 must be defined by the build;
// if none is, neither `trie_type` nor `TRIE_NAME` exists and the rest of the
// file will not compile. Every branch uses `defined()` so that a macro defined
// with an empty value (or with 0) is still recognised.
// TRIE_NAME is spliced into the test-case titles below.
#if defined(TRIE_7)
using trie_type = xcdat::trie_7_type;
#define TRIE_NAME "xcdat::trie_7_type"
#elif defined(TRIE_8)
using trie_type = xcdat::trie_8_type;
#define TRIE_NAME "xcdat::trie_8_type"
#elif defined(TRIE_15)
using trie_type = xcdat::trie_15_type;
#define TRIE_NAME "xcdat::trie_15_type"
#elif defined(TRIE_16)
using trie_type = xcdat::trie_16_type;
#define TRIE_NAME "xcdat::trie_16_type"
#endif
2021-06-27 04:12:35 +00:00
2021-07-08 13:47:59 +00:00
// Reads the file at `filepath` and returns its contents split on `delim`
// (one element per delimited record, delimiter not included).
// Fails through XCDAT_THROW_IF when the stream cannot be opened.
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream ifs(filepath);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::vector<std::string> strs;
    for (std::string str; std::getline(ifs, str, delim);) {
        // std::getline() erases `str` before extracting, so reusing the
        // moved-from buffer on the next iteration is well defined.
        strs.push_back(std::move(str));
    }
    return strs;
}
2021-06-27 04:12:35 +00:00
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
2021-06-27 03:57:34 +00:00
const std::vector<std::string>& others) {
REQUIRE_EQ(trie.num_keys(), keys.size());
REQUIRE_EQ(trie.max_length(), xcdat::test::max_length(keys));
2021-06-26 22:40:15 +00:00
for (std::uint64_t i = 0; i < keys.size(); i++) {
2021-06-26 14:48:29 +00:00
auto id = trie.lookup(keys[i]);
REQUIRE(id.has_value());
REQUIRE_LT(id.value(), keys.size());
2021-06-27 03:57:34 +00:00
auto decoded = trie.decode(id.value());
2021-06-26 14:48:29 +00:00
REQUIRE_EQ(keys[i], decoded);
}
2021-06-26 22:40:15 +00:00
for (std::uint64_t i = 0; i < others.size(); i++) {
2021-06-26 14:48:29 +00:00
auto id = trie.lookup(others[i]);
REQUIRE_FALSE(id.has_value());
}
}
2021-06-27 04:12:35 +00:00
void test_prefix_search(const trie_type& trie, const std::vector<std::string>& keys,
2021-07-08 14:48:58 +00:00
const std::vector<std::string>& queries) {
for (auto& query : queries) {
std::vector<std::string> results;
for (auto itr = trie.make_prefix_iterator(query); itr.next();) {
2021-06-27 03:57:34 +00:00
const auto id = itr.id();
const auto decoded = itr.decoded_view();
2021-07-08 14:48:58 +00:00
REQUIRE_LE(decoded.size(), query.size());
2021-06-27 03:57:34 +00:00
REQUIRE_EQ(id, trie.lookup(decoded));
REQUIRE_EQ(decoded, trie.decode(id));
2021-07-08 14:48:58 +00:00
results.push_back(itr.decoded());
2021-06-27 03:57:34 +00:00
}
2021-07-08 14:48:58 +00:00
auto naive_results = xcdat::test::prefix_search_naive(keys, query);
REQUIRE_EQ(results.size(), naive_results.size());
2021-06-27 03:57:34 +00:00
2021-07-08 14:48:58 +00:00
for (std::size_t i = 0; i < results.size(); i++) {
REQUIRE_EQ(results[i], naive_results[i]);
2021-06-27 03:57:34 +00:00
}
}
}
2021-06-27 04:12:35 +00:00
void test_predictive_search(const trie_type& trie, const std::vector<std::string>& keys,
2021-07-08 14:48:58 +00:00
const std::vector<std::string>& queries) {
for (auto& query : queries) {
std::string_view query_view{query.c_str(), query.size() / 3 + 1};
2021-06-27 03:57:34 +00:00
2021-07-08 14:48:58 +00:00
std::vector<std::string> results;
for (auto itr = trie.make_predictive_iterator(query_view); itr.next();) {
2021-06-27 03:57:34 +00:00
const auto id = itr.id();
const auto decoded = itr.decoded_view();
2021-07-08 14:48:58 +00:00
REQUIRE_LE(query_view.size(), decoded.size());
2021-06-27 03:57:34 +00:00
REQUIRE_EQ(id, trie.lookup(decoded));
REQUIRE_EQ(decoded, trie.decode(id));
2021-07-08 14:48:58 +00:00
results.push_back(itr.decoded());
2021-06-27 03:57:34 +00:00
}
2021-07-08 14:48:58 +00:00
auto naive_results = xcdat::test::predictive_search_naive(keys, query_view);
REQUIRE_EQ(results.size(), naive_results.size());
2021-06-27 03:57:34 +00:00
2021-07-08 14:48:58 +00:00
for (std::size_t i = 0; i < results.size(); i++) {
REQUIRE_EQ(results[i], naive_results[i]);
2021-06-27 03:57:34 +00:00
}
}
}
2021-06-27 04:12:35 +00:00
void test_enumerate(const trie_type& trie, const std::vector<std::string>& keys) {
2021-06-27 03:57:34 +00:00
auto itr = trie.make_enumerative_iterator();
for (auto& key : keys) {
REQUIRE(itr.next());
REQUIRE_EQ(itr.decoded_view(), key);
REQUIRE_EQ(itr.id(), trie.lookup(key));
}
REQUIRE_FALSE(itr.next());
}
2021-06-29 01:09:49 +00:00
void test_io(const trie_type& trie, const std::vector<std::string>& keys, const std::vector<std::string>& others) {
const char* tmp_filepath = "tmp.idx";
const std::uint64_t memory = xcdat::memory_in_bytes(trie);
REQUIRE_EQ(memory, xcdat::save(trie, tmp_filepath));
{
const auto loaded = xcdat::load<trie_type>(tmp_filepath);
REQUIRE_EQ(trie.bin_mode(), loaded.bin_mode());
REQUIRE_EQ(trie.num_keys(), loaded.num_keys());
REQUIRE_EQ(trie.alphabet_size(), loaded.alphabet_size());
REQUIRE_EQ(trie.max_length(), loaded.max_length());
2021-07-03 00:46:04 +00:00
REQUIRE_EQ(trie.num_nodes(), loaded.num_nodes());
REQUIRE_EQ(trie.num_units(), loaded.num_units());
REQUIRE_EQ(trie.num_free_units(), loaded.num_free_units());
REQUIRE_EQ(trie.tail_length(), loaded.tail_length());
2021-06-29 01:09:49 +00:00
REQUIRE_EQ(memory, xcdat::memory_in_bytes(loaded));
test_basic_operations(loaded, keys, others);
}
{
mm::file_source<char> fin(tmp_filepath, mm::advice::sequential);
const auto mapped = xcdat::mmap<trie_type>(fin.data());
REQUIRE_EQ(trie.bin_mode(), mapped.bin_mode());
REQUIRE_EQ(trie.num_keys(), mapped.num_keys());
REQUIRE_EQ(trie.alphabet_size(), mapped.alphabet_size());
REQUIRE_EQ(trie.max_length(), mapped.max_length());
2021-07-03 00:46:04 +00:00
REQUIRE_EQ(trie.num_nodes(), mapped.num_nodes());
REQUIRE_EQ(trie.num_units(), mapped.num_units());
REQUIRE_EQ(trie.num_free_units(), mapped.num_free_units());
REQUIRE_EQ(trie.tail_length(), mapped.tail_length());
2021-06-29 01:09:49 +00:00
REQUIRE_EQ(memory, xcdat::memory_in_bytes(mapped));
test_basic_operations(mapped, keys, others);
}
std::remove(tmp_filepath);
}
2021-07-11 04:06:23 +00:00
// Small hand-written key set with known answers for each iterator kind.
TEST_CASE("Test " TRIE_NAME " (tiny)") {
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Air", "MacBook_Pro",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
    };
    std::vector<std::string> others = {
        "Google_Pixel", "iPad_mini", "iPadOS", "iPod", "ThinkPad",
    };

    trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());
    test_basic_operations(trie, keys, others);

    // Drains `itr` and requires that it yields exactly `expected`, in order,
    // with ids matching lookup(), and nothing afterwards.
    const auto expect_sequence = [&trie](auto&& itr, const std::vector<std::string>& expected) {
        for (const std::string& exp : expected) {
            REQUIRE(itr.next());
            REQUIRE_EQ(itr.decoded(), exp);
            REQUIRE_EQ(itr.id(), trie.lookup(exp));
        }
        REQUIRE_FALSE(itr.next());
    };

    // Keys that are prefixes of the query.
    expect_sequence(trie.make_prefix_iterator("MacBook_Pro_13inch"), {"Mac", "MacBook", "MacBook_Pro"});

    // Keys that start with the query.
    expect_sequence(trie.make_predictive_iterator("MacBook"), {"MacBook", "MacBook_Air", "MacBook_Pro"});

    // Full enumeration must reproduce the input list.
    expect_sequence(trie.make_enumerative_iterator(), keys);

    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Construction must be rejected when the keys are out of order:
// here "MacBook_Pro" is listed before "MacBook_Air".
TEST_CASE("Test " TRIE_NAME " (unsort)") {
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook_Pro", "MacBook_Air",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",      "iPhone_SE",
    };

    const auto build = [&keys]() { trie_type trie(keys); };
    REQUIRE_THROWS_AS(build(), const xcdat::exception&);
}
// Construction must be rejected when a key is repeated:
// here "MacBook" appears twice.
TEST_CASE("Test " TRIE_NAME " (not unique)") {
    std::vector<std::string> keys = {
        "AirPods",  "AirTag",  "Mac",  "MacBook", "MacBook", "MacBook_Pro",
        "Mac_Mini", "Mac_Pro", "iMac", "iPad",    "iPhone",  "iPhone_SE",
    };

    const auto build = [&keys]() { trie_type trie(keys); };
    REQUIRE_THROWS_AS(build(), const xcdat::exception&);
}
// Full battery of checks on the keys stored in "keys.txt".
TEST_CASE("Test " TRIE_NAME " (real)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(load_strings("keys.txt"));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 100);

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Full battery of checks on random keys drawn from the two-letter range 'A'..'B';
// the trie is expected not to be in binary mode.
TEST_CASE("Test " TRIE_NAME " (random 10K, A--B)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'B'));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 100);

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Full battery of checks on random keys drawn from the range 'A'..'Z';
// the trie is expected not to be in binary mode.
TEST_CASE("Test " TRIE_NAME " (random 10K, A--Z)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, 'A', 'Z'));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 100);

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Full battery of checks on random keys whose characters span INT8_MIN..INT8_MAX;
// unlike the letter-only cases, the trie is expected to be in binary mode.
TEST_CASE("Test " TRIE_NAME " (random 10K, 0x00--0xFF)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(10000, 1, 30, INT8_MIN, INT8_MAX));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 100);

    const trie_type trie(keys);
    REQUIRE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-06-29 00:06:40 +00:00
#ifdef NDEBUG
2021-07-11 04:06:23 +00:00
// Larger variant of the 'A'..'B' random test (100000 generated keys, 1000 queries);
// compiled only when NDEBUG is defined.
TEST_CASE("Test " TRIE_NAME " (random 100K, A--B)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'B'));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 1000);

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Larger variant of the 'A'..'Z' random test (100000 generated keys, 1000 queries);
// compiled only when NDEBUG is defined.
TEST_CASE("Test " TRIE_NAME " (random 100K, A--Z)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, 'A', 'Z'));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 1000);

    const trie_type trie(keys);
    REQUIRE_FALSE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
2021-07-11 04:06:23 +00:00
// Larger variant of the INT8_MIN..INT8_MAX random test (100000 generated keys,
// 1000 queries); compiled only when NDEBUG is defined. Binary mode is expected.
TEST_CASE("Test " TRIE_NAME " (random 100K, 0x00--0xFF)") {
    // Input preparation. The order of these three statements is kept as-is:
    // extract_keys() receives `keys` itself and may modify it (not visible here).
    auto keys = xcdat::test::to_unique_vec(xcdat::test::make_random_keys(100000, 1, 30, INT8_MIN, INT8_MAX));
    auto others = xcdat::test::extract_keys(keys);
    auto queries = xcdat::test::sample_keys(keys, 1000);

    const trie_type trie(keys);
    REQUIRE(trie.bin_mode());

    test_basic_operations(trie, keys, others);
    test_prefix_search(trie, keys, queries);
    test_predictive_search(trie, keys, queries);
    test_enumerate(trie, keys);
    test_io(trie, keys, others);
}
#endif