2016-12-03 07:51:00 +00:00
|
|
|
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>

#include "TrieBuilder.hpp"
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
using namespace xcdat;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// Number of times each timed loop in bench() is repeated.
constexpr uint32_t kRuns = 10;
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
// Key record handed to TrieBuilder; in this file it is filled from a byte
// pointer and a length (extract_pairs) and read back via `ptr` / `length`.
using Key = TrieBuilder::Key;
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
// Stopwatch that starts ticking at construction and reports the time elapsed
// since then in seconds, milliseconds or microseconds.
class StopWatch {
 public:
  // Clock used for all measurements. The alias keeps its historical name
  // `hrc`, but it now refers to steady_clock: high_resolution_clock is not
  // guaranteed to be monotonic (on some standard libraries it is an alias of
  // system_clock), so an adjustment of the system time could distort or even
  // negate a measured interval.
  using hrc = std::chrono::steady_clock;

  // Records the starting time point.
  StopWatch() : tp_{hrc::now()} {}

  // Elapsed time since construction, in seconds.
  double sec() const {
    return elapsed<std::ratio<1>>();
  }

  // Elapsed time since construction, in milliseconds.
  double milli_sec() const {
    return elapsed<std::milli>();
  }

  // Elapsed time since construction, in microseconds.
  double micro_sec() const {
    return elapsed<std::micro>();
  }

 private:
  // Elapsed time expressed as a floating-point count of `Period` units.
  template <class Period>
  double elapsed() const {
    return std::chrono::duration<double, Period>(hrc::now() - tp_).count();
  }

  hrc::time_point tp_;
};
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
// Appends every non-empty line of the file `file_name` to `keys`.
//
// Returns the summed length of the appended lines, counting one extra byte per
// line for a terminator. The result is 0 when the file cannot be opened, and
// also when it contains no non-empty line.
size_t read_keys(const char* file_name, std::vector<std::string>& keys) {
  std::ifstream input(file_name);
  if (!input) {
    return 0;
  }

  size_t total_bytes = 0;
  for (std::string text; std::getline(input, text);) {
    if (text.empty()) {
      continue;  // blank lines are not keys
    }
    keys.push_back(text);
    total_bytes += text.size() + 1;  // +1 for the terminator
  }
  return total_bytes;
}
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
void extract_pairs(const std::vector<std::string>& keys, std::vector<Key>& pairs) {
|
2017-03-29 06:01:06 +00:00
|
|
|
pairs.clear();
|
|
|
|
pairs.resize(keys.size());
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
pairs[i] = {reinterpret_cast<const uint8_t*>(keys[i].c_str()), keys[i].length()};
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Writes the command-line help for the build, query and bench sub-commands to
// `os`, then flushes the stream.
void show_usage(std::ostream& os) {
  static const char* const kUsageLines[] = {
      "xcdat build <type> <key> <dict>\n",
      "\t<type>\t'1' for DACs; '2' for FDACs.\n",
      "\t<key> \tinput file of a set of keys.\n",
      "\t<dict>\toutput file of the dictionary.\n",
      "xcdat query <type> <dict> <limit>\n",
      "\t<type> \t'1' for DACs; '2' for FDACs.\n",
      "\t<dict> \tinput file of the dictionary.\n",
      "\t<limit>\tlimit at lookup (default=10).\n",
      "xcdat bench <type> <dict> <key>\n",
      "\t<type>\t'1' for DACs; '2' for FDACs.\n",
      "\t<dict>\tinput file of the dictionary.\n",
      "\t<key> \tinput file of keys for benchmark.\n",
  };

  for (const char* text : kUsageLines) {
    os << text;
  }
  os.flush();
}
|
|
|
|
|
|
|
|
template<bool Fast>
|
|
|
|
int build(std::vector<std::string>& args) {
|
|
|
|
if (args.size() != 4) {
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
std::vector<std::string> strs;
|
|
|
|
auto raw_size = read_keys(args[2].c_str(), strs);
|
2017-03-29 06:01:06 +00:00
|
|
|
|
|
|
|
if (raw_size == 0) {
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cerr << "open error : " << args[2] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
std::vector<Key> keys;
|
|
|
|
extract_pairs(strs, keys);
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
Trie<Fast> trie;
|
|
|
|
try {
|
|
|
|
StopWatch sw;
|
2017-07-12 06:48:49 +00:00
|
|
|
trie = TrieBuilder::build<Fast>(keys);
|
2017-11-04 14:06:08 +00:00
|
|
|
std::cout << "constr. time: " << sw.sec() << " sec" << std::endl;
|
2017-03-29 06:01:06 +00:00
|
|
|
} catch (const xcdat::TrieBuilder::Exception& ex) {
|
|
|
|
std::cerr << ex.what() << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
std::cout << "cmpr. ratio: " << (double) trie.size_in_bytes() / raw_size << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
trie.show_stat(std::cout);
|
|
|
|
|
|
|
|
{
|
2017-03-29 06:01:06 +00:00
|
|
|
std::ofstream ofs(args[3]);
|
2016-12-03 07:51:00 +00:00
|
|
|
if (!ofs) {
|
|
|
|
std::cerr << "open error : " << args[3] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
trie.write(ofs);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<bool Fast>
|
|
|
|
int query(std::vector<std::string>& args) {
|
|
|
|
if (args.size() != 3 && args.size() != 4) {
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Trie<Fast> trie;
|
|
|
|
{
|
2017-03-29 06:01:06 +00:00
|
|
|
std::ifstream ifs(args[2]);
|
2016-12-03 07:51:00 +00:00
|
|
|
if (!ifs) {
|
|
|
|
std::cerr << "open error : " << args[2] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
2017-07-12 06:48:49 +00:00
|
|
|
trie = Trie<Fast>(ifs);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t limit = 10;
|
|
|
|
if (args.size() == 4) {
|
|
|
|
limit = std::stoull(args.back());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string query;
|
2017-03-29 06:01:06 +00:00
|
|
|
std::vector<id_type> ids;
|
|
|
|
std::vector<uint8_t> buf;
|
2016-12-03 07:51:00 +00:00
|
|
|
|
|
|
|
while (true){
|
|
|
|
putchar('>');
|
|
|
|
getline(std::cin, query);
|
2017-11-04 14:06:08 +00:00
|
|
|
if (query.empty()){
|
2016-12-03 07:51:00 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2017-03-29 06:01:06 +00:00
|
|
|
auto key = reinterpret_cast<const uint8_t*>(query.c_str());
|
|
|
|
auto length = query.size();
|
|
|
|
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cout << "lookup()" << std::endl;
|
2017-03-29 06:01:06 +00:00
|
|
|
auto id = trie.lookup(key, length);
|
2016-12-03 07:51:00 +00:00
|
|
|
if (id == kNotFound) {
|
|
|
|
std::cout << "not found" << std::endl;
|
|
|
|
} else {
|
|
|
|
std::cout << id << '\t' << query << std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::cout << "common_prefix_lookup()" << std::endl;
|
|
|
|
ids.clear();
|
2017-03-29 06:01:06 +00:00
|
|
|
trie.common_prefix_lookup(key, length, ids);
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cout << ids.size() << " found" << std::endl;
|
2017-07-12 06:48:49 +00:00
|
|
|
|
2016-12-03 07:51:00 +00:00
|
|
|
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
|
2017-03-29 06:01:06 +00:00
|
|
|
buf.clear();
|
|
|
|
trie.access(ids[i], buf);
|
2017-07-12 06:48:49 +00:00
|
|
|
std::cout << ids[i] << '\t';
|
|
|
|
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
|
|
|
|
std::cout << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::cout << "predictive_lookup()" << std::endl;
|
|
|
|
ids.clear();
|
2017-03-29 06:01:06 +00:00
|
|
|
trie.predictive_lookup(key, length, ids);
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cout << ids.size() << " found" << std::endl;
|
2017-07-12 06:48:49 +00:00
|
|
|
|
2016-12-03 07:51:00 +00:00
|
|
|
for (size_t i = 0; i < std::min(ids.size(), limit); ++i) {
|
2017-03-29 06:01:06 +00:00
|
|
|
buf.clear();
|
|
|
|
trie.access(ids[i], buf);
|
2017-07-12 06:48:49 +00:00
|
|
|
std::cout << ids[i] << '\t';
|
|
|
|
std::cout.write(reinterpret_cast<const char*>(buf.data()), buf.size());
|
|
|
|
std::cout << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<bool Fast>
|
|
|
|
int bench(std::vector<std::string>& args) {
|
|
|
|
if (args.size() != 4) {
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Trie<Fast> trie;
|
|
|
|
{
|
2017-03-29 06:01:06 +00:00
|
|
|
std::ifstream ifs(args[2]);
|
2016-12-03 07:51:00 +00:00
|
|
|
if (!ifs) {
|
|
|
|
std::cerr << "open error : " << args[2] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
2017-07-12 06:48:49 +00:00
|
|
|
trie = Trie<Fast>(ifs);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
std::vector<std::string> strs;
|
|
|
|
if (read_keys(args[3].c_str(), strs) == 0) {
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cerr << "open error : " << args[3] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
std::vector<Key> keys;
|
|
|
|
extract_pairs(strs, keys);
|
2016-12-03 07:51:00 +00:00
|
|
|
|
2017-07-12 06:48:49 +00:00
|
|
|
std::vector<id_type> ids(keys.size());
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
ids[i] = trie.lookup(keys[i].ptr, keys[i].length);
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
std::cout << "Lookup benchmark on " << kRuns << " runs" << std::endl;
|
|
|
|
|
|
|
|
StopWatch sw;
|
|
|
|
for (uint32_t r = 0; r < kRuns; ++r) {
|
2017-07-12 06:48:49 +00:00
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
if (trie.lookup(keys[i].ptr, keys[i].length) == kNotFound) {
|
|
|
|
std::cerr << "Failed to lookup " << strs[i] << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-04 14:06:08 +00:00
|
|
|
std::cout << sw.micro_sec() / kRuns / keys.size() << " us per str" << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
std::cout << "Access benchmark on " << kRuns << " runs" << std::endl;
|
|
|
|
|
|
|
|
StopWatch sw;
|
|
|
|
for (uint32_t r = 0; r < kRuns; ++r) {
|
|
|
|
for (size_t i = 0; i < ids.size(); ++i) {
|
2017-03-29 06:01:06 +00:00
|
|
|
std::vector<uint8_t> key;
|
|
|
|
if (!trie.access(ids[i], key)) {
|
2016-12-03 07:51:00 +00:00
|
|
|
std::cerr << "Failed to access " << ids[i] << std::endl;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-04 14:06:08 +00:00
|
|
|
std::cout << sw.micro_sec() / kRuns / ids.size() << " us per ID" << std::endl;
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
int main(int argc, const char* argv[]) {
|
|
|
|
if (argc < 3) {
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> args;
|
|
|
|
for (int i = 1; i < argc; ++i) {
|
2017-11-04 14:06:08 +00:00
|
|
|
args.emplace_back(std::string{argv[i]});
|
2016-12-03 07:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool is_fast;
|
2017-11-04 14:06:08 +00:00
|
|
|
if (args[1][0] == '1') {
|
2016-12-03 07:51:00 +00:00
|
|
|
is_fast = false;
|
2017-11-04 14:06:08 +00:00
|
|
|
} else if (args[1][0] == '2') {
|
2016-12-03 07:51:00 +00:00
|
|
|
is_fast = true;
|
|
|
|
} else {
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (args[0] == "build") {
|
|
|
|
return is_fast ? build<true>(args) : build<false>(args);
|
|
|
|
} else if (args[0] == "query") {
|
|
|
|
return is_fast ? query<true>(args) : query<false>(args);
|
|
|
|
} else if (args[0] == "bench") {
|
|
|
|
return is_fast ? bench<true>(args) : bench<false>(args);
|
|
|
|
}
|
|
|
|
|
|
|
|
show_usage(std::cerr);
|
|
|
|
return 1;
|
|
|
|
}
|