xcdat/tool/xcdat.cpp

305 lines
6.8 KiB
C++
Raw Normal View History

2016-12-03 07:51:00 +00:00
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include "xcdat.hpp"
2016-12-03 07:51:00 +00:00
using namespace xcdat;
namespace {
2017-11-12 11:49:13 +00:00
// Number of passes each benchmark loop makes over its input; the reported
// per-item timings are divided by this count.
constexpr int RUNS{10};
2016-12-03 07:51:00 +00:00
// Measures the time elapsed since the object was constructed.
//
// The start point is captured once in the constructor. Every accessor reports
// the time from that point up to the moment of the call, so the same object
// can be queried repeatedly.
class StopWatch {
 public:
  // The alias keeps its historical name so existing uses of StopWatch::hrc
  // still compile, but it now names a monotonic clock:
  // std::chrono::high_resolution_clock is permitted to be an alias of
  // system_clock, whose readings can jump when the wall clock is adjusted,
  // which would distort the measured intervals.
  using hrc = std::chrono::steady_clock;

  StopWatch() : tp_{hrc::now()} {}

  // Elapsed time in seconds.
  double sec() const {
    return elapsed_as<std::chrono::duration<double>>();
  }

  // Elapsed time in milliseconds.
  double milli_sec() const {
    return elapsed_as<std::chrono::duration<double, std::milli>>();
  }

  // Elapsed time in microseconds.
  double micro_sec() const {
    return elapsed_as<std::chrono::duration<double, std::micro>>();
  }

 private:
  // Converts "now - start" into the requested floating-point duration type
  // and returns its tick count.
  template <class Duration>
  double elapsed_as() const {
    return Duration(hrc::now() - tp_).count();
  }

  hrc::time_point tp_;  // moment of construction
};
2017-03-29 06:01:06 +00:00
// Reads a text file line by line and appends every line to `keys`.
//
// file_name: path of the file to read.
// keys:      destination vector; existing entries are kept and each line is
//            appended without its line terminator.
//
// Returns the summed length of the appended lines, counting one extra byte
// per line for a terminator. Returns 0 when the file cannot be opened; a file
// with no lines yields 0 as well.
size_t read_keys(const char* file_name, std::vector<std::string>& keys) {
  std::ifstream input{file_name};
  size_t total_bytes = 0;

  if (input) {
    std::string current;
    while (std::getline(input, current)) {
      total_bytes += current.length() + 1;  // with terminator
      keys.push_back(current);
    }
  }

  return total_bytes;
}
2016-12-03 07:51:00 +00:00
2017-11-12 11:49:13 +00:00
// Builds a vector of non-owning views, one per element of `keys` and in the
// same order. The views point into the strings held by `keys`, so `keys`
// must stay alive and unmodified for as long as the result is used.
std::vector<std::string_view>
extract_views(const std::vector<std::string>& keys) {
  std::vector<std::string_view> views;
  views.reserve(keys.size());
  for (const std::string& key : keys) {
    views.push_back(key);
  }
  return views;
}
// Writes the command-line help for the build, query and bench sub-commands
// to `os` and flushes the stream.
void show_usage(std::ostream& os) {
  static constexpr const char* kUsage =
      "xcdat build <type> <key> <dict>\n"
      "\t<type>\t'1' for DACs; '2' for FDACs.\n"
      "\t<key> \tinput file of a set of keys.\n"
      "\t<dict>\toutput file of the dictionary.\n"
      "xcdat query <type> <dict> <limit>\n"
      "\t<type> \t'1' for DACs; '2' for FDACs.\n"
      "\t<dict> \tinput file of the dictionary.\n"
      "\t<limit>\tlimit at lookup (default=10).\n"
      "xcdat bench <type> <dict> <key>\n"
      "\t<type>\t'1' for DACs; '2' for FDACs.\n"
      "\t<dict>\tinput file of the dictionary.\n"
      "\t<key> \tinput file of keys for benchmark.\n";

  os << kUsage;
  os.flush();
}
template<bool Fast>
int build(std::vector<std::string>& args) {
if (args.size() != 4) {
show_usage(std::cerr);
return 1;
}
2017-11-12 11:49:13 +00:00
std::vector<std::string> keys_buffer;
auto raw_size = read_keys(args[2].c_str(), keys_buffer);
2017-03-29 06:01:06 +00:00
if (raw_size == 0) {
2016-12-03 07:51:00 +00:00
std::cerr << "open error : " << args[2] << std::endl;
return 1;
}
2017-11-12 11:49:13 +00:00
auto keys = extract_views(keys_buffer);
2016-12-03 07:51:00 +00:00
Trie<Fast> trie;
try {
StopWatch sw;
2017-07-12 06:48:49 +00:00
trie = TrieBuilder::build<Fast>(keys);
2017-11-04 14:06:08 +00:00
std::cout << "constr. time: " << sw.sec() << " sec" << std::endl;
2017-03-29 06:01:06 +00:00
} catch (const xcdat::TrieBuilder::Exception& ex) {
std::cerr << ex.what() << std::endl;
2016-12-03 07:51:00 +00:00
return 1;
}
2017-11-12 11:49:13 +00:00
std::cout << "cmpr. ratio: "
<< static_cast<double>(trie.size_in_bytes()) / raw_size
<< std::endl;
2016-12-03 07:51:00 +00:00
trie.show_stat(std::cout);
{
2017-11-12 11:49:13 +00:00
std::ofstream ofs{args[3] + (Fast ? ".fdac" : ".dac")};
2016-12-03 07:51:00 +00:00
if (!ofs) {
std::cerr << "open error : " << args[3] << std::endl;
return 1;
}
trie.write(ofs);
}
return 0;
}
// Prints up to `limit` results from `it` as "<id>\t<key>" lines, then drains
// the iterator to count the remaining results. Prints "and more..." when any
// result was not shown, followed by the total number found.
template <class Iterator>
void print_query_results(Iterator&& it, size_t limit) {
  size_t shown = 0;
  while (shown < limit && it.next()) {
    std::cout << it.id() << '\t' << it.key() << std::endl;
    ++shown;
  }

  size_t hidden = 0;
  while (it.next()) {
    ++hidden;
  }

  if (hidden != 0) {
    std::cout << "and more..." << std::endl;
  }
  std::cout << shown + hidden << " found" << std::endl;
}

// Implements `xcdat query <type> <dict> [<limit>]`.
//
// Loads a Trie<Fast> from the file named by args[2] and then reads query
// strings from stdin, one per line. For each query it prints the exact
// lookup result, the common-prefix results and the predictive results. The
// two iterator-based listings show at most `limit` entries (10 unless
// args[3] is given). The session ends on an empty line or at end of input.
//
// args: the command-line words after the program name; three or four are
//       expected (command, type, dictionary file, optional limit).
//
// Returns 0 when the session ends normally and 1 on a usage error, an
// unreadable dictionary, or a limit that is not a valid unsigned number.
template <bool Fast>
int query(std::vector<std::string>& args) {
  if (args.size() != 3 && args.size() != 4) {
    show_usage(std::cerr);
    return 1;
  }

  Trie<Fast> trie;
  {
    // Binary mode: the dictionary is presumably stored as raw bytes, which a
    // text-mode stream would alter on platforms that translate line endings.
    std::ifstream ifs(args[2], std::ios::binary);
    if (!ifs) {
      std::cerr << "open error : " << args[2] << std::endl;
      return 1;
    }
    trie = Trie<Fast>(ifs);
  }

  size_t limit = 10;
  if (args.size() == 4) {
    try {
      limit = std::stoull(args.back());
    } catch (const std::logic_error&) {
      // std::stoull throws std::invalid_argument or std::out_of_range, both
      // derived from std::logic_error; report them instead of terminating.
      std::cerr << "invalid limit : " << args.back() << std::endl;
      show_usage(std::cerr);
      return 1;
    }
  }

  std::string input;
  while (true) {
    // Write the prompt as a string: a multi-character literal such as '> '
    // passed to putchar() emits only a single byte.
    std::cout << "> " << std::flush;

    // Stop on end of input as well as on an empty line. A failed getline may
    // leave the previous text in `input`, so its result has to be checked or
    // the loop would never terminate.
    if (!std::getline(std::cin, input) || input.empty()) {
      break;
    }

    std::cout << "Lookup" << std::endl;
    const auto id = trie.lookup(input);
    if (id == Trie<Fast>::NOT_FOUND) {
      std::cout << "not found" << std::endl;
    } else {
      std::cout << id << '\t' << input << std::endl;
    }

    std::cout << "Common Prefix Lookup" << std::endl;
    print_query_results(trie.make_prefix_iterator(input), limit);

    std::cout << "Predictive Lookup" << std::endl;
    print_query_results(trie.make_predictive_iterator(input), limit);
  }

  return 0;
}
template<bool Fast>
int bench(std::vector<std::string>& args) {
if (args.size() != 4) {
show_usage(std::cerr);
return 1;
}
Trie<Fast> trie;
{
2017-03-29 06:01:06 +00:00
std::ifstream ifs(args[2]);
2016-12-03 07:51:00 +00:00
if (!ifs) {
std::cerr << "open error : " << args[2] << std::endl;
return 1;
}
2017-07-12 06:48:49 +00:00
trie = Trie<Fast>(ifs);
2016-12-03 07:51:00 +00:00
}
2017-11-12 11:49:13 +00:00
std::vector<std::string> keys_buffer;
if (read_keys(args[3].c_str(), keys_buffer) == 0) {
2016-12-03 07:51:00 +00:00
std::cerr << "open error : " << args[3] << std::endl;
return 1;
}
2017-11-12 11:49:13 +00:00
auto keys = extract_views(keys_buffer);
2016-12-03 07:51:00 +00:00
2017-07-12 06:48:49 +00:00
std::vector<id_type> ids(keys.size());
for (size_t i = 0; i < keys.size(); ++i) {
2017-11-12 11:49:13 +00:00
ids[i] = trie.lookup(keys[i]);
2016-12-03 07:51:00 +00:00
}
{
2017-11-12 11:49:13 +00:00
std::cout << "Lookup benchmark on " << RUNS << " runs" << std::endl;
2016-12-03 07:51:00 +00:00
StopWatch sw;
2017-11-12 11:49:13 +00:00
for (uint32_t r = 0; r < RUNS; ++r) {
2017-07-12 06:48:49 +00:00
for (size_t i = 0; i < keys.size(); ++i) {
2017-11-12 11:49:13 +00:00
if (trie.lookup(keys[i]) == Trie<Fast>::NOT_FOUND) {
std::cerr << "Failed to lookup " << keys_buffer[i] << std::endl;
2016-12-03 07:51:00 +00:00
return 1;
}
}
}
2017-11-12 11:49:13 +00:00
std::cout << sw.micro_sec() / RUNS / keys.size()
<< " us per str" << std::endl;
2016-12-03 07:51:00 +00:00
}
{
2017-11-12 11:49:13 +00:00
std::cout << "Access benchmark on " << RUNS << " runs" << std::endl;
2016-12-03 07:51:00 +00:00
StopWatch sw;
2017-11-12 11:49:13 +00:00
for (uint32_t r = 0; r < RUNS; ++r) {
for (auto id : ids) {
auto dec = trie.access(id);
if (dec.empty()) {
std::cerr << "Failed to access " << id << std::endl;
2016-12-03 07:51:00 +00:00
return 1;
}
}
}
2017-11-12 11:49:13 +00:00
std::cout << sw.micro_sec() / RUNS / ids.size()
<< " us per ID" << std::endl;
2016-12-03 07:51:00 +00:00
}
return 0;
}
} // namespace
// Entry point: `xcdat <command> <type> ...`.
//
// Collects the words after the program name, picks the trie variant from the
// first character of <type> ('1' -> Trie<false>, '2' -> Trie<true>) and hands
// the argument list to build, query or bench. Anything else prints the usage
// text to stderr and exits with status 1.
int main(int argc, const char* argv[]) {
  if (argc < 3) {
    show_usage(std::cerr);
    return 1;
  }

  // argv[0] is the program name; keep only the words that follow it.
  std::vector<std::string> args(argv + 1, argv + argc);

  const char type = args[1][0];
  if (type != '1' && type != '2') {
    show_usage(std::cerr);
    return 1;
  }
  const bool is_fast = (type == '2');

  const std::string& command = args[0];
  if (command == "build") {
    return is_fast ? build<true>(args) : build<false>(args);
  }
  if (command == "query") {
    return is_fast ? query<true>(args) : query<false>(args);
  }
  if (command == "bench") {
    return is_fast ? bench<true>(args) : bench<false>(args);
  }

  show_usage(std::cerr);
  return 1;
}