This commit is contained in:
Shunsuke Kanda 2021-06-28 02:15:09 +09:00
parent 69235e1a87
commit eebe7ac0db
18 changed files with 1427 additions and 48 deletions

2
.gitignore vendored
View file

@ -29,7 +29,7 @@
*.app *.app
# My Definition # My Definition
build/ build*/
cmake-build-debug/ cmake-build-debug/
.idea/ .idea/
.DS_Store .DS_Store

View file

@ -33,9 +33,11 @@ message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")
include_directories(include) include_directories(include)
enable_testing()
add_subdirectory(test)
add_subdirectory(sample) add_subdirectory(sample)
add_subdirectory(tools)
file(COPY ${CMAKE_SOURCE_DIR}/test/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/test) enable_testing()
add_subdirectory(tests)
file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tests)

View file

@ -11,26 +11,4 @@ namespace xcdat {
using trie_7_type = trie<bc_vector_7>; using trie_7_type = trie<bc_vector_7>;
using trie_8_type = trie<bc_vector_8>; using trie_8_type = trie<bc_vector_8>;
template <class Trie, class Strings>
static Trie build(const Strings& keys, bool bin_mode = false) {
return Trie(trie_builder(keys, Trie::bc_vector_type::l1_bits, bin_mode));
}
template <class Trie>
static Trie load(std::string_view filename) {
Trie trie;
essentials::load(trie, filename.data());
return trie;
}
template <class Trie>
static std::uint64_t save(Trie& trie, std::string_view filename) {
return essentials::save(trie, filename.data());
}
template <class Trie>
static std::uint64_t get_memory_in_bytes(Trie& trie) {
return essentials::visit<Trie, essentials::sizer>(trie, "");
}
} // namespace xcdat } // namespace xcdat

View file

@ -26,6 +26,8 @@ class trie {
using this_type = trie<BcVector>; using this_type = trie<BcVector>;
using bc_vector_type = BcVector; using bc_vector_type = BcVector;
static constexpr auto l1_bits = bc_vector_type::l1_bits;
private: private:
std::uint64_t m_num_keys = 0; std::uint64_t m_num_keys = 0;
code_table m_table; code_table m_table;
@ -52,19 +54,23 @@ class trie {
//! Move constructor //! Move constructor
trie& operator=(trie&&) noexcept = default; trie& operator=(trie&&) noexcept = default;
template <class Strings>
explicit trie(trie_builder<Strings>&& b)
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
/**
* Build the trie dictionary from the input keywords.
* @param[in] key The query keyword.
* @return The associated ID if found.
*/
template <class Strings> template <class Strings>
static this_type build(const Strings& keys, bool bin_mode = false) { static this_type build(const Strings& keys, bool bin_mode = false) {
return this_type(trie_builder(keys, bc_vector_type::l1_bits, bin_mode)); return this_type(trie_builder(keys, l1_bits, bin_mode));
}
static this_type load(std::string_view filepath) {
this_type obj;
essentials::load(obj, filepath.data());
return obj;
}
std::uint64_t save(std::string_view filepath) const {
return essentials::save(const_cast<this_type&>(*this), filepath.data());
}
std::uint64_t memory_in_bytes() const {
return essentials::visit<this_type, essentials::sizer>(const_cast<this_type&>(*this), "");
} }
//! Check the binary mode. //! Check the binary mode.
@ -271,6 +277,11 @@ class trie {
} }
private: private:
template <class Strings>
explicit trie(trie_builder<Strings>&& b)
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
template <class String> template <class String>
static constexpr String get_suffix(const String& s, std::uint64_t i) { static constexpr String get_suffix(const String& s, std::uint64_t i) {
assert(i <= s.size()); assert(i <= s.size());

View file

@ -3,17 +3,28 @@
#include <xcdat.hpp> #include <xcdat.hpp>
using xcdat_trie = xcdat::trie_8_type;
int main() { int main() {
std::vector<std::string> keys = { std::vector<std::string> keys = {
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro", "AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
}; };
// The dataset must be sorted and unique. // The dataset must be sorted and unique (although it is not needed for the keys).
std::sort(keys.begin(), keys.end()); std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
auto trie = xcdat::build<xcdat::trie_8_type>(keys); const std::string index_filename = "tmp.idx";
// Build and save the trie index
{
const auto trie = xcdat_trie::build(keys);
trie.save(index_filename);
}
// Load the trie index
const auto trie = xcdat_trie::load(index_filename);
std::cout << "Basic operations" << std::endl; std::cout << "Basic operations" << std::endl;
{ {
@ -41,14 +52,7 @@ int main() {
} }
} }
std::string index_filename = "tmp.idx"; std::remove(index_filename.c_str());
std::cout << "mem: " << xcdat::save(trie, index_filename) << std::endl;
{
auto ohter = xcdat::load<xcdat::trie_8_type>(index_filename);
std::cout << "num_keys:" << ohter.num_keys() << std::endl;
std::cout << "mem: " << xcdat::get_memory_in_bytes(ohter) << std::endl;
}
return 0; return 0;
} }

1
tools/CMakeLists.txt Normal file
View file

@ -0,0 +1 @@
# Command-line tool target, compiled from the single source file xcdat_build.cpp.
add_executable(xcdat_build xcdat_build.cpp)

View file

@ -0,0 +1,158 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
namespace cmd_line_parser {
/**
 * Minimal command-line argument parser.
 *
 * Arguments are registered with add() before parse() is called:
 *  - Required arguments are positional; they must come first on the command line,
 *    in the order in which they were registered.
 *  - Optional arguments are selected by their shorthand (e.g. "-t") and follow the
 *    required ones. A boolean option takes no value and becomes "true" when present;
 *    any other option consumes the next token as its value.
 *
 * Parsed values are stored as strings and converted on demand by get<T>().
 */
struct parser {
    //! Sentinel meaning "no value" / "no shorthand".
    inline static const std::string empty = "";

    /**
     * @param[in] argc Number of entries in argv.
     * @param[in] argv Argument vector; the pointer is stored, not copied, so it must
     *                 outlive the parser.
     */
    parser(int argc, char** argv) : m_argc(argc), m_argv(argv), m_required(0) {}

    //! One registered argument and its current (string) value.
    struct cmd {
        std::string shorthand, value, descr;
        bool is_boolean;
    };

    /**
     * Parse the argument vector given at construction.
     * @return true on success; false (after printing the help text to stderr) if a
     *         required argument is missing, an unknown shorthand is found, a
     *         non-boolean option lacks its value, or -h/--help was given.
     */
    bool parse() {
        // Guard against argc <= 0 so that the subtraction cannot wrap around.
        const size_t num_given = m_argc > 0 ? size_t(m_argc - 1) : 0;
        if (num_given < m_required) return abort();

        size_t k = 0;  // index of the current logical argument (an option plus its value counts once)
        for (int i = 1; i < m_argc; ++i, ++k) {
            std::string parsed(m_argv[i]);
            if (parsed == "-h" or parsed == "--help") return abort();

            size_t id = k;
            const bool is_optional = id >= m_required;
            if (is_optional) {
                // Optional arguments are addressed by shorthand, not by position.
                auto it = m_shorthands.find(parsed);
                if (it == m_shorthands.end()) {
                    std::cerr << "== error: shorthand '" + parsed + "' not found" << std::endl;
                    return abort();
                }
                id = (*it).second;
            }

            assert(id < m_names.size());
            auto const& name = m_names[id];
            auto& c = m_cmds[name];
            if (is_optional) {
                if (c.is_boolean) {
                    parsed = "true";
                } else {
                    // The value is the token that follows the shorthand.
                    ++i;
                    if (i == m_argc) return abort();
                    parsed = m_argv[i];
                }
            }
            c.value = parsed;
        }
        return true;
    }

    /**
     * Print the usage line followed by one description per registered argument to stderr.
     */
    void help() const {
        // argv[0] is not guaranteed to exist; fall back to an empty program name.
        const char* program = (m_argc > 0 and m_argv != nullptr and m_argv[0] != nullptr) ? m_argv[0] : "";

        // "\033" is the portable spelling of the ESC character ("\e" is a compiler extension).
        std::cerr << "Usage: \033[1m" << program << "\033[0m [-h,--help]";
        auto print = [this](bool with_description) {
            for (size_t i = 0; i != m_names.size(); ++i) {
                auto const& c = m_cmds.at(m_names[i]);
                bool is_optional = i >= m_required;
                if (is_optional) std::cerr << " [\033[1m" << c.shorthand << "\033[0m";
                if (!c.is_boolean) std::cerr << " \033[4m" << m_names[i] << "\033[0m";
                if (is_optional) std::cerr << "]";
                if (with_description) std::cerr << "\n\t" << c.descr << "\n";
            }
        };
        print(false);
        std::cerr << "\n\n";
        print(true);
        std::cerr << " [-h,--help]\n\tPrint this help text and silently exits." << std::endl;
    }

    /**
     * Register a required (positional) argument.
     * @param[in] name  Unique name used to retrieve the value with get().
     * @param[in] descr Description shown by help().
     * @return false if an argument with the same name is already registered.
     */
    bool add(std::string const& name, std::string const& descr) {
        bool ret = m_cmds.emplace(name, cmd{empty, empty, descr, false}).second;
        if (ret) {
            m_names.push_back(name);
            m_required += 1;
        }
        return ret;
    }

    /**
     * Register an optional argument.
     * @param[in] name       Unique name used to retrieve the value with get().
     * @param[in] descr      Description shown by help().
     * @param[in] shorthand  Token that selects the option on the command line (e.g. "-t").
     * @param[in] is_boolean If true the option takes no value and defaults to "false";
     *                       otherwise it takes the following token and defaults to no value.
     * @return false if an argument with the same name is already registered.
     */
    bool add(std::string const& name, std::string const& descr, std::string const& shorthand, bool is_boolean = true) {
        bool ret = m_cmds.emplace(name, cmd{shorthand, is_boolean ? "false" : empty, descr, is_boolean}).second;
        if (ret) {
            m_names.push_back(name);
            m_shorthands.emplace(shorthand, m_names.size() - 1);
        }
        return ret;
    }

    /**
     * Get the value of a registered argument converted to T.
     * @param[in] name Name given to add().
     * @return The stored value converted by parse<T>().
     * @throws std::runtime_error if no argument with this name is registered.
     */
    template <typename T>
    T get(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end()) {
            throw std::runtime_error("error: '" + name + "' not found");
        }
        auto const& value = (*it).second.value;
        return parse<T>(value);
    }

    // added by Kampersanda
    /**
     * Get the value of an argument, or default_value if it is unknown or has no value.
     */
    template <typename T>
    T get(std::string const& name, const T& default_value) const {
        return parsed(name) ? get<T>(name) : default_value;
    }

    /**
     * Check whether the argument is registered and currently holds a non-empty value.
     * Note that a boolean option always holds a value ("false" until it is seen).
     */
    bool parsed(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end() or (*it).second.value == empty) return false;
        return true;
    }

    /**
     * Convert a stored string value to T.
     *
     * Supported T: std::string, the character types, the built-in integer types from
     * short to long long, the floating-point types, and bool. Any other T is rejected
     * at compile time.
     */
    template <typename T>
    T parse(std::string const& value) const {
        if constexpr (std::is_same<T, std::string>::value) {
            return value;
        } else if constexpr (std::is_same<T, char>::value or std::is_same<T, signed char>::value or
                             std::is_same<T, unsigned char>::value) {
            // front() on an empty string is undefined; yield the null character instead.
            return value.empty() ? static_cast<T>(0) : static_cast<T>(value.front());
        } else if constexpr (std::is_same<T, unsigned int>::value or std::is_same<T, int>::value or
                             std::is_same<T, unsigned short int>::value or std::is_same<T, short int>::value) {
            return static_cast<T>(std::atoi(value.c_str()));
        } else if constexpr (std::is_same<T, unsigned long int>::value or std::is_same<T, long int>::value or
                             std::is_same<T, unsigned long long int>::value or
                             std::is_same<T, long long int>::value) {
            return static_cast<T>(std::atoll(value.c_str()));
        } else if constexpr (std::is_same<T, float>::value or std::is_same<T, double>::value or
                             std::is_same<T, long double>::value) {
            return static_cast<T>(std::atof(value.c_str()));
        } else if constexpr (std::is_same<T, bool>::value) {
            std::istringstream stream(value);
            // Initialised so that a failed extraction (e.g. an empty value) yields false
            // instead of an indeterminate result.
            bool ret = false;
            if (value == "true" or value == "false") {
                stream >> std::boolalpha >> ret;
            } else {
                stream >> std::noboolalpha >> ret;
            }
            return ret;
        } else {
            // Dependent condition: only fires if this branch is actually instantiated.
            static_assert(!std::is_same<T, T>::value, "cmd_line_parser::parser: unsupported value type");
        }
    }

  private:
    int m_argc;
    char** m_argv;
    size_t m_required;  // number of required (positional) arguments
    std::unordered_map<std::string, cmd> m_cmds;           // name -> argument
    std::unordered_map<std::string, size_t> m_shorthands;  // shorthand -> index in m_names
    std::vector<std::string> m_names;                      // names in registration order

    //! Print the help text and report failure.
    bool abort() const {
        help();
        return false;
    }
};
} // namespace cmd_line_parser

File diff suppressed because it is too large Load diff

70
tools/xcdat_build.cpp Normal file
View file

@ -0,0 +1,70 @@
#include <xcdat.hpp>
#include "cmd_line_parser/parser.hpp"
#include "tinyformat/tinyformat.h"
/**
 * Create the command-line parser of this tool and register its arguments.
 *
 * Two positional arguments (input key file, output index file) are followed by two
 * value-taking options, "-t" (trie type) and "-u" (whether to sort/deduplicate keys).
 *
 * @param[in] argc Argument count passed to main.
 * @param[in] argv Argument vector passed to main.
 * @return The configured parser; parse() has not been called yet.
 */
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);

    // Required, positional.
    cli.add("input_keys", "Input filepath of data keys");
    cli.add("output_idx", "Output filepath of trie index");

    // Optional; both expect a value after the shorthand.
    cli.add("trie_type", "Type of trie impl. from [7|8], (default=7)", "-t", false);
    cli.add("to_unique", "Make unique the input keys? (default=0)", "-u", false);

    return cli;
}
/**
 * Build a trie index of type Trie from the input key file, write it to the output
 * file, and print construction statistics to stdout.
 *
 * @tparam Trie The trie type to build (must provide build(), save(), memory_in_bytes(),
 *              num_keys(), alphabet_size() and max_length()).
 * @param[in] p A parser on which parse() has already succeeded.
 * @return 0 on success, 1 if the input dataset is empty.
 */
template <class Trie>
int build(const cmd_line_parser::parser& p) {
    const auto input_keys = p.get<std::string>("input_keys");
    const auto output_idx = p.get<std::string>("output_idx");
    const auto to_unique = p.get<bool>("to_unique", false);

    auto keys = xcdat::io::load_strings(input_keys);
    if (keys.empty()) {
        tfm::errorfln("Error: The input dataset is empty.");
        return 1;  // nothing to build; do not fall through to the construction
    }

    if (to_unique) {
        std::sort(keys.begin(), keys.end());
        keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
    }

    essentials::timer<essentials::clock_type, std::chrono::seconds> timer;
    timer.start();
    const auto trie = Trie::build(keys);
    timer.stop();

    const double construction_time_in_sec = timer.average();
    // Kept as the integer returned by the trie so that "%d" prints an exact byte count.
    const auto memory_in_bytes = trie.memory_in_bytes();

    tfm::printfln("construction_time_in_sec: %g", construction_time_in_sec);
    tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
    tfm::printfln("memory_in_MiB: %g", static_cast<double>(memory_in_bytes) / essentials::MiB);
    tfm::printfln("number_of_keys: %d", trie.num_keys());
    tfm::printfln("alphabet_size: %d", trie.alphabet_size());
    tfm::printfln("max_length: %d", trie.max_length());

    // Persist the index to the path given as the second positional argument.
    trie.save(output_idx);

    return 0;
}
/**
 * Entry point: parse the command line and build the index with the requested trie type.
 *
 * @return The result of build() for a supported trie type; 1 if the arguments cannot
 *         be parsed or the trie type is not one of the supported values (the help
 *         text is printed in that case).
 */
int main(int argc, char** argv) {
#ifndef NDEBUG
    tfm::warnfln("The code is running in debug mode.");
#endif

    auto p = make_parser(argc, argv);
    if (!p.parse()) {
        return 1;
    }

    const auto trie_type = p.get<int>("trie_type", 7);

    // Keep the cases in sync with the "[7|8]" advertised in the help text.
    switch (trie_type) {
        case 7:
            return build<xcdat::trie_7_type>(p);
        case 8:
            return build<xcdat::trie_8_type>(p);
        default:
            break;
    }

    // Unsupported trie type.
    p.help();
    return 1;
}