update
This commit is contained in:
parent
69235e1a87
commit
eebe7ac0db
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -29,7 +29,7 @@
|
||||||
*.app
|
*.app
|
||||||
|
|
||||||
# My Definition
|
# My Definition
|
||||||
build/
|
build*/
|
||||||
cmake-build-debug/
|
cmake-build-debug/
|
||||||
.idea/
|
.idea/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
|
@ -33,9 +33,11 @@ message(STATUS "CXX_FLAGS_RELEASE are ${CMAKE_CXX_FLAGS_RELEASE}")
|
||||||
|
|
||||||
include_directories(include)
|
include_directories(include)
|
||||||
|
|
||||||
enable_testing()
|
|
||||||
add_subdirectory(test)
|
|
||||||
|
|
||||||
add_subdirectory(sample)
|
add_subdirectory(sample)
|
||||||
|
add_subdirectory(tools)
|
||||||
|
|
||||||
file(COPY ${CMAKE_SOURCE_DIR}/test/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/test)
|
enable_testing()
|
||||||
|
add_subdirectory(tests)
|
||||||
|
|
||||||
|
|
||||||
|
file(COPY ${CMAKE_SOURCE_DIR}/tests/keys.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tests)
|
||||||
|
|
|
@ -11,26 +11,4 @@ namespace xcdat {
|
||||||
using trie_7_type = trie<bc_vector_7>;
|
using trie_7_type = trie<bc_vector_7>;
|
||||||
using trie_8_type = trie<bc_vector_8>;
|
using trie_8_type = trie<bc_vector_8>;
|
||||||
|
|
||||||
template <class Trie, class Strings>
|
|
||||||
static Trie build(const Strings& keys, bool bin_mode = false) {
|
|
||||||
return Trie(trie_builder(keys, Trie::bc_vector_type::l1_bits, bin_mode));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Trie>
|
|
||||||
static Trie load(std::string_view filename) {
|
|
||||||
Trie trie;
|
|
||||||
essentials::load(trie, filename.data());
|
|
||||||
return trie;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Trie>
|
|
||||||
static std::uint64_t save(Trie& trie, std::string_view filename) {
|
|
||||||
return essentials::save(trie, filename.data());
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Trie>
|
|
||||||
static std::uint64_t get_memory_in_bytes(Trie& trie) {
|
|
||||||
return essentials::visit<Trie, essentials::sizer>(trie, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace xcdat
|
} // namespace xcdat
|
||||||
|
|
|
@ -26,6 +26,8 @@ class trie {
|
||||||
using this_type = trie<BcVector>;
|
using this_type = trie<BcVector>;
|
||||||
using bc_vector_type = BcVector;
|
using bc_vector_type = BcVector;
|
||||||
|
|
||||||
|
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::uint64_t m_num_keys = 0;
|
std::uint64_t m_num_keys = 0;
|
||||||
code_table m_table;
|
code_table m_table;
|
||||||
|
@ -52,19 +54,23 @@ class trie {
|
||||||
//! Move constructor
|
//! Move constructor
|
||||||
trie& operator=(trie&&) noexcept = default;
|
trie& operator=(trie&&) noexcept = default;
|
||||||
|
|
||||||
template <class Strings>
|
|
||||||
explicit trie(trie_builder<Strings>&& b)
|
|
||||||
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
|
|
||||||
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Build the trie dictioanry from the input keywords.
|
|
||||||
* @param[in] key The query keyword.
|
|
||||||
* @return The associated ID if found.
|
|
||||||
*/
|
|
||||||
template <class Strings>
|
template <class Strings>
|
||||||
static this_type build(const Strings& keys, bool bin_mode = false) {
|
static this_type build(const Strings& keys, bool bin_mode = false) {
|
||||||
return this_type(trie_builder(keys, bc_vector_type::l1_bits, bin_mode));
|
return this_type(trie_builder(keys, l1_bits, bin_mode));
|
||||||
|
}
|
||||||
|
|
||||||
|
static this_type load(std::string_view filepath) {
|
||||||
|
this_type obj;
|
||||||
|
essentials::load(obj, filepath.data());
|
||||||
|
return obj;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uint64_t save(std::string_view filepath) const {
|
||||||
|
return essentials::save(const_cast<this_type&>(*this), filepath.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uint64_t memory_in_bytes() const {
|
||||||
|
return essentials::visit<this_type, essentials::sizer>(const_cast<this_type&>(*this), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Check the binary mode.
|
//! Check the binary mode.
|
||||||
|
@ -271,6 +277,11 @@ class trie {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
template <class Strings>
|
||||||
|
explicit trie(trie_builder<Strings>&& b)
|
||||||
|
: m_num_keys(b.m_keys.size()), m_table(std::move(b.m_table)), m_terms(b.m_terms, true, true),
|
||||||
|
m_bcvec(b.m_units, std::move(b.m_leaves)), m_tvec(std::move(b.m_suffixes)) {}
|
||||||
|
|
||||||
template <class String>
|
template <class String>
|
||||||
static constexpr String get_suffix(const String& s, std::uint64_t i) {
|
static constexpr String get_suffix(const String& s, std::uint64_t i) {
|
||||||
assert(i <= s.size());
|
assert(i <= s.size());
|
||||||
|
|
|
@ -3,17 +3,28 @@
|
||||||
|
|
||||||
#include <xcdat.hpp>
|
#include <xcdat.hpp>
|
||||||
|
|
||||||
|
using xcdat_trie = xcdat::trie_8_type;
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
std::vector<std::string> keys = {
|
std::vector<std::string> keys = {
|
||||||
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
"AirPods", "AirTag", "Mac", "MacBook", "MacBook_Air", "MacBook_Pro",
|
||||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||||
};
|
};
|
||||||
|
|
||||||
// The dataset must be sorted and unique.
|
// The dataset must be sorted and unique (although it is not needed for the keys).
|
||||||
std::sort(keys.begin(), keys.end());
|
std::sort(keys.begin(), keys.end());
|
||||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||||
|
|
||||||
auto trie = xcdat::build<xcdat::trie_8_type>(keys);
|
const std::string index_filename = "tmp.idx";
|
||||||
|
|
||||||
|
// Build and save the trie index
|
||||||
|
{
|
||||||
|
const auto trie = xcdat_trie::build(keys);
|
||||||
|
trie.save(index_filename);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load the trie index
|
||||||
|
const auto trie = xcdat_trie::load(index_filename);
|
||||||
|
|
||||||
std::cout << "Basic operations" << std::endl;
|
std::cout << "Basic operations" << std::endl;
|
||||||
{
|
{
|
||||||
|
@ -41,14 +52,7 @@ int main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string index_filename = "tmp.idx";
|
std::remove(index_filename.c_str());
|
||||||
std::cout << "mem: " << xcdat::save(trie, index_filename) << std::endl;
|
|
||||||
|
|
||||||
{
|
|
||||||
auto ohter = xcdat::load<xcdat::trie_8_type>(index_filename);
|
|
||||||
std::cout << "num_keys:" << ohter.num_keys() << std::endl;
|
|
||||||
std::cout << "mem: " << xcdat::get_memory_in_bytes(ohter) << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
1
tools/CMakeLists.txt
Normal file
1
tools/CMakeLists.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
add_executable(xcdat_build xcdat_build.cpp)
|
158
tools/cmd_line_parser/parser.hpp
Normal file
158
tools/cmd_line_parser/parser.hpp
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
|
||||||
|
|
||||||
|
namespace cmd_line_parser {

// Minimal command-line argument parser.
//
// Arguments are registered with add() and then resolved by parse():
//  - add(name, descr) registers a required, positional argument;
//  - add(name, descr, shorthand, is_boolean) registers an optional argument
//    that is selected on the command line by its shorthand (e.g. "-t").
//
// parse() maps the k-th command-line token to the k-th registered name while
// k is smaller than the number of required arguments, so required arguments
// must be registered before optional ones.
struct parser {
    // Value of a non-boolean argument that has not been supplied.
    inline static const std::string empty = "";

    // Stores argc/argv as given; nothing is parsed until parse() is called.
    parser(int argc, char** argv) : m_argc(argc), m_argv(argv), m_required(0) {}

    // One registered argument: its shorthand (empty for positional ones), the
    // current textual value, the help description, and whether it is a flag.
    struct cmd {
        std::string shorthand, value, descr;
        bool is_boolean;
    };

    // Parses the command line given to the constructor.
    //
    // Returns true on success. Returns false, after printing the help text to
    // std::cerr, when "-h"/"--help" is given, when fewer tokens than required
    // arguments are supplied, when an unknown shorthand is met, or when a
    // non-boolean option is not followed by a value.
    bool parse() {
        // Guard against argc == 0 so that the count cannot wrap around.
        const size_t num_given = m_argc > 0 ? size_t(m_argc - 1) : 0;
        if (num_given < m_required) return abort();

        size_t k = 0;
        for (int i = 1; i < m_argc; ++i, ++k) {
            std::string parsed(m_argv[i]);
            if (parsed == "-h" or parsed == "--help") return abort();

            // The first m_required tokens are positional; the rest are options.
            size_t id = k;
            const bool is_optional = id >= m_required;
            if (is_optional) {
                auto it = m_shorthands.find(parsed);
                if (it == m_shorthands.end()) {
                    std::cerr << "== error: shorthand '" + parsed + "' not found" << std::endl;
                    return abort();
                }
                id = (*it).second;
            }

            assert(id < m_names.size());
            auto const& name = m_names[id];
            auto& c = m_cmds[name];
            if (is_optional) {
                if (c.is_boolean) {
                    // A flag carries no value of its own: its presence means true.
                    parsed = "true";
                } else {
                    // A valued option consumes the next token as its value.
                    ++i;
                    if (i >= m_argc) return abort();
                    parsed = m_argv[i];
                }
            }
            c.value = parsed;
        }
        return true;
    }

    // Prints the usage line followed by the per-argument descriptions to
    // std::cerr, using ANSI escape sequences for bold/underline.
    void help() const {
        // "\033" is the standard spelling of ESC ("\e" is a compiler extension).
        const char* program = (m_argc > 0 and m_argv != nullptr and m_argv[0] != nullptr) ? m_argv[0] : "";
        std::cerr << "Usage: \033[1m" << program << "\033[0m [-h,--help]";
        auto print = [this](bool with_description) {
            for (size_t i = 0; i != m_names.size(); ++i) {
                auto const& c = m_cmds.at(m_names[i]);
                const bool is_optional = i >= m_required;
                if (is_optional) std::cerr << " [\033[1m" << c.shorthand << "\033[0m";
                if (!c.is_boolean) std::cerr << " \033[4m" << m_names[i] << "\033[0m";
                if (is_optional) std::cerr << "]";
                if (with_description) std::cerr << "\n\t" << c.descr << "\n";
            }
        };
        print(false);
        std::cerr << "\n\n";
        print(true);
        std::cerr << " [-h,--help]\n\tPrint this help text and silently exits." << std::endl;
    }

    // Registers a required positional argument.
    // Returns false (and registers nothing) if the name is already in use.
    bool add(std::string const& name, std::string const& descr) {
        const bool ret = m_cmds.emplace(name, cmd{empty, empty, descr, false}).second;
        if (ret) {
            m_names.push_back(name);
            m_required += 1;
        }
        return ret;
    }

    // Registers an optional argument selected by `shorthand`.
    // A boolean option starts as "false" and becomes "true" when present; a
    // non-boolean option starts empty and takes the token after the shorthand.
    // Returns false (and registers nothing) if the name is already in use.
    bool add(std::string const& name, std::string const& descr, std::string const& shorthand, bool is_boolean = true) {
        const bool ret = m_cmds.emplace(name, cmd{shorthand, is_boolean ? "false" : empty, descr, is_boolean}).second;
        if (ret) {
            m_names.push_back(name);
            m_shorthands.emplace(shorthand, m_names.size() - 1);
        }
        return ret;
    }

    // Returns the value of argument `name` converted to T.
    // Throws std::runtime_error if no argument with that name is registered.
    template <typename T>
    T get(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end()) {
            throw std::runtime_error("error: '" + name + "' not found");
        }
        auto const& value = (*it).second.value;
        return parse<T>(value);
    }

    // added by Kampersanda
    // Returns the value of `name` converted to T, or `default_value` when the
    // argument is unknown or has no value.
    template <typename T>
    T get(std::string const& name, const T& default_value) const {
        return parsed(name) ? get<T>(name) : default_value;
    }

    // Tells whether `name` is registered and currently holds a non-empty value.
    bool parsed(std::string const& name) const {
        auto it = m_cmds.find(name);
        if (it == m_cmds.end() or (*it).second.value == empty) return false;
        return true;
    }

    // Converts the textual `value` to T.
    // Supported T: std::string, the character types, the standard integer
    // types, the floating-point types, and bool. Any other T is rejected at
    // compile time.
    template <typename T>
    T parse(std::string const& value) const {
        if constexpr (std::is_same<T, std::string>::value) {
            return value;
        } else if constexpr (std::is_same<T, char>::value or std::is_same<T, signed char>::value or
                             std::is_same<T, unsigned char>::value) {
            // front() on an empty string is undefined; yield the null character instead.
            return value.empty() ? T{} : static_cast<T>(value.front());
        } else if constexpr (std::is_same<T, unsigned int>::value or std::is_same<T, int>::value or
                             std::is_same<T, unsigned short int>::value or std::is_same<T, short int>::value) {
            return static_cast<T>(std::strtol(value.c_str(), nullptr, 10));
        } else if constexpr (std::is_same<T, long int>::value or std::is_same<T, long long int>::value) {
            return static_cast<T>(std::strtoll(value.c_str(), nullptr, 10));
        } else if constexpr (std::is_same<T, unsigned long int>::value or
                             std::is_same<T, unsigned long long int>::value) {
            // Parsed as unsigned so that values above LLONG_MAX are representable.
            return static_cast<T>(std::strtoull(value.c_str(), nullptr, 10));
        } else if constexpr (std::is_same<T, float>::value or std::is_same<T, double>::value or
                             std::is_same<T, long double>::value) {
            return static_cast<T>(std::atof(value.c_str()));
        } else if constexpr (std::is_same<T, bool>::value) {
            // Initialized because extraction from an empty stream leaves the target untouched.
            bool ret = false;
            std::istringstream stream(value);
            if (value == "true" or value == "false") {
                stream >> std::boolalpha >> ret;
            } else {
                stream >> std::noboolalpha >> ret;
            }
            return ret;
        } else {
            static_assert(always_false<T>::value, "cmd_line_parser::parser::parse: unsupported type");
        }
    }

private:
    // Dependent false value, so the static_assert above fires only on instantiation.
    template <typename>
    struct always_false : std::false_type {};

    int m_argc;
    char** m_argv;
    size_t m_required;  // number of required (positional) arguments
    std::unordered_map<std::string, cmd> m_cmds;  // argument name -> record
    std::unordered_map<std::string, size_t> m_shorthands;  // shorthand -> index into m_names
    std::vector<std::string> m_names;  // argument names in registration order

    // Prints the help text and reports failure.
    bool abort() const {
        help();
        return false;
    }
};

}  // namespace cmd_line_parser
|
1155
tools/tinyformat/tinyformat.h
Normal file
1155
tools/tinyformat/tinyformat.h
Normal file
File diff suppressed because it is too large
Load diff
70
tools/xcdat_build.cpp
Normal file
70
tools/xcdat_build.cpp
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
#include <xcdat.hpp>
|
||||||
|
|
||||||
|
#include "cmd_line_parser/parser.hpp"
|
||||||
|
#include "tinyformat/tinyformat.h"
|
||||||
|
|
||||||
|
// Declares the command-line interface of the xcdat_build tool: two required
// positional arguments followed by two optional, value-taking options.
cmd_line_parser::parser make_parser(int argc, char** argv) {
    cmd_line_parser::parser cli(argc, argv);

    // Required positional arguments, in the order expected on the command line.
    cli.add("input_keys", "Input filepath of data keys");
    cli.add("output_idx", "Output filepath of trie index");

    // Optional arguments; registered as non-boolean, so each expects a value
    // after its shorthand.
    cli.add("trie_type", "Type of trie impl. from [7|8], (default=7)", "-t", false);
    cli.add("to_unique", "Make unique the input keys? (default=0)", "-u", false);

    return cli;
}
|
||||||
|
|
||||||
|
template <class Trie>
|
||||||
|
int build(const cmd_line_parser::parser& p) {
|
||||||
|
const auto input_keys = p.get<std::string>("input_keys");
|
||||||
|
const auto output_idx = p.get<std::string>("output_idx");
|
||||||
|
const auto to_unique = p.get<bool>("to_unique", false);
|
||||||
|
|
||||||
|
auto keys = xcdat::io::load_strings(input_keys);
|
||||||
|
if (keys.empty()) {
|
||||||
|
tfm::errorfln("Error: The input dataset is empty.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (to_unique) {
|
||||||
|
std::sort(keys.begin(), keys.end());
|
||||||
|
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
essentials::timer<essentials::clock_type, std::chrono::seconds> timer;
|
||||||
|
timer.start();
|
||||||
|
const auto trie = Trie::build(keys);
|
||||||
|
timer.stop();
|
||||||
|
|
||||||
|
const double construction_time_in_sec = timer.average();
|
||||||
|
const double memory_in_bytes = trie.memory_in_bytes();
|
||||||
|
|
||||||
|
tfm::printfln("construction_time_in_sec: %g", construction_time_in_sec);
|
||||||
|
tfm::printfln("memory_in_bytes: %d", memory_in_bytes);
|
||||||
|
tfm::printfln("memory_in_MiB: %g", memory_in_bytes / essentials::MiB);
|
||||||
|
tfm::printfln("number_of_keys: %d", trie.num_keys());
|
||||||
|
tfm::printfln("alphabet_size: %d", trie.alphabet_size());
|
||||||
|
tfm::printfln("max_length: %d", trie.max_length());
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
tfm::warnfln("The code is running in debug mode.");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
auto p = make_parser(argc, argv);
|
||||||
|
if (!p.parse()) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto trie_type = p.get<int>("trie_type", 7);
|
||||||
|
|
||||||
|
switch (trie_type) {
|
||||||
|
case 7:
|
||||||
|
return build<xcdat::trie_7_type>(p);
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
p.help();
|
||||||
|
return 1;
|
||||||
|
}
|
Loading…
Reference in a new issue