flag -> type_id

This commit is contained in:
Shunsuke Kanda 2021-07-08 22:47:59 +09:00
parent 12c7f9be7f
commit ebaa3a8518
11 changed files with 69 additions and 43 deletions

View file

@ -32,9 +32,9 @@ template <class Trie>
[[maybe_unused]] Trie mmap(const char* address) { [[maybe_unused]] Trie mmap(const char* address) {
mmap_visitor visitor(address); mmap_visitor visitor(address);
std::uint32_t flag; std::uint32_t type_id;
visitor.visit(flag); visitor.visit(type_id);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
Trie idx; Trie idx;
visitor.visit(idx); visitor.visit(idx);
@ -46,9 +46,9 @@ template <class Trie>
[[maybe_unused]] Trie load(const std::string& filepath) { [[maybe_unused]] Trie load(const std::string& filepath) {
load_visitor visitor(filepath); load_visitor visitor(filepath);
std::uint32_t flag; std::uint32_t type_id;
visitor.visit(flag); visitor.visit(type_id);
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different."); XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
Trie idx; Trie idx;
visitor.visit(idx); visitor.visit(idx);
@ -56,10 +56,11 @@ template <class Trie>
} }
//! Save the trie dictionary to the file and returns the file size in bytes. //! Save the trie dictionary to the file and returns the file size in bytes.
//! The identifier of the trie type will be written in the first 4 bytes.
template <class Trie> template <class Trie>
[[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) { [[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) {
save_visitor visitor(filepath); save_visitor visitor(filepath);
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
visitor.visit(const_cast<Trie&>(idx)); visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes(); return visitor.bytes();
} }
@ -68,14 +69,14 @@ template <class Trie>
template <class Trie> template <class Trie>
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) { [[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
size_visitor visitor; size_visitor visitor;
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
visitor.visit(const_cast<Trie&>(idx)); visitor.visit(const_cast<Trie&>(idx));
return visitor.bytes(); return visitor.bytes();
} }
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'. //! Get the identifier of the trie type embedded by the function 'save'.
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file. //! The identifier corresponds to trie::l1_bits and will be used to detect the trie type.
[[maybe_unused]] std::uint32_t get_flag(const std::string& filepath) { [[maybe_unused]] std::uint32_t get_type_id(const std::string& filepath) {
std::ifstream ifs(filepath); std::ifstream ifs(filepath);
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file"); XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
@ -84,16 +85,4 @@ template <class Trie>
return flag; return flag;
} }
//! Read the file at 'filepath' and return its contents split at every 'delim'.
//! The XCDAT_THROW_IF check fires when the stream is not good after opening.
[[maybe_unused]] std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream input(filepath);
    XCDAT_THROW_IF(!input.good(), "Cannot open the input file");

    std::vector<std::string> keywords;
    std::string token;
    // std::getline drops the delimiter, so each stored token excludes 'delim'.
    while (std::getline(input, token, delim)) {
        keywords.push_back(token);
    }
    return keywords;
}
} // namespace xcdat } // namespace xcdat

View file

@ -17,6 +17,7 @@ class trie {
using trie_type = trie<BcVector>; using trie_type = trie<BcVector>;
using bc_vector_type = BcVector; using bc_vector_type = BcVector;
//! The identifier of BC vector.
static constexpr auto l1_bits = bc_vector_type::l1_bits; static constexpr auto l1_bits = bc_vector_type::l1_bits;
private: private:
@ -59,7 +60,7 @@ class trie {
//! - end() returns the iterator to the end. //! - end() returns the iterator to the end.
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'. //! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
template <class Strings> template <class Strings>
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) { trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, bc_vector_type::l1_bits, bin_mode)) {
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type)); static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
} }

View file

@ -10,15 +10,15 @@ int main() {
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE", "Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
}; };
// The input keys must be sorted and unique (although they have already satisfied in this case). // The input keys must be sorted and unique (already satisfied in this case).
std::sort(keys.begin(), keys.end()); std::sort(keys.begin(), keys.end());
keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
// The trie dictionary type // The trie dictionary type from the four types
using trie_type = xcdat::trie_7_type; using trie_type = xcdat::trie_8_type;
// using trie_type = xcdat::trie_8_type;
// using trie_type = xcdat::trie_15_type;
// using trie_type = xcdat::trie_16_type; // using trie_type = xcdat::trie_16_type;
// using trie_type = xcdat::trie_7_type;
// using trie_type = xcdat::trie_15_type;
// The dictionary filename // The dictionary filename
const char* tmp_filename = "dic.bin"; const char* tmp_filename = "dic.bin";
@ -35,6 +35,9 @@ int main() {
// Load the trie dictionary on memory. // Load the trie dictionary on memory.
const auto trie = xcdat::load<trie_type>(tmp_filename); const auto trie = xcdat::load<trie_type>(tmp_filename);
// Or, you can set the continuous memory block via a memory-mapped file.
// const auto trie = xcdat::mmap<trie_type>(mapped_data);
// Basic statistics // Basic statistics
std::cout << "Number of keys: " << trie.num_keys() << std::endl; std::cout << "Number of keys: " << trie.num_keys() << std::endl;
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl; std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
@ -77,7 +80,7 @@ int main() {
std::cout << "}" << std::endl; std::cout << "}" << std::endl;
} }
// Enumerate all the keys (in lex order). // Enumerate all the keys (in lexicographical order).
{ {
std::cout << "Enumerate() = {" << std::endl; std::cout << "Enumerate() = {" << std::endl;
auto itr = trie.make_enumerative_iterator(); auto itr = trie.make_enumerative_iterator();

View file

@ -20,6 +20,17 @@ using trie_type = xcdat::trie_15_type;
using trie_type = xcdat::trie_16_type; using trie_type = xcdat::trie_16_type;
#endif #endif
//! Read the file at 'filepath' and return its contents split at every 'delim'.
//! The XCDAT_THROW_IF check fires when the stream is not good after opening.
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream input(filepath);
    XCDAT_THROW_IF(!input.good(), "Cannot open the input file");

    std::vector<std::string> lines;
    std::string line;
    // std::getline drops the delimiter, so each stored line excludes 'delim'.
    while (std::getline(input, line, delim)) {
        lines.push_back(line);
    }
    return lines;
}
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys, void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
const std::vector<std::string>& others) { const std::vector<std::string>& others) {
REQUIRE_EQ(trie.num_keys(), keys.size()); REQUIRE_EQ(trie.num_keys(), keys.size());
@ -209,7 +220,7 @@ TEST_CASE("Test trie_type (tiny)") {
} }
TEST_CASE("Test trie_type (real)") { TEST_CASE("Test trie_type (real)") {
auto keys = xcdat::test::to_unique_vec(xcdat::load_strings("keys.txt")); auto keys = xcdat::test::to_unique_vec(load_strings("keys.txt"));
auto others = xcdat::test::extract_keys(keys); auto others = xcdat::test::extract_keys(keys);
trie_type trie(keys); trie_type trie(keys);

View file

@ -17,6 +17,17 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
return p; return p;
} }
//! Read the file at 'filepath' and return its contents split at every 'delim'.
//! The XCDAT_THROW_IF check fires when the stream is not good after opening.
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream input(filepath);
    XCDAT_THROW_IF(!input.good(), "Cannot open the input file");

    std::vector<std::string> lines;
    std::string line;
    // std::getline drops the delimiter, so each stored line excludes 'delim'.
    while (std::getline(input, line, delim)) {
        lines.push_back(line);
    }
    return lines;
}
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples, std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
std::uint64_t random_seed) { std::uint64_t random_seed) {
std::mt19937_64 engine(random_seed); std::mt19937_64 engine(random_seed);
@ -127,7 +138,7 @@ int main(int argc, char** argv) {
const auto random_seed = p.get<std::uint64_t>("random_seed", 13); const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
const auto binary_mode = p.get<bool>("binary_mode", false); const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys); auto keys = load_strings(input_keys);
if (keys.empty()) { if (keys.empty()) {
tfm::errorfln("Error: The input dataset is empty."); tfm::errorfln("Error: The input dataset is empty.");
return 1; return 1;

View file

@ -12,13 +12,24 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
return p; return p;
} }
//! Read the file at 'filepath' and return its contents split at every 'delim'.
//! The XCDAT_THROW_IF check fires when the stream is not good after opening.
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream input(filepath);
    XCDAT_THROW_IF(!input.good(), "Cannot open the input file");

    std::vector<std::string> lines;
    std::string line;
    // std::getline drops the delimiter, so each stored line excludes 'delim'.
    while (std::getline(input, line, delim)) {
        lines.push_back(line);
    }
    return lines;
}
template <class Trie> template <class Trie>
int build(const cmd_line_parser::parser& p) { int build(const cmd_line_parser::parser& p) {
const auto input_keys = p.get<std::string>("input_keys"); const auto input_keys = p.get<std::string>("input_keys");
const auto output_dic = p.get<std::string>("output_dic"); const auto output_dic = p.get<std::string>("output_dic");
const auto binary_mode = p.get<bool>("binary_mode", false); const auto binary_mode = p.get<bool>("binary_mode", false);
auto keys = xcdat::load_strings(input_keys); auto keys = load_strings(input_keys);
if (keys.empty()) { if (keys.empty()) {
tfm::errorfln("Error: The input dataset is empty."); tfm::errorfln("Error: The input dataset is empty.");
} }

View file

@ -37,9 +37,9 @@ int main(int argc, char** argv) {
} }
const auto input_dic = p.get<std::string>("input_dic"); const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic); const auto type_id = xcdat::get_type_id(input_dic);
switch (flag) { switch (type_id) {
case 7: case 7:
return decode<xcdat::trie_7_type>(p); return decode<xcdat::trie_7_type>(p);
case 8: case 8:

View file

@ -34,9 +34,9 @@ int main(int argc, char** argv) {
} }
const auto input_dic = p.get<std::string>("input_dic"); const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic); const auto type_id = xcdat::get_type_id(input_dic);
switch (flag) { switch (type_id) {
case 7: case 7:
return enumerate<xcdat::trie_7_type>(p); return enumerate<xcdat::trie_7_type>(p);
case 8: case 8:

View file

@ -41,9 +41,9 @@ int main(int argc, char** argv) {
} }
const auto input_dic = p.get<std::string>("input_dic"); const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic); const auto type_id = xcdat::get_type_id(input_dic);
switch (flag) { switch (type_id) {
case 7: case 7:
return lookup<xcdat::trie_7_type>(p); return lookup<xcdat::trie_7_type>(p);
case 8: case 8:

View file

@ -54,9 +54,9 @@ int main(int argc, char** argv) {
} }
const auto input_dic = p.get<std::string>("input_dic"); const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic); const auto type_id = xcdat::get_type_id(input_dic);
switch (flag) { switch (type_id) {
case 7: case 7:
return predictive_search<xcdat::trie_7_type>(p); return predictive_search<xcdat::trie_7_type>(p);
case 8: case 8:

View file

@ -50,9 +50,9 @@ int main(int argc, char** argv) {
} }
const auto input_dic = p.get<std::string>("input_dic"); const auto input_dic = p.get<std::string>("input_dic");
const auto flag = xcdat::get_flag(input_dic); const auto type_id = xcdat::get_type_id(input_dic);
switch (flag) { switch (type_id) {
case 7: case 7:
return prefix_search<xcdat::trie_7_type>(p); return prefix_search<xcdat::trie_7_type>(p);
case 8: case 8: