flag -> type_id
This commit is contained in:
parent
12c7f9be7f
commit
ebaa3a8518
|
@ -32,9 +32,9 @@ template <class Trie>
|
||||||
[[maybe_unused]] Trie mmap(const char* address) {
|
[[maybe_unused]] Trie mmap(const char* address) {
|
||||||
mmap_visitor visitor(address);
|
mmap_visitor visitor(address);
|
||||||
|
|
||||||
std::uint32_t flag;
|
std::uint32_t type_id;
|
||||||
visitor.visit(flag);
|
visitor.visit(type_id);
|
||||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
|
||||||
|
|
||||||
Trie idx;
|
Trie idx;
|
||||||
visitor.visit(idx);
|
visitor.visit(idx);
|
||||||
|
@ -46,9 +46,9 @@ template <class Trie>
|
||||||
[[maybe_unused]] Trie load(const std::string& filepath) {
|
[[maybe_unused]] Trie load(const std::string& filepath) {
|
||||||
load_visitor visitor(filepath);
|
load_visitor visitor(filepath);
|
||||||
|
|
||||||
std::uint32_t flag;
|
std::uint32_t type_id;
|
||||||
visitor.visit(flag);
|
visitor.visit(type_id);
|
||||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
|
||||||
|
|
||||||
Trie idx;
|
Trie idx;
|
||||||
visitor.visit(idx);
|
visitor.visit(idx);
|
||||||
|
@ -56,10 +56,11 @@ template <class Trie>
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Save the trie dictionary to the file and return the file size in bytes.
|
//! Save the trie dictionary to the file and return the file size in bytes.
|
||||||
|
//! The identifier of the trie type will be written in the first 4 bytes.
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) {
|
[[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) {
|
||||||
save_visitor visitor(filepath);
|
save_visitor visitor(filepath);
|
||||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
|
||||||
visitor.visit(const_cast<Trie&>(idx));
|
visitor.visit(const_cast<Trie&>(idx));
|
||||||
return visitor.bytes();
|
return visitor.bytes();
|
||||||
}
|
}
|
||||||
|
@ -68,14 +69,14 @@ template <class Trie>
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||||
size_visitor visitor;
|
size_visitor visitor;
|
||||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
|
||||||
visitor.visit(const_cast<Trie&>(idx));
|
visitor.visit(const_cast<Trie&>(idx));
|
||||||
return visitor.bytes();
|
return visitor.bytes();
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
|
//! Get the identifier of the trie type embedded by the function 'save'.
|
||||||
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
//! The identifier corresponds to trie::l1_bits and will be used to detect the trie type.
|
||||||
[[maybe_unused]] std::uint32_t get_flag(const std::string& filepath) {
|
[[maybe_unused]] std::uint32_t get_type_id(const std::string& filepath) {
|
||||||
std::ifstream ifs(filepath);
|
std::ifstream ifs(filepath);
|
||||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
|
||||||
|
@ -84,16 +85,4 @@ template <class Trie>
|
||||||
return flag;
|
return flag;
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Load the keywords from the file.
|
|
||||||
[[maybe_unused]] std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
|
|
||||||
std::ifstream ifs(filepath);
|
|
||||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
|
||||||
|
|
||||||
std::vector<std::string> strs;
|
|
||||||
for (std::string str; std::getline(ifs, str, delim);) {
|
|
||||||
strs.push_back(str);
|
|
||||||
}
|
|
||||||
return strs;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace xcdat
|
} // namespace xcdat
|
||||||
|
|
|
@ -17,6 +17,7 @@ class trie {
|
||||||
using trie_type = trie<BcVector>;
|
using trie_type = trie<BcVector>;
|
||||||
using bc_vector_type = BcVector;
|
using bc_vector_type = BcVector;
|
||||||
|
|
||||||
|
//! The identifier of BC vector.
|
||||||
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -59,7 +60,7 @@ class trie {
|
||||||
//! - end() returns the iterator to the end.
|
//! - end() returns the iterator to the end.
|
||||||
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
|
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
|
||||||
template <class Strings>
|
template <class Strings>
|
||||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, bc_vector_type::l1_bits, bin_mode)) {
|
||||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,15 +10,15 @@ int main() {
|
||||||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||||
};
|
};
|
||||||
|
|
||||||
// The input keys must be sorted and unique (although they have already satisfied in this case).
|
// The input keys must be sorted and unique (already satisfied in this case).
|
||||||
std::sort(keys.begin(), keys.end());
|
std::sort(keys.begin(), keys.end());
|
||||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||||
|
|
||||||
// The trie dictionary type
|
// The trie dictionary type, chosen from the four available types
|
||||||
using trie_type = xcdat::trie_7_type;
|
using trie_type = xcdat::trie_8_type;
|
||||||
// using trie_type = xcdat::trie_8_type;
|
|
||||||
// using trie_type = xcdat::trie_15_type;
|
|
||||||
// using trie_type = xcdat::trie_16_type;
|
// using trie_type = xcdat::trie_16_type;
|
||||||
|
// using trie_type = xcdat::trie_7_type;
|
||||||
|
// using trie_type = xcdat::trie_15_type;
|
||||||
|
|
||||||
// The dictionary filename
|
// The dictionary filename
|
||||||
const char* tmp_filename = "dic.bin";
|
const char* tmp_filename = "dic.bin";
|
||||||
|
@ -35,6 +35,9 @@ int main() {
|
||||||
// Load the trie dictionary into memory.
|
// Load the trie dictionary into memory.
|
||||||
const auto trie = xcdat::load<trie_type>(tmp_filename);
|
const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||||
|
|
||||||
|
// Or, you can map the dictionary from a contiguous memory block, e.g., a memory-mapped file.
|
||||||
|
// const auto trie = xcdat::mmap<trie_type>(mapped_data);
|
||||||
|
|
||||||
// Basic statistics
|
// Basic statistics
|
||||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||||
|
@ -77,7 +80,7 @@ int main() {
|
||||||
std::cout << "}" << std::endl;
|
std::cout << "}" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enumerate all the keys (in lex order).
|
// Enumerate all the keys (in lexicographical order).
|
||||||
{
|
{
|
||||||
std::cout << "Enumerate() = {" << std::endl;
|
std::cout << "Enumerate() = {" << std::endl;
|
||||||
auto itr = trie.make_enumerative_iterator();
|
auto itr = trie.make_enumerative_iterator();
|
||||||
|
|
|
@ -20,6 +20,17 @@ using trie_type = xcdat::trie_15_type;
|
||||||
using trie_type = xcdat::trie_16_type;
|
using trie_type = xcdat::trie_16_type;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
|
||||||
|
std::ifstream ifs(filepath);
|
||||||
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
|
||||||
|
std::vector<std::string> strs;
|
||||||
|
for (std::string str; std::getline(ifs, str, delim);) {
|
||||||
|
strs.push_back(str);
|
||||||
|
}
|
||||||
|
return strs;
|
||||||
|
}
|
||||||
|
|
||||||
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
||||||
const std::vector<std::string>& others) {
|
const std::vector<std::string>& others) {
|
||||||
REQUIRE_EQ(trie.num_keys(), keys.size());
|
REQUIRE_EQ(trie.num_keys(), keys.size());
|
||||||
|
@ -209,7 +220,7 @@ TEST_CASE("Test trie_type (tiny)") {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("Test trie_type (real)") {
|
TEST_CASE("Test trie_type (real)") {
|
||||||
auto keys = xcdat::test::to_unique_vec(xcdat::load_strings("keys.txt"));
|
auto keys = xcdat::test::to_unique_vec(load_strings("keys.txt"));
|
||||||
auto others = xcdat::test::extract_keys(keys);
|
auto others = xcdat::test::extract_keys(keys);
|
||||||
|
|
||||||
trie_type trie(keys);
|
trie_type trie(keys);
|
||||||
|
|
|
@ -17,6 +17,17 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
|
||||||
|
std::ifstream ifs(filepath);
|
||||||
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
|
||||||
|
std::vector<std::string> strs;
|
||||||
|
for (std::string str; std::getline(ifs, str, delim);) {
|
||||||
|
strs.push_back(str);
|
||||||
|
}
|
||||||
|
return strs;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
|
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
|
||||||
std::uint64_t random_seed) {
|
std::uint64_t random_seed) {
|
||||||
std::mt19937_64 engine(random_seed);
|
std::mt19937_64 engine(random_seed);
|
||||||
|
@ -127,7 +138,7 @@ int main(int argc, char** argv) {
|
||||||
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
||||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||||
|
|
||||||
auto keys = xcdat::load_strings(input_keys);
|
auto keys = load_strings(input_keys);
|
||||||
if (keys.empty()) {
|
if (keys.empty()) {
|
||||||
tfm::errorfln("Error: The input dataset is empty.");
|
tfm::errorfln("Error: The input dataset is empty.");
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -12,13 +12,24 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
|
||||||
|
std::ifstream ifs(filepath);
|
||||||
|
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||||
|
|
||||||
|
std::vector<std::string> strs;
|
||||||
|
for (std::string str; std::getline(ifs, str, delim);) {
|
||||||
|
strs.push_back(str);
|
||||||
|
}
|
||||||
|
return strs;
|
||||||
|
}
|
||||||
|
|
||||||
template <class Trie>
|
template <class Trie>
|
||||||
int build(const cmd_line_parser::parser& p) {
|
int build(const cmd_line_parser::parser& p) {
|
||||||
const auto input_keys = p.get<std::string>("input_keys");
|
const auto input_keys = p.get<std::string>("input_keys");
|
||||||
const auto output_dic = p.get<std::string>("output_dic");
|
const auto output_dic = p.get<std::string>("output_dic");
|
||||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||||
|
|
||||||
auto keys = xcdat::load_strings(input_keys);
|
auto keys = load_strings(input_keys);
|
||||||
if (keys.empty()) {
|
if (keys.empty()) {
|
||||||
tfm::errorfln("Error: The input dataset is empty.");
|
tfm::errorfln("Error: The input dataset is empty.");
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,9 +37,9 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto input_dic = p.get<std::string>("input_dic");
|
const auto input_dic = p.get<std::string>("input_dic");
|
||||||
const auto flag = xcdat::get_flag(input_dic);
|
const auto type_id = xcdat::get_type_id(input_dic);
|
||||||
|
|
||||||
switch (flag) {
|
switch (type_id) {
|
||||||
case 7:
|
case 7:
|
||||||
return decode<xcdat::trie_7_type>(p);
|
return decode<xcdat::trie_7_type>(p);
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -34,9 +34,9 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto input_dic = p.get<std::string>("input_dic");
|
const auto input_dic = p.get<std::string>("input_dic");
|
||||||
const auto flag = xcdat::get_flag(input_dic);
|
const auto type_id = xcdat::get_type_id(input_dic);
|
||||||
|
|
||||||
switch (flag) {
|
switch (type_id) {
|
||||||
case 7:
|
case 7:
|
||||||
return enumerate<xcdat::trie_7_type>(p);
|
return enumerate<xcdat::trie_7_type>(p);
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -41,9 +41,9 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto input_dic = p.get<std::string>("input_dic");
|
const auto input_dic = p.get<std::string>("input_dic");
|
||||||
const auto flag = xcdat::get_flag(input_dic);
|
const auto type_id = xcdat::get_type_id(input_dic);
|
||||||
|
|
||||||
switch (flag) {
|
switch (type_id) {
|
||||||
case 7:
|
case 7:
|
||||||
return lookup<xcdat::trie_7_type>(p);
|
return lookup<xcdat::trie_7_type>(p);
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -54,9 +54,9 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto input_dic = p.get<std::string>("input_dic");
|
const auto input_dic = p.get<std::string>("input_dic");
|
||||||
const auto flag = xcdat::get_flag(input_dic);
|
const auto type_id = xcdat::get_type_id(input_dic);
|
||||||
|
|
||||||
switch (flag) {
|
switch (type_id) {
|
||||||
case 7:
|
case 7:
|
||||||
return predictive_search<xcdat::trie_7_type>(p);
|
return predictive_search<xcdat::trie_7_type>(p);
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -50,9 +50,9 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto input_dic = p.get<std::string>("input_dic");
|
const auto input_dic = p.get<std::string>("input_dic");
|
||||||
const auto flag = xcdat::get_flag(input_dic);
|
const auto type_id = xcdat::get_type_id(input_dic);
|
||||||
|
|
||||||
switch (flag) {
|
switch (type_id) {
|
||||||
case 7:
|
case 7:
|
||||||
return prefix_search<xcdat::trie_7_type>(p);
|
return prefix_search<xcdat::trie_7_type>(p);
|
||||||
case 8:
|
case 8:
|
||||||
|
|
Loading…
Reference in a new issue