flag -> type_id
This commit is contained in:
parent
12c7f9be7f
commit
ebaa3a8518
|
@ -32,9 +32,9 @@ template <class Trie>
|
|||
[[maybe_unused]] Trie mmap(const char* address) {
|
||||
mmap_visitor visitor(address);
|
||||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
std::uint32_t type_id;
|
||||
visitor.visit(type_id);
|
||||
XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
|
@ -46,9 +46,9 @@ template <class Trie>
|
|||
[[maybe_unused]] Trie load(const std::string& filepath) {
|
||||
load_visitor visitor(filepath);
|
||||
|
||||
std::uint32_t flag;
|
||||
visitor.visit(flag);
|
||||
XCDAT_THROW_IF(flag != Trie::l1_bits, "The input dictionary type is different.");
|
||||
std::uint32_t type_id;
|
||||
visitor.visit(type_id);
|
||||
XCDAT_THROW_IF(type_id != Trie::l1_bits, "The input dictionary type is different.");
|
||||
|
||||
Trie idx;
|
||||
visitor.visit(idx);
|
||||
|
@ -56,10 +56,11 @@ template <class Trie>
|
|||
}
|
||||
|
||||
//! Saves the trie dictionary to the file and returns the file size in bytes.
|
||||
//! The identifier of the trie type (Trie::l1_bits as a 32-bit integer) is written in the first 4 bytes.
|
||||
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t save(const Trie& idx, const std::string& filepath) {
|
||||
save_visitor visitor(filepath);
|
||||
// NOTE(review): the next two visit() calls are identical except for the trailing comment; they look
// like the pre-/post-rename versions of one statement (flag -> identifier) -- confirm only one is intended.
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
|
||||
// visit() is handed a non-const reference, hence the const_cast.
// NOTE(review): confirm save_visitor never modifies idx.
visitor.visit(const_cast<Trie&>(idx));
|
||||
// The byte count accumulated by the visitor over the visits above.
return visitor.bytes();
|
||||
}
|
||||
|
@ -68,14 +69,14 @@ template <class Trie>
|
|||
//! Returns the byte count reported by size_visitor after visiting the 32-bit type identifier
//! (Trie::l1_bits) followed by the trie itself, i.e. the same sequence of visits that 'save' performs.
template <class Trie>
|
||||
[[maybe_unused]] std::uint64_t memory_in_bytes(const Trie& idx) {
|
||||
size_visitor visitor;
|
||||
// NOTE(review): the next two visit() calls are identical except for the trailing comment; they look
// like the pre-/post-rename versions of one statement (flag -> identifier) -- confirm only one is intended.
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // flag
|
||||
visitor.visit(static_cast<std::uint32_t>(Trie::l1_bits)); // identifier
|
||||
// visit() is handed a non-const reference, hence the const_cast.
// NOTE(review): confirm size_visitor never modifies idx.
visitor.visit(const_cast<Trie&>(idx));
|
||||
return visitor.bytes();
|
||||
}
|
||||
|
||||
//! Get the flag indicating the trie dictionary type, embedded by the function 'save'.
|
||||
//! The flag corresponds to trie::l1_bits and will be used to detect the trie type from the file.
|
||||
[[maybe_unused]] std::uint32_t get_flag(const std::string& filepath) {
|
||||
//! Get the identifier of the trie type embedded by the function 'save'.
|
||||
//! The identifier corresponds to trie::l1_bits and will be used to detect the trie type.
|
||||
[[maybe_unused]] std::uint32_t get_type_id(const std::string& filepath) {
|
||||
std::ifstream ifs(filepath);
|
||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||
|
||||
|
@ -84,16 +85,4 @@ template <class Trie>
|
|||
return flag;
|
||||
}
|
||||
|
||||
//! Load the keywords from the file.
|
||||
[[maybe_unused]] std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
|
||||
std::ifstream ifs(filepath);
|
||||
XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");
|
||||
|
||||
std::vector<std::string> strs;
|
||||
for (std::string str; std::getline(ifs, str, delim);) {
|
||||
strs.push_back(str);
|
||||
}
|
||||
return strs;
|
||||
}
|
||||
|
||||
} // namespace xcdat
|
||||
|
|
|
@ -17,6 +17,7 @@ class trie {
|
|||
using trie_type = trie<BcVector>;
|
||||
using bc_vector_type = BcVector;
|
||||
|
||||
//! The identifier of BC vector.
|
||||
static constexpr auto l1_bits = bc_vector_type::l1_bits;
|
||||
|
||||
private:
|
||||
|
@ -59,7 +60,7 @@ class trie {
|
|||
//! - end() returns the iterator to the end.
|
||||
//! The type 'Strings::value_type::value_type' should be one-byte integer type such as 'char'.
|
||||
template <class Strings>
|
||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, l1_bits, bin_mode)) {
|
||||
trie(const Strings& keys, bool bin_mode = false) : trie(trie_builder(keys, bc_vector_type::l1_bits, bin_mode)) {
|
||||
static_assert(sizeof(char) == sizeof(typename Strings::value_type::value_type));
|
||||
}
|
||||
|
||||
|
|
|
@ -10,15 +10,15 @@ int main() {
|
|||
"Mac_Mini", "Mac_Pro", "iMac", "iPad", "iPhone", "iPhone_SE",
|
||||
};
|
||||
|
||||
// The input keys must be sorted and unique (although they have already satisfied in this case).
|
||||
// The input keys must be sorted and unique (already satisfied in this case).
|
||||
std::sort(keys.begin(), keys.end());
|
||||
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
|
||||
|
||||
// The trie dictionary type
|
||||
using trie_type = xcdat::trie_7_type;
|
||||
// using trie_type = xcdat::trie_8_type;
|
||||
// using trie_type = xcdat::trie_15_type;
|
||||
// The trie dictionary type from the four types
|
||||
using trie_type = xcdat::trie_8_type;
|
||||
// using trie_type = xcdat::trie_16_type;
|
||||
// using trie_type = xcdat::trie_7_type;
|
||||
// using trie_type = xcdat::trie_15_type;
|
||||
|
||||
// The dictionary filename
|
||||
const char* tmp_filename = "dic.bin";
|
||||
|
@ -35,6 +35,9 @@ int main() {
|
|||
// Load the trie dictionary on memory.
|
||||
const auto trie = xcdat::load<trie_type>(tmp_filename);
|
||||
|
||||
// Or, you can set the continuous memory block via a memory-mapped file.
|
||||
// const auto trie = xcdat::mmap<trie_type>(mapped_data);
|
||||
|
||||
// Basic statistics
|
||||
std::cout << "Number of keys: " << trie.num_keys() << std::endl;
|
||||
std::cout << "Number of trie nodes: " << trie.num_nodes() << std::endl;
|
||||
|
@ -77,7 +80,7 @@ int main() {
|
|||
std::cout << "}" << std::endl;
|
||||
}
|
||||
|
||||
// Enumerate all the keys (in lex order).
|
||||
// Enumerate all the keys (in lexicographical order).
|
||||
{
|
||||
std::cout << "Enumerate() = {" << std::endl;
|
||||
auto itr = trie.make_enumerative_iterator();
|
||||
|
|
|
@ -20,6 +20,17 @@ using trie_type = xcdat::trie_15_type;
|
|||
using trie_type = xcdat::trie_16_type;
|
||||
#endif
|
||||
|
||||
// Reads 'filepath' and returns its records, split with std::getline on 'delim' (newline by
// default), in file order and without the delimiter.
// If the stream is not good after opening, the failure is reported through XCDAT_THROW_IF
// with the message "Cannot open the input file" (presumably by throwing -- see the macro).
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream ifs(filepath);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::vector<std::string> strs;
    for (std::string str; std::getline(ifs, str, delim);) {
        // Hand the buffer over to a new element instead of copying it. 'str' is left holding an
        // empty string, which is fine because std::getline erases it before the next extraction.
        strs.emplace_back().swap(str);
    }
    return strs;
}
|
||||
|
||||
void test_basic_operations(const trie_type& trie, const std::vector<std::string>& keys,
|
||||
const std::vector<std::string>& others) {
|
||||
REQUIRE_EQ(trie.num_keys(), keys.size());
|
||||
|
@ -209,7 +220,7 @@ TEST_CASE("Test trie_type (tiny)") {
|
|||
}
|
||||
|
||||
TEST_CASE("Test trie_type (real)") {
|
||||
auto keys = xcdat::test::to_unique_vec(xcdat::load_strings("keys.txt"));
|
||||
auto keys = xcdat::test::to_unique_vec(load_strings("keys.txt"));
|
||||
auto others = xcdat::test::extract_keys(keys);
|
||||
|
||||
trie_type trie(keys);
|
||||
|
|
|
@ -17,6 +17,17 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
|||
return p;
|
||||
}
|
||||
|
||||
// Reads 'filepath' and returns its records, split with std::getline on 'delim' (newline by
// default), in file order and without the delimiter.
// If the stream is not good after opening, the failure is reported through XCDAT_THROW_IF
// with the message "Cannot open the input file" (presumably by throwing -- see the macro).
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream ifs(filepath);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::vector<std::string> strs;
    for (std::string str; std::getline(ifs, str, delim);) {
        // Hand the buffer over to a new element instead of copying it. 'str' is left holding an
        // empty string, which is fine because std::getline erases it before the next extraction.
        strs.emplace_back().swap(str);
    }
    return strs;
}
|
||||
|
||||
std::vector<std::string_view> sample_keys(const std::vector<std::string>& keys, std::uint64_t num_samples,
|
||||
std::uint64_t random_seed) {
|
||||
std::mt19937_64 engine(random_seed);
|
||||
|
@ -127,7 +138,7 @@ int main(int argc, char** argv) {
|
|||
const auto random_seed = p.get<std::uint64_t>("random_seed", 13);
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
auto keys = load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
tfm::errorfln("Error: The input dataset is empty.");
|
||||
return 1;
|
||||
|
|
|
@ -12,13 +12,24 @@ cmd_line_parser::parser make_parser(int argc, char** argv) {
|
|||
return p;
|
||||
}
|
||||
|
||||
// Reads 'filepath' and returns its records, split with std::getline on 'delim' (newline by
// default), in file order and without the delimiter.
// If the stream is not good after opening, the failure is reported through XCDAT_THROW_IF
// with the message "Cannot open the input file" (presumably by throwing -- see the macro).
std::vector<std::string> load_strings(const std::string& filepath, char delim = '\n') {
    std::ifstream ifs(filepath);
    XCDAT_THROW_IF(!ifs.good(), "Cannot open the input file");

    std::vector<std::string> strs;
    for (std::string str; std::getline(ifs, str, delim);) {
        // Hand the buffer over to a new element instead of copying it. 'str' is left holding an
        // empty string, which is fine because std::getline erases it before the next extraction.
        strs.emplace_back().swap(str);
    }
    return strs;
}
|
||||
|
||||
template <class Trie>
|
||||
int build(const cmd_line_parser::parser& p) {
|
||||
const auto input_keys = p.get<std::string>("input_keys");
|
||||
const auto output_dic = p.get<std::string>("output_dic");
|
||||
const auto binary_mode = p.get<bool>("binary_mode", false);
|
||||
|
||||
auto keys = xcdat::load_strings(input_keys);
|
||||
auto keys = load_strings(input_keys);
|
||||
if (keys.empty()) {
|
||||
tfm::errorfln("Error: The input dataset is empty.");
|
||||
}
|
||||
|
|
|
@ -37,9 +37,9 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
const auto type_id = xcdat::get_type_id(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
switch (type_id) {
|
||||
case 7:
|
||||
return decode<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
|
|
|
@ -34,9 +34,9 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
const auto type_id = xcdat::get_type_id(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
switch (type_id) {
|
||||
case 7:
|
||||
return enumerate<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
|
|
|
@ -41,9 +41,9 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
const auto type_id = xcdat::get_type_id(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
switch (type_id) {
|
||||
case 7:
|
||||
return lookup<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
|
|
|
@ -54,9 +54,9 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
const auto type_id = xcdat::get_type_id(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
switch (type_id) {
|
||||
case 7:
|
||||
return predictive_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
|
|
|
@ -50,9 +50,9 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
const auto input_dic = p.get<std::string>("input_dic");
|
||||
const auto flag = xcdat::get_flag(input_dic);
|
||||
const auto type_id = xcdat::get_type_id(input_dic);
|
||||
|
||||
switch (flag) {
|
||||
switch (type_id) {
|
||||
case 7:
|
||||
return prefix_search<xcdat::trie_7_type>(p);
|
||||
case 8:
|
||||
|
|
Loading…
Reference in a new issue