ModErn Text Analysis
META Enumerates Textual Applications
perfect_hash_builder.h
Go to the documentation of this file.
1 
10 #ifndef META_HASHING_PERFECT_HASH_BUILDER_H_
11 #define META_HASHING_PERFECT_HASH_BUILDER_H_
12 
13 #include <cstdint>
14 #include <string>
15 #include <vector>
16 
17 #include "meta/config.h"
18 
19 namespace meta
20 {
21 namespace hashing
22 {
23 
54 template <class K>
56 {
57  public:
58  struct options
59  {
60  std::string prefix;
61  uint64_t max_ram = 1024 * 1024 * 1024; // 1 GB
62  uint64_t num_keys;
63  uint64_t num_per_bucket = 4;
64  float load_factor = 0.99f;
65 
66  options() = default;
67  options(const options&) = default;
68  options(options&&) = default;
69  options& operator=(const options&) = default;
70  options& operator=(options&&) = default;
71  };
72 
77 
82  void operator()(const K& key);
83 
87  void write();
88 
89  private:
90  void flush_chunk();
91 
92  void merge_chunks_by_bucket_id();
93 
94  template <class Iterator>
95  void flush_bucket_chunk(Iterator begin, Iterator end);
96 
97  void merge_chunks_by_bucket_size();
98  void sort_buckets_by_size();
99  void construct_perfect_hash();
100 
103 
105  uint64_t bucket_seed_;
106 
108  uint64_t num_buckets_;
109 
111  uint64_t num_chunks_;
112 
113  struct hashed_key
114  {
115  uint64_t idx;
116  K key;
117 
118  hashed_key(uint64_t index, const K& akey) : idx{index}, key{akey}
119  {
120  // nothing
121  }
122 
123  bool operator<(const hashed_key& other) const
124  {
125  return idx < other.idx;
126  }
127  };
128 
130  std::vector<hashed_key> buffer_;
131 };
132 }
133 }
134 
135 #include "perfect_hash_builder.tcc"
136 #endif
Definition: perfect_hash_builder.h:113
options opts_
The options used during building.
Definition: perfect_hash_builder.h:102
std::vector< hashed_key > buffer_
The buffer used for performing the bucket partitioning.
Definition: perfect_hash_builder.h:130
uint64_t num_chunks_
The current number of chunks that have been flushed to disk.
Definition: perfect_hash_builder.h:111
perfect_hash_builder(options opts)
Definition: perfect_hash_builder.tcc:97
uint64_t bucket_seed_
The seed for the bucket hash function.
Definition: perfect_hash_builder.h:105
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
void write()
Writes the perfect_hash to disk.
Definition: perfect_hash_builder.tcc:145
void operator()(const K &key)
Records observed keys.
Definition: perfect_hash_builder.tcc:137
Definition: perfect_hash_builder.h:58
uint64_t num_buckets_
The number of buckets to use during building.
Definition: perfect_hash_builder.h:108
Constructs a minimal perfect hash using a streaming variant of the hash, displace, and compress algorithm.
Definition: perfect_hash_builder.h:55