ModErn Text Analysis
META Enumerates Textual Applications
perfect_hash_map.h
Go to the documentation of this file.
1 
10 #ifndef META_HASHING_PERFECT_HASH_MAP_H_
11 #define META_HASHING_PERFECT_HASH_MAP_H_
12 
13 #include <cstdint>
14 
15 #include "meta/config.h"
18 #include "meta/io/filesystem.h"
19 #include "meta/io/mmap_file.h"
21 #include "meta/io/packed.h"
22 #include "meta/util/optional.h"
23 #include "meta/util/progress.h"
24 
25 namespace meta
26 {
27 namespace hashing
28 {
29 
30 namespace detail
31 {
32 template <class Value, class FingerPrint = uint32_t>
34 {
35  using fingerprint_type = FingerPrint;
36  using value_type = Value;
37  FingerPrint id;
38  Value value;
39 };
40 
41 template <class Value, class FingerPrint = uint32_t>
43 {
44  uint64_t idx;
46 
47  void merge_with(hashed_value&&)
48  {
49  throw std::runtime_error{"can't merge detail::hashed_value"};
50  }
51 };
52 
53 template <class OutputStream, class Value, class FingerPrint>
54 uint64_t packed_write(OutputStream& os,
56 {
57  using io::packed::write;
58  return write(os, r.id) + write(os, r.value);
59 }
60 
61 template <class InputStream, class Value, class FingerPrint>
62 uint64_t packed_read(InputStream& is, hash_record<Value, FingerPrint>& r)
63 {
64  using io::packed::read;
65  return read(is, r.id) + read(is, r.value);
66 }
67 
68 template <class OutputStream, class Value, class FingerPrint>
69 uint64_t packed_write(OutputStream& os,
71 {
72  using io::packed::write;
73  return write(os, hv.idx) + write(os, hv.record);
74 }
75 
76 template <class InputStream, class Value, class FingerPrint>
77 uint64_t packed_read(InputStream& is, hashed_value<Value, FingerPrint>& hv)
78 {
79  using io::packed::read;
80  return read(is, hv.idx) + read(is, hv.record);
81 }
82 
83 template <class HashedValue>
85 }
86 
101 template <class KeyType, class ValueType, class FingerPrint = uint32_t>
103 {
104  public:
106  using fingerprint_type = typename record_type::fingerprint_type;
108  using options_type = typename hash_builder_type::options;
110 
115  perfect_hash_map_builder(const options_type& options)
116  : options_(options),
117  output_{options.prefix + "/values.bin.tmp", std::ios::binary},
118  hash_builder_{make_unique<hash_builder_type>(options)},
119  fingerprint_{47}
120  {
121  // nothing
122  }
123 
127  void operator()(const KeyType& key, const ValueType& value)
128  {
129  (*hash_builder_)(key);
130 
131  io::packed::write(output_.stream(), key);
132  io::packed::write(output_.stream(), value);
133  }
134 
139  void write()
140  {
141  output_.stream().close();
142 
143  LOG(progress) << "> Building hash function...\n" << ENDLG;
144  hash_builder_->write();
145  hash_builder_ = nullptr;
146 
147  reorder_values();
148  }
149 
150  private:
152 
153  void reorder_values()
154  {
155  // hash every (key, value) we recorded using the perfect hash
156  // function we generated in the first pass, then do a multi-way
157  // merge to put the values into the right order according to the
158  // new hash function
159  LOG(info) << "Loading hash..." << ENDLG;
160  hashing::perfect_hash<KeyType> hash{options_.prefix};
161  LOG(info) << "Hash loaded" << ENDLG;
162 
163  uint64_t num_chunks = 0;
164 
165  {
166  std::vector<hashed_value> buffer;
167  buffer.reserve(options_.max_ram / sizeof(hashed_value));
168 
169  printing::progress progress{
170  " > Reordering values: ",
171  filesystem::file_size(options_.prefix + "/values.bin.tmp")};
172 
173  std::ifstream input{options_.prefix + "/values.bin.tmp",
174  std::ios::binary};
175 
176  for (uint64_t bytes = 0; input.peek() != EOF;)
177  {
178  progress(bytes);
179 
180  KeyType key;
181  hashed_value h_value;
182  bytes += io::packed::read(input, key);
183  bytes += io::packed::read(input, h_value.record.value);
184  h_value.record.id
185  = static_cast<fingerprint_type>(fingerprint_(key));
186  h_value.idx = hash(key);
187 
188  buffer.push_back(h_value);
189 
190  if (buffer.size() == options_.max_ram / sizeof(hashed_value))
191  {
192  flush_chunk(buffer, num_chunks);
193  ++num_chunks;
194  }
195  }
196 
197  if (!buffer.empty())
198  {
199  flush_chunk(buffer, num_chunks);
200  ++num_chunks;
201  }
202  }
203 
204  filesystem::remove_all(options_.prefix + "/values.bin.tmp");
205 
206  std::vector<detail::hv_chunk_iterator<hashed_value>> iterators;
207  iterators.reserve(num_chunks);
208  for (uint64_t i = 0; i < num_chunks; ++i)
209  iterators.emplace_back(options_.prefix + "/value-chunk."
210  + std::to_string(i));
211 
212  std::ofstream output{options_.prefix + "/values.bin", std::ios::binary};
214  iterators.begin(), iterators.end(),
215  // sort records at head of chunks by decreasing hash id
216  [](const hashed_value& a, const hashed_value& b) {
217  return a.idx < b.idx;
218  },
219  // never merge two records together
220  [](const hashed_value&, const hashed_value&) { return false; },
221  [&](hashed_value&& hv) {
222  output.write(reinterpret_cast<char*>(&hv.record),
223  sizeof(hv.record));
224  });
225 
226  // delete temporary files
227  for (uint64_t i = 0; i < num_chunks; ++i)
228  {
229  filesystem::delete_file(options_.prefix + "/value-chunk."
230  + std::to_string(i));
231  }
232  }
233 
234  void flush_chunk(std::vector<hashed_value>& buffer, uint64_t chunk_num)
235  {
236  std::sort(buffer.begin(), buffer.end(),
237  [](const hashed_value& a, const hashed_value& b) {
238  return a.idx < b.idx;
239  });
240 
241  std::ofstream chunk{options_.prefix + "/value-chunk."
242  + std::to_string(chunk_num)};
243  for (const auto& hval : buffer)
244  {
245  io::packed::write(chunk, hval.idx);
246  io::packed::write(chunk, hval.record);
247  }
248  buffer.clear();
249  }
250 
251  options_type options_;
252  io::mofstream output_;
253  std::unique_ptr<hash_builder_type> hash_builder_;
254  fingerprint_hash fingerprint_;
255 };
256 
261 template <class Value>
263 {
264  uint64_t idx;
265  Value value;
266 };
267 
271 template <class Key, class Value, class FingerPrint = uint32_t>
273 {
274  public:
275  using fingerprint_type = FingerPrint;
279 
283  perfect_hash_map(const std::string& prefix)
284  : hash_{prefix}, file_{prefix + "/values.bin"}, fingerprint_{47}
285  {
286  // nothing
287  }
288 
289  perfect_hash_map(perfect_hash_map&&) = default;
290 
294  const perfect_hash<Key>& hash() const
295  {
296  return hash_;
297  }
298 
305  {
306  auto idx = index(key);
307  if (!idx)
308  return util::nullopt;
309 
310  auto record = reinterpret_cast<const record_type*>(
311  file_.begin() + *idx * sizeof(record_type));
312 
313  return index_and_value_type{*idx, record->value};
314  }
315 
320  util::optional<uint64_t> index(const Key& key) const
321  {
322  auto idx = hash_(key);
323  auto id = static_cast<fingerprint_type>(fingerprint_(key));
324 
325  auto record = reinterpret_cast<const record_type*>(
326  file_.begin() + idx * sizeof(record_type));
327 
328  if (id == record->id)
329  return idx;
330 
331  return util::nullopt;
332  }
333 
338  util::optional<Value> at(const Key& key) const
339  {
340  auto idx = hash_(key);
341  auto id = static_cast<fingerprint_type>(fingerprint_(key));
342 
343  auto record = reinterpret_cast<const record_type*>(
344  file_.begin() + idx * sizeof(record_type));
345 
346  if (id == record->id)
347  return record->value;
348 
349  return util::nullopt;
350  }
351 
358  const Value& operator[](uint64_t idx) const
359  {
360  auto record = reinterpret_cast<const record_type*>(
361  file_.begin() + idx * sizeof(record_type));
362  return record->value;
363  }
364 
365  private:
366  perfect_hash<Key> hash_;
367  io::mmap_file file_;
368  fingerprint_hash fingerprint_;
369 };
370 }
371 }
372 #endif
Definition: perfect_hash_map.h:42
Query class for the minimal perfect hash functions created by perfect_hash_builder.
Definition: perfect_hash.h:27
perfect_hash_map_builder(const options_type &options)
Definition: perfect_hash_map.h:115
A class for representing optional values.
Definition: optional.h:115
Represents an entry in a perfect_hash_map, containing its index into the dense values array and its a...
Definition: perfect_hash_map.h:262
A generic, randomly seeded hash function.
Definition: hash.h:343
Memory maps a text file readonly.
Definition: mmap_file.h:27
void write()
Finalizes writing the perfect hash map to disk.
Definition: perfect_hash_map.h:139
A simple implementation of the ChunkIterator concept that reads Records from a binary file using io::...
Definition: multiway_merge.h:191
util::optional< index_and_value_type > index_and_value(const Key &key) const
Definition: perfect_hash_map.h:304
const Value & operator[](uint64_t idx) const
Definition: perfect_hash_map.h:358
util::optional< Value > at(const Key &key) const
Definition: perfect_hash_map.h:338
A stupid wrapper around a std::fstream to work around GCC&#39;s libstdc++ lacking move constructors for s...
Definition: moveable_stream.h:93
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
const perfect_hash< Key > & hash() const
Definition: perfect_hash_map.h:294
constexpr nullopt_t nullopt
A global nullopt_t constant.
Definition: optional.h:56
uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, Compare &&record_comp, ShouldMerge &&should_merge, RecordHandler &&output, ProgressTrait=ProgressTrait{})
A generic algorithm for performing an N-way merge on a collection of sorted "chunks".
Definition: multiway_merge.h:100
perfect_hash_map(const std::string &prefix)
Definition: perfect_hash_map.h:283
void operator()(const KeyType &key, const ValueType &value)
Handles a key/value pair.
Definition: perfect_hash_map.h:127
Definition: perfect_hash_map.h:33
An immutable minimal perfect hashing based hash_map, read from disk.
Definition: perfect_hash_map.h:272
Builder class for constructing the on-disk representation for a minimal perfect hashing based hash ma...
Definition: perfect_hash_map.h:102
Definition: perfect_hash_builder.h:58
Simple class for reporting progress of lengthy operations.
Definition: progress.h:34
Constructs a minimal perfect hash using a streaming variant of the hash, displace, and compress algorithm.
Definition: perfect_hash_builder.h:55
util::optional< uint64_t > index(const Key &key) const
Definition: perfect_hash_map.h:320