ModErn Text Analysis
META Enumerates Textual Applications
Go to the documentation of this file.
10 #include "meta/config.h"
12 #include "meta/succinct/sarray.h"
14 namespace meta
15 {
16 namespace hashing
17 {
26 template <class K>
28 {
29  public:
30  perfect_hash(const std::string& prefix)
31  : seeds_{prefix + "/seeds"},
32  sarray_{prefix + "/sarray"},
33  empty_rank_{prefix + "/sarray", sarray_}
34  {
35  std::ifstream metadata{prefix + "/hash-metadata.bin", std::ios::binary};
36  io::packed::read(metadata, bucket_seed_);
37  io::packed::read(metadata, num_bins_);
38  // nothing
39  }
41  uint64_t operator()(const K& key) const
42  {
43  using meta::hashing::hash_append;
45  hash_append(hasher, key);
46  auto hash = static_cast<farm_hash_seeded::result_type>(hasher);
47  auto bucket_id = hash % seeds_.size();
48  auto seed = seeds_[bucket_id];
49  auto pos = farm::hash_len_16(hash, seed) % num_bins_;
50  // the final position is the hash function's position shifted to
51  // the left by the number of empty bins that came before it.
52  return pos - empty_rank_.rank(pos);
53  }
55  private:
57  uint64_t bucket_seed_;
59  uint64_t num_bins_;
66 };
67 }
68 }
succinct::sarray sarray_
The sarray that backs the rank data structure.
Definition: perfect_hash.h:63
Query class for the minimal perfect hash functions created by perfect_hash_builder.
Definition: perfect_hash.h:27
succinct::compressed_vector seeds_
The seeds to use for each bucket.
Definition: perfect_hash.h:61
succinct::sarray_rank empty_rank_
The ranking data structure that counts the number of empty slots.
Definition: perfect_hash.h:65
A generic, randomly seeded hash function.
Definition: hash.h:343
uint64_t rank(uint64_t i) const
Definition: sarray.cpp:112
uint64_t bucket_seed_
The seed to use for the bucket hash function.
Definition: perfect_hash.h:57
Query class for rank queries on an sarray succinct data structure.
Definition: sarray.h:101
void pos(const std::string &file, const cpptoml::table &config, bool replace)
Performs part-of-speech tagging on a text file.
Definition: profile.cpp:133
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
A seeded version of farm_hash.
Definition: farm_hash.h:338
Compressed, $O(1)$ time random-access sequences of unsigned 64-bit numbers.
Definition: compressed_vector.h:32
Storage class for the high and low bits of the sarray structure.
Definition: sarray.h:77
uint64_t num_bins_
The number of bins for the perfect hash function.
Definition: perfect_hash.h:59