ModErn Text Analysis
META Enumerates Textual Applications
metro_hash.h
Go to the documentation of this file.
1 
10 #ifndef META_HASHING_METRO_HASH_H_
11 #define META_HASHING_METRO_HASH_H_
12 
13 #include <algorithm>
14 #include <array>
15 #include <cassert>
16 #include <cstdint>
17 
18 namespace meta
19 {
20 namespace hashing
21 {
22 
23 namespace metro
24 {
25 inline uint64_t rotr(uint64_t x, uint8_t r)
26 {
27  return (x >> r) | (x << (64 - r));
28 }
29 
30 template <class T>
31 inline T read(const uint8_t*& data)
32 {
33  T ret = *reinterpret_cast<const T*>(data);
34  data += sizeof(T);
35  return ret;
36 }
37 }
38 
47 {
48  public:
49  using result_type = uint64_t;
50 
51  metro_hash(uint64_t seed) : buflen_{0}, seed_{(seed + k2) * k0}, big_{false}
52  {
53  std::fill(std::begin(state_), std::end(state_), seed_);
54  }
55 
56  void operator()(const void* key, std::size_t len)
57  {
58  auto data = reinterpret_cast<const uint8_t*>(key);
59  auto end = data + len;
60 
61  // if the input buffer is partially filled
62  if (buflen_ > 0 && buflen_ < 32)
63  {
64  uint64_t copylen = 32 - buflen_ > len ? len : 32 - buflen_;
65  std::copy(data, data + copylen, buffer_.data() + buflen_);
66 
67  data += copylen;
68  buflen_ += copylen;
69 
70  // if the buffer still isn't filled, bail out for now
71  if (buflen_ < 32)
72  return;
73 
74  // otherwise, we can process the full 32-byte input buffer
75  const uint8_t* buf = buffer_.data();
76  handle_block_32(buf);
77  buflen_ = 0;
78  }
79 
80  // now, process the remaining 32-byte blocks
81  const auto nblocks = (end - data) / 32;
82  for (int i = 0; i < nblocks; ++i)
83  handle_block_32(data);
84 
85  // copy over the remaining 31 bytes or less for finalizing or use
86  // on the next call to operator()
87  if (end - data)
88  {
89  buflen_ = static_cast<std::size_t>(end - data);
90  assert(buflen_ < 32);
91  std::copy(data, end, buffer_.begin());
92  }
93  }
94 
95  explicit operator std::size_t()
96  {
97  // loop finalizer
98  if (big_)
99  {
100  state_[2]
101  ^= metro::rotr((state_[0] + state_[3]) * k0 + state_[1], 37)
102  * k1;
103  state_[3]
104  ^= metro::rotr((state_[1] + state_[2]) * k1 + state_[0], 37)
105  * k0;
106  state_[0]
107  ^= metro::rotr((state_[0] + state_[2]) * k0 + state_[3], 37)
108  * k1;
109  state_[1]
110  ^= metro::rotr((state_[1] + state_[3]) * k1 + state_[2], 37)
111  * k0;
112 
113  state_[0] = seed_ + (state_[0] ^ state_[1]);
114  }
115 
116  // handle any leftover bytes
117  const uint8_t* data = buffer_.data();
118  auto end = data + buflen_;
119 
120  if (end - data >= 16)
121  {
122  state_[1] = state_[0] + metro::read<uint64_t>(data) * k2;
123  state_[1] = metro::rotr(state_[1], 29) * k3;
124 
125  state_[2] = state_[0] + metro::read<uint64_t>(data) * k2;
126  state_[2] = metro::rotr(state_[2], 29) * k3;
127 
128  state_[1] ^= metro::rotr(state_[1] * k0, 21) + state_[2];
129  state_[2] ^= metro::rotr(state_[2] * k3, 21) + state_[1];
130  state_[0] += state_[2];
131  }
132 
133  if (end - data >= 8)
134  {
135  state_[0] += metro::read<uint64_t>(data) * k3;
136  state_[0] ^= metro::rotr(state_[0], 55) * k1;
137  }
138 
139  if (end - data >= 4)
140  {
141  state_[0] += metro::read<uint32_t>(data) * k3;
142  state_[0] ^= metro::rotr(state_[0], 26) * k1;
143  }
144 
145  if (end - data >= 2)
146  {
147  state_[0] += metro::read<uint16_t>(data) * k3;
148  state_[0] ^= metro::rotr(state_[0], 48) * k1;
149  }
150 
151  if (end - data >= 1)
152  {
153  state_[0] += *(data++) * k3;
154  state_[0] ^= metro::rotr(state_[0], 37) * k1;
155  }
156 
157  state_[0] ^= metro::rotr(state_[0], 28);
158  state_[0] *= k0;
159  state_[0] ^= metro::rotr(state_[0], 29);
160 
161  return state_[0];
162  }
163 
164  private:
165  inline void handle_block_32(const uint8_t*& data)
166  {
167  big_ = true;
168  state_[0] += metro::read<uint64_t>(data) * k0;
169  state_[0] = metro::rotr(state_[0], 29) + state_[2];
170  state_[1] += metro::read<uint64_t>(data) * k1;
171  state_[1] = metro::rotr(state_[1], 29) + state_[3];
172  state_[2] += metro::read<uint64_t>(data) * k2;
173  state_[2] = metro::rotr(state_[2], 29) + state_[0];
174  state_[3] += metro::read<uint64_t>(data) * k3;
175  state_[3] = metro::rotr(state_[3], 29) + state_[1];
176  }
177 
178  const static constexpr uint64_t k0 = 0xD6D018F5;
179  const static constexpr uint64_t k1 = 0xA2AA033B;
180  const static constexpr uint64_t k2 = 0x62992FC1;
181  const static constexpr uint64_t k3 = 0x30BC5B29;
182 
183  std::array<uint64_t, 4> state_;
184  std::array<uint8_t, 32> buffer_;
185  uint64_t buflen_;
186  uint64_t seed_;
187  bool big_;
188 };
189 }
190 }
191 #endif
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
Port of the incremental MetroHash64, which was released under the MIT License (see LICENSE...
Definition: metro_hash.h:46