ModErn Text Analysis
META Enumerates Textual Applications
murmur_hash.h
Go to the documentation of this file.
1 
10 #ifndef META_HASHING_MURMUR_HASH_H_
11 #define META_HASHING_MURMUR_HASH_H_
12 
13 #include <algorithm>
14 #include <array>
15 #include <cassert>
16 #include <cstdint>
17 
18 namespace meta
19 {
20 namespace hashing
21 {
22 
27 template <std::size_t = sizeof(std::size_t)>
29 
30 namespace murmur
31 {
32 inline uint32_t rotl(uint32_t x, int8_t r)
33 {
34  return (x << r) | (x >> (32 - r));
35 }
36 
37 inline uint64_t rotl(uint64_t x, int8_t r)
38 {
39  return (x << r) | (x >> (64 - r));
40 }
41 
42 inline uint32_t fmix(uint32_t h)
43 {
44  h ^= h >> 16;
45  h *= 0x85ebca6b;
46  h ^= h >> 13;
47  h *= 0xc2b2ae35;
48  h ^= h >> 16;
49 
50  return h;
51 }
52 
53 inline uint64_t fmix(uint64_t h)
54 {
55  h ^= h >> 33;
56  h *= 0xff51afd7ed558ccdLLU;
57  h ^= h >> 33;
58  h *= 0xc4ceb9fe1a85ec53LLU;
59  h ^= h >> 33;
60 
61  return h;
62 }
63 }
64 
68 template <>
69 class murmur_hash<4>
70 {
71  private:
72  // this *has* to be uint32_t for OS X clang to correctly resolve
73  // between the two versions of rotl/fmix in namespace murmur above.
74  uint32_t out_;
75  std::array<uint8_t, 4> buf_;
76  uint32_t buflen_;
77  uint32_t total_length_;
78 
79  const static constexpr uint32_t c1 = 0xcc9e2d51;
80  const static constexpr uint32_t c2 = 0x1b873593;
81 
82  void handle_block_4(uint32_t block)
83  {
84  block *= c1;
85  block = murmur::rotl(block, 15);
86  block *= c2;
87 
88  out_ ^= block;
89  out_ = murmur::rotl(out_, 13);
90  out_ = out_ * 5 + 0xe6546b64;
91  }
92 
93  public:
94  using result_type = uint32_t;
95 
96  murmur_hash(result_type seed)
97  : out_{seed}, buflen_{0}, total_length_{0}
98  {
99  }
100 
101  void operator()(const void* in, std::size_t len)
102  {
103  auto data = reinterpret_cast<const uint8_t*>(in);
104  total_length_ += static_cast<uint32_t>(len);
105 
106  // handle 4-byte blocks at a time, starting from the data we had
107  // "left over" from the last call to operator()
108  auto end = data + len;
109  while (buflen_ > 0 && buflen_ < 4 && data < end)
110  buf_[buflen_++] = *(data++);
111 
112  if (buflen_ / 4 > 0)
113  {
114  handle_block_4(reinterpret_cast<const uint32_t*>(buf_.data())[0]);
115  buflen_ = 0;
116  }
117 
118  // now handle the remaining 4-byte blocks in this data
119  const auto nblocks = (end - data) / 4;
120  auto blocks = reinterpret_cast<const uint32_t*>(data + nblocks * 4);
121  for (long i = -nblocks; i; ++i)
122  handle_block_4(blocks[i]);
123 
124  // copy over the remaining 3 bytes or less for finalizing or use on
125  // the next call to operator()
126  const uint8_t* tail = data + nblocks * 4;
127  if (end - tail)
128  {
129  buflen_ = static_cast<uint32_t>(end - tail);
130  assert(buflen_ < 4);
131  std::copy(tail, end, buf_.begin());
132  }
133  }
134 
135  explicit operator result_type()
136  {
137  uint32_t k1 = 0;
138  switch (buflen_ & 3)
139  {
140  case 3:
141  k1 ^= static_cast<uint32_t>(buf_[2]) << 16;
142  case 2:
143  k1 ^= static_cast<uint32_t>(buf_[1]) << 8;
144  case 1:
145  k1 ^= buf_[0];
146  k1 *= c1;
147  k1 = murmur::rotl(k1, 15);
148  k1 *= c2;
149  out_ ^= k1;
150  }
151 
152  out_ ^= total_length_;
153 
154  return murmur::fmix(out_);
155  }
156 };
157 
161 template <>
162 class murmur_hash<8>
163 {
164  private:
165  uint64_t h1_;
166  uint64_t h2_;
167  std::array<uint8_t, 16> buf_;
168  std::size_t buflen_;
169  std::size_t total_length_;
170 
171  const static constexpr uint64_t c1 = 0x87c37b91114253d5LLU;
172  const static constexpr uint64_t c2 = 0x4cf5ad432745937fLLU;
173 
174  inline void handle_block_16(const uint8_t* start)
175  {
176  auto blocks = reinterpret_cast<const uint64_t*>(start);
177  auto k1 = blocks[0];
178  auto k2 = blocks[1];
179 
180  k1 *= c1;
181  k1 = murmur::rotl(k1, 31);
182  k1 *= c2;
183  h1_ ^= k1;
184 
185  h1_ = murmur::rotl(h1_, 27);
186  h1_ += h2_;
187  h1_ = h1_ * 5 + 0x52dce729;
188 
189  k2 *= c2;
190  k2 = murmur::rotl(k2, 33);
191  k2 *= c1;
192  h2_ ^= k2;
193 
194  h2_ = murmur::rotl(h2_, 31);
195  h2_ += h1_;
196  h2_ = h2_ * 5 + 0x38495ab5;
197  }
198 
199  public:
200  using result_type = uint64_t;
201 
202  murmur_hash(uint64_t seed)
203  : h1_{seed}, h2_{seed}, buflen_{0}, total_length_{0}
204  {
205  }
206 
207  void operator()(const void* in, std::size_t len)
208  {
209  auto data = reinterpret_cast<const uint8_t*>(in);
210  total_length_ += len;
211 
212  // handle 16-byte blocks at a time, starting from the data we had
213  // "left over" from the last call to operator()
214  auto end = data + len;
215  while (buflen_ > 0 && buflen_ < 16 && data < end)
216  buf_[buflen_++] = *(data++);
217 
218  if (buflen_ == 16)
219  {
220  handle_block_16(buf_.data());
221  buflen_ = 0;
222  }
223 
224  // now handle the remaining 16-byte blocks in this data
225  const auto nblocks = (end - data) / 16;
226  for (int i = 0; i < nblocks; ++i)
227  {
228  handle_block_16(data);
229  data += 16;
230  }
231 
232  // copy over the remaining 15 bytes or less for finalizing or use
233  // on the next call to operator()
234  if (end - data)
235  {
236  buflen_ = static_cast<std::size_t>(end - data);
237  assert(buflen_ < 16);
238  std::copy(data, end, buf_.begin());
239  }
240  }
241 
242  explicit operator result_type()
243  {
244  uint64_t k1 = 0;
245  uint64_t k2 = 0;
246 
247  switch (buflen_)
248  {
249  case 15:
250  k2 ^= static_cast<uint64_t>(buf_[14]) << 48;
251  case 14:
252  k2 ^= static_cast<uint64_t>(buf_[13]) << 40;
253  case 13:
254  k2 ^= static_cast<uint64_t>(buf_[12]) << 32;
255  case 12:
256  k2 ^= static_cast<uint64_t>(buf_[11]) << 24;
257  case 11:
258  k2 ^= static_cast<uint64_t>(buf_[10]) << 16;
259  case 10:
260  k2 ^= static_cast<uint64_t>(buf_[9]) << 8;
261  case 9:
262  k2 ^= static_cast<uint64_t>(buf_[8]);
263  k2 *= c2;
264  k2 = murmur::rotl(k2, 33);
265  k2 *= c1;
266  h2_ ^= k2;
267 
268  case 8:
269  k1 ^= static_cast<uint64_t>(buf_[7]) << 56;
270  case 7:
271  k1 ^= static_cast<uint64_t>(buf_[6]) << 48;
272  case 6:
273  k1 ^= static_cast<uint64_t>(buf_[5]) << 40;
274  case 5:
275  k1 ^= static_cast<uint64_t>(buf_[4]) << 32;
276  case 4:
277  k1 ^= static_cast<uint64_t>(buf_[3]) << 24;
278  case 3:
279  k1 ^= static_cast<uint64_t>(buf_[2]) << 16;
280  case 2:
281  k1 ^= static_cast<uint64_t>(buf_[1]) << 8;
282  case 1:
283  k1 ^= static_cast<uint64_t>(buf_[0]);
284  k1 *= c1;
285  k1 = murmur::rotl(k1, 31);
286  k1 *= c2;
287  h1_ ^= k1;
288  }
289 
290  h1_ ^= total_length_;
291  h2_ ^= total_length_;
292 
293  h1_ += h2_;
294  h2_ += h1_;
295 
296  h1_ = murmur::fmix(h1_);
297  h2_ = murmur::fmix(h2_);
298 
299  h1_ += h2_;
300  // h2 += h1, unneeded since we only want 64-bits.
301 
302  return h1_;
303  }
304 };
305 }
306 }
307 #endif
Implementation of MurmurHash3.
Definition: murmur_hash.h:28
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25