ModErn Text Analysis
META Enumerates Textual Applications
farm_hash.h
Go to the documentation of this file.
1 
10 #ifndef META_HASHING_FARM_HASH_H_
11 #define META_HASHING_FARM_HASH_H_
12 
13 #include <algorithm>
14 #include <array>
15 #include <cassert>
16 #include <cstdint>
17 #include <cstring>
18 #include <utility>
19 
20 namespace meta
21 {
22 namespace hashing
23 {
24 
25 namespace farm
26 {
27 
28 template <class T>
29 inline T fetch(const uint8_t* data)
30 {
31  static_assert(std::is_integral<T>::value, "fetch only defined for ints");
32  T result;
33  std::memcpy(&result, data, sizeof(T));
34  return result;
35 }
36 
37 inline uint32_t rotate(uint32_t val, int shift)
38 {
39  // Avoid shifting by 32: doing so yields an undefined result.
40  return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
41 }
42 
43 inline uint64_t rotate(uint64_t val, int shift)
44 {
45  // Avoid shifting by 64: doing so yields an undefined result.
46  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
47 }
48 
49 // Some primes between 2^63 and 2^64 for various uses.
50 const static constexpr uint64_t k0 = 0xc3a5c85c97cb3127ULL;
51 const static constexpr uint64_t k1 = 0xb492b66fbe98f273ULL;
52 const static constexpr uint64_t k2 = 0x9ae16a3b2f90404fULL;
53 
54 // Magic numbers for 32-bit hashing. Copied from Murmur3.
55 const static constexpr uint32_t c1 = 0xcc9e2d51;
56 const static constexpr uint32_t c2 = 0x1b873593;
57 
58 // A 32-bit to 32-bit integer hash copied from Murmur3.
59 inline uint32_t fmix(uint32_t h)
60 {
61  h ^= h >> 16;
62  h *= 0x85ebca6b;
63  h ^= h >> 13;
64  h *= 0xc2b2ae35;
65  h ^= h >> 16;
66 
67  return h;
68 }
69 
70 inline uint64_t shift_mix(uint64_t val)
71 {
72  return val ^ (val >> 47);
73 }
74 
75 struct u128
76 {
77  uint64_t low;
78  uint64_t high;
79 
80  u128() = default;
81  u128(uint64_t l, uint64_t h) : low{l}, high{h}
82  {
83  // nothing
84  }
85 };
86 
87 inline uint64_t hash_len_16(uint64_t u, uint64_t v,
88  uint64_t mul = 0x9ddfea08eb382d69ULL)
89 {
90  // Murmur-inspired hashing.
91  auto a = (u ^ v) * mul;
92  a ^= (a >> 47);
93  auto b = (v ^ a) * mul;
94  b ^= (b >> 47);
95  b *= mul;
96  return b;
97 }
98 
99 inline uint64_t hash_len_0_to_16(const uint8_t* data, std::size_t len)
100 {
101  if (len >= 8)
102  {
103  auto mul = k2 + len * 2;
104  auto a = fetch<uint64_t>(data) + k2;
105  auto b = fetch<uint64_t>(data + len - 8);
106  auto c = rotate(b, 37) * mul + a;
107  auto d = (rotate(a, 25) + b) * mul;
108  return hash_len_16(c, d, mul);
109  }
110  if (len >= 4)
111  {
112  auto mul = k2 + len * 2;
113  uint64_t a = fetch<uint32_t>(data);
114  auto b = len + (a << 3);
115  auto c = fetch<uint32_t>(data + len - 4);
116  return hash_len_16(b, c, mul);
117  }
118  if (len > 0)
119  {
120  auto a = data[0];
121  auto b = data[len >> 1];
122  auto c = data[len - 1];
123  uint32_t y = static_cast<uint32_t>(a) + (static_cast<uint32_t>(b) << 8);
124  auto z = static_cast<uint32_t>(len + (static_cast<uint32_t>(c) << 2));
125  return shift_mix(y * k2 ^ z * k0) * k2;
126  }
127  return k2;
128 }
129 
130 inline uint64_t hash_len_17_to_32(const uint8_t* data, std::size_t len)
131 {
132  auto mul = k2 + len * 2;
133  auto a = fetch<uint64_t>(data) * k1;
134  auto b = fetch<uint64_t>(data + 8);
135  auto c = fetch<uint64_t>(data + len - 8) * mul;
136  auto d = fetch<uint64_t>(data + len - 16) * k2;
137  return hash_len_16(rotate(a + b, 43) + rotate(c, 30) + d,
138  a + rotate(b + k2, 18) + c, mul);
139 }
140 
141 inline u128 weak_hash_len_32_with_seeds(const uint64_t* buffer, uint64_t a,
142  uint64_t b)
143 {
144  a += buffer[0];
145  b = rotate(b + a + buffer[3], 21);
146  auto c = a;
147  a += buffer[1];
148  a += buffer[2];
149  b += rotate(a, 44);
150  return {a + buffer[3], b + c};
151 }
152 
153 inline uint64_t hash_len_33_to_64(const uint8_t* data, std::size_t len)
154 {
155  auto mul = k2 + len * 2;
156  auto a = fetch<uint64_t>(data) * k2;
157  auto b = fetch<uint64_t>(data + 8);
158  auto c = fetch<uint64_t>(data + len - 8) * mul;
159  auto d = fetch<uint64_t>(data + len - 16) * k2;
160  auto y = rotate(a + b, 43) + rotate(c, 30) + d;
161  auto z = hash_len_16(y, a + rotate(b + k2, 18) + c, mul);
162  auto e = fetch<uint64_t>(data + 16) * mul;
163  auto f = fetch<uint64_t>(data + 24);
164  auto g = (y + fetch<uint64_t>(data + len - 32)) * mul;
165  auto h = (z + fetch<uint64_t>(data + len - 24)) * mul;
166  return hash_len_16(rotate(e + f, 43) + rotate(g, 30) + h,
167  e + rotate(f + a, 18) + g, mul);
168 }
169 }
170 
185 {
186  private:
187  // hashing state consists of 56 bytes for input data longer than 64
188  // bytes
189  uint64_t x_;
190  uint64_t y_;
191  uint64_t z_;
192  farm::u128 v_;
193  farm::u128 w_;
194 
195  // we maintain a buffer of 64 bytes since farm operates in chunks of
196  // that size or less
197  std::array<uint64_t, 8> buffer_;
198 
199  // buf_pos_ is where we'll start copying input data into the buffer
200  uint8_t* buf_pos_;
201 
202  // indicates whether we've done the initialization mix or not. this
203  // allows us to skip that block of code if the key is <= 64 bytes long
204  bool mixed_;
205 
206  inline void handle_block_64()
207  {
208  x_ = farm::rotate(x_ + y_ + v_.low + buffer_[1], 37) * farm::k1;
209  y_ = farm::rotate(y_ + v_.high + buffer_[6], 42) * farm::k1;
210  x_ ^= w_.high;
211  y_ += v_.low + buffer_[5];
212  z_ = farm::rotate(z_ + w_.low, 33) * farm::k1;
213  v_ = farm::weak_hash_len_32_with_seeds(buffer_.data(),
214  v_.high * farm::k1, x_ + w_.low);
215  w_ = farm::weak_hash_len_32_with_seeds(buffer_.data() + 4, z_ + w_.high,
216  y_ + buffer_[2]);
217  std::swap(z_, x_);
218  }
219 
220  inline uint64_t finalize(std::size_t len)
221  {
222  // the last bit of FarmHash operates on the last 64 bytes of input,
223  // in order. We have that, but it's not in the correct order within
224  // buffer_ since it's a circular buffer. Rotate fixes that and puts
225  // all the last 64 bytes in "chronological" order.
226  uint8_t* buf_start = reinterpret_cast<uint8_t*>(buffer_.data());
227  std::rotate(buf_start, buf_start + len, buf_start + 64);
228 
229  auto mul = farm::k1 + ((z_ & 0xff) << 1);
230  w_.low += ((len - 1) & 63);
231  v_.low += w_.low;
232  w_.low += v_.low;
233  x_ = farm::rotate(x_ + y_ + v_.low + buffer_[1], 37) * mul;
234  y_ = farm::rotate(y_ + v_.high + buffer_[6], 42) * mul;
235  x_ ^= w_.high * 9;
236  y_ += v_.low * 9 + buffer_[5];
237  z_ = farm::rotate(z_ + w_.low, 33) * mul;
238  v_ = farm::weak_hash_len_32_with_seeds(buffer_.data(), v_.high * mul,
239  x_ + w_.low);
240  w_ = farm::weak_hash_len_32_with_seeds(buffer_.data() + 4, z_ + w_.high,
241  y_ + buffer_[2]);
242  std::swap(z_, x_);
243  return farm::hash_len_16(farm::hash_len_16(v_.low, w_.low, mul)
244  + farm::shift_mix(y_) * farm::k0 + z_,
245  farm::hash_len_16(v_.high, w_.high, mul) + x_,
246  mul);
247  }
248 
249  public:
250  using result_type = uint64_t;
251 
252  farm_hash()
253  : buf_pos_{reinterpret_cast<uint8_t*>(buffer_.data())}, mixed_{false}
254  {
255  // nothing
256  }
257 
258  inline void operator()(const void* in, std::size_t len)
259  {
260  auto data = reinterpret_cast<const uint8_t*>(in);
261  // determine how much space we've got left in the buffer
262  auto buf_start = reinterpret_cast<uint8_t*>(buffer_.data());
263  auto buf_remaining
264  = static_cast<std::size_t>(buf_start + 64 - buf_pos_);
265 
266  // if we can fit the entire key into our buffer, so do so and bail
267  if (len <= buf_remaining)
268  {
269  std::copy(data, data + len, buf_pos_);
270  buf_pos_ += len;
271  return;
272  }
273 
274  std::size_t bytes = 0;
275  std::copy(data, data + buf_remaining, buf_pos_);
276  bytes += buf_remaining;
277 
278  // we have more than 64 total bytes to be hashed, so check if we
279  // need to do our initialization mix
280  if (!mixed_)
281  {
282  const constexpr uint64_t seed = 81;
283  x_ = seed;
284  y_ = seed * farm::k1 + 113;
285  z_ = farm::shift_mix(y_ * farm::k2 + 113) * farm::k2;
286  v_.low = v_.high = 0;
287  w_.low = w_.high = 0;
288  x_ = x_ * farm::k2 + buffer_[0];
289  mixed_ = true;
290  }
291 
292  // start hashing out of buffer_ blocks of size 64, refilling as
293  // necessary
294  handle_block_64();
295  while (len - bytes > 64)
296  {
297  std::copy(data + bytes, data + bytes + 64, buf_start);
298  bytes += 64;
299  handle_block_64();
300  }
301  std::copy(data + bytes, data + len, buf_start);
302  buf_pos_ = buf_start + (len - bytes);
303  }
304 
305  inline explicit operator result_type()
306  {
307  auto buf_start = reinterpret_cast<uint8_t*>(buffer_.data());
308  auto len = static_cast<std::size_t>(buf_pos_ - buf_start);
309 
310  if (!mixed_)
311  {
312  if (len <= 32)
313  {
314  if (len <= 16)
315  {
316  return farm::hash_len_0_to_16(buf_start, len);
317  }
318  else
319  {
320  return farm::hash_len_17_to_32(buf_start, len);
321  }
322  }
323  else
324  {
325  return farm::hash_len_33_to_64(buf_start, len);
326  }
327  }
328  else
329  {
330  return finalize(len);
331  }
332  }
333 };
334 
339 {
340  private:
341  // the initial seeds for the algorithm, used during finalization
342  farm::u128 seed_;
343 
344  public:
345  using farm_hash::result_type;
346 
347  farm_hash_seeded(uint64_t seed) : farm_hash_seeded{farm::k2, seed}
348  {
349  // nothing
350  }
351 
352  farm_hash_seeded(uint64_t seed0, uint64_t seed1) : seed_{seed0, seed1}
353  {
354  // nothing
355  }
356 
357  inline explicit operator result_type()
358  {
359  auto result = static_cast<result_type>(static_cast<farm_hash&>(*this));
360  return farm::hash_len_16(result - seed_.low, seed_.high);
361  }
362 };
363 }
364 }
365 #endif
Definition: farm_hash.h:75
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
A seeded version of farm_hash.
Definition: farm_hash.h:338
Implementation of FarmHash64.
Definition: farm_hash.h:184