ModErn Text Analysis
META Enumerates Textual Applications
postings_buffer.h
Go to the documentation of this file.
1 
10 #ifndef META_INDEX_POSTINGS_BUFFER_H_
11 #define META_INDEX_POSTINGS_BUFFER_H_
12 
13 #include <cstdint>
14 #include <memory>
15 #include <vector>
16 
17 #include "meta/config.h"
19 #include "meta/io/packed.h"
20 #include "meta/util/shim.h"
21 
22 namespace meta
23 {
24 namespace index
25 {
26 
27 namespace detail
28 {
/**
 * Reports the number of bytes used by a std::string.
 *
 * This overload is only viable when T is exactly std::string.
 *
 * @param elem The string to measure
 * @return the string's capacity()
 */
template <class T>
uint64_t bytes_used(
    const T& elem,
    typename std::enable_if<std::is_same<T, std::string>::value>::type*
    = nullptr)
{
    const auto reserved = elem.capacity();
    return reserved;
}
40 
/**
 * Reports the number of bytes used by any type other than std::string.
 *
 * This overload is only viable when T is not std::string.
 *
 * @param elem The element to measure
 * @return sizeof(elem)
 */
template <class T>
uint64_t bytes_used(
    const T& elem,
    typename std::enable_if<!std::is_same<T, std::string>::value>::type*
    = nullptr)
{
    const uint64_t footprint = sizeof(elem);
    return footprint;
}
52 }
53 
63 template <class PrimaryKey, class SecondaryKey, class FeatureValue = uint64_t>
65 {
66  private:
67  using byte_type = uint8_t;
68  using buffer_type = std::vector<byte_type>;
69  using const_buffer_iterator = buffer_type::const_iterator;
70 
73  {
74  buffer_input_stream(const_buffer_iterator it) : it_{it}
75  {
76  // nothing
77  }
78 
79  char get()
80  {
81  return *it_++;
82  }
83 
84  const_buffer_iterator it_;
85  };
86 
87  public:
91  postings_buffer(PrimaryKey pk) : pk_(std::move(pk))
92  {
93  // nothing
94  }
95 
99  const PrimaryKey& primary_key() const
100  {
101  return pk_;
102  }
103 
110  void write_count(SecondaryKey id, FeatureValue count)
111  {
112  ++num_ids_;
113  total_counts_ += count;
114 
115  assert(id >= last_id_);
116  io::packed::write(buffer_, id - last_id_);
117  io::packed::write(buffer_, count);
118 
119  last_id_ = id;
120  }
121 
126  std::size_t bytes_used() const
127  {
128  auto bytes = buffer_.size_;
129 
130  // this only matters when PrimaryKey is std::string.
131  // if the capacity of the string is bigger than the size of the
132  // string itself, then we know it must also be using heap memory,
133  // which we haven't accounted for already.
134  if (detail::bytes_used(pk_) > sizeof(PrimaryKey))
135  bytes += detail::bytes_used(pk_);
136  return bytes;
137  }
138 
144  template <class OutputStream>
145  uint64_t write_packed(OutputStream& os)
146  {
147  auto bytes = io::packed::write(os, pk_);
148  bytes += io::packed::write(os, num_ids_);
149  bytes += io::packed::write(os, total_counts_);
150 
151  buffer_.write(os);
152  return bytes + buffer_.size_;
153  }
154 
159  {
160  return {reinterpret_cast<const char*>(buffer_.bytes_.get()), num_ids_,
161  total_counts_};
162  }
163 
169  bool operator<(const postings_buffer& rhs) const
170  {
171  return pk_ < rhs.pk_;
172  }
173 
178  bool operator==(const postings_buffer& rhs) const
179  {
180  return pk_ == rhs.pk_;
181  }
182 
183  private:
185  struct char_buffer
186  {
188  char_buffer() : size_{0}, pos_{0}
189  {
190  }
191 
196  char_buffer(const char_buffer& other)
197  : size_{other.size_}, pos_{other.pos_}
198  {
199  if (other.bytes_)
200  {
201  bytes_ = make_unique<uint8_t[]>(size_);
202  std::copy(other.bytes_.get(), other.bytes_.get() + pos_,
203  bytes_.get());
204  }
205  }
206 
208  char_buffer(char_buffer&&) = default;
209 
215  {
216  char_buffer copy{rhs};
217  swap(copy);
218  return *this;
219  }
220 
222  char_buffer& operator=(char_buffer&&) = default;
223 
228  void swap(char_buffer& other)
229  {
230  using std::swap;
231  swap(size_, other.size_);
232  swap(pos_, other.pos_);
233  swap(bytes_, other.bytes_);
234  }
235 
240  void put(char byte)
241  {
242  if (size_ == pos_)
243  resize();
244  bytes_[pos_] = static_cast<uint8_t>(byte);
245  ++pos_;
246  }
247 
251  void resize()
252  {
253  if (size_ == 0)
254  {
255  size_ = 8;
256  }
257  else
258  {
259  // 1.5x resize
260  size_ += (size_ + 1) / 2;
261  }
262 
263  auto newbytes = make_unique<uint8_t[]>(size_);
264  std::copy(bytes_.get(), bytes_.get() + pos_, newbytes.get());
265  std::swap(newbytes, bytes_);
266  }
267 
272  template <class OutputStream>
273  void write(OutputStream& os) const
274  {
275  os.write(reinterpret_cast<const char*>(bytes_.get()),
276  static_cast<std::streamsize>(pos_));
277  }
278 
280  std::unique_ptr<uint8_t[]> bytes_;
282  std::size_t size_;
284  std::size_t pos_;
285 
286  } buffer_;
287 
289  PrimaryKey pk_;
291  SecondaryKey last_id_ = SecondaryKey{0};
293  uint64_t num_ids_ = 0;
295  FeatureValue total_counts_ = 0;
296 };
297 
298 template <class HashAlgorithm, class PrimaryKey, class SecondaryKey>
299 void hash_append(HashAlgorithm& h,
301 {
302  using hashing::hash_append;
303  hash_append(h, pb.primary_key());
304 }
305 }
306 }
307 
308 namespace std
309 {
310 template <class PrimaryKey, class SecondaryKey>
311 struct hash<meta::index::postings_buffer<PrimaryKey, SecondaryKey>>
312 {
314  std::size_t operator()(const pbuffer_type& pbuffer) const
315  {
316  return std::hash<PrimaryKey>{}(pbuffer.primary_key());
317  }
318 };
319 }
320 #endif
char_buffer()
Constructs an empty buffer.
Definition: postings_buffer.h:188
const PrimaryKey & primary_key() const
Definition: postings_buffer.h:99
postings_buffer(PrimaryKey pk)
Creates a postings_buffer for a specific primary key.
Definition: postings_buffer.h:91
std::unique_ptr< uint8_t[]> bytes_
The bytes in this buffer.
Definition: postings_buffer.h:280
STL namespace.
void resize()
Resizes the buffer to 1.5x its old size.
Definition: postings_buffer.h:251
uint64_t bytes_used(const T &elem, typename std::enable_if< std::is_same< T, std::string >::value >::type *=nullptr)
Gets the bytes used by a std::string.
Definition: postings_buffer.h:33
postings_stream< SecondaryKey, FeatureValue > stream() const
Definition: postings_buffer.h:158
std::size_t size_
The current size of the buffer.
Definition: postings_buffer.h:282
char_buffer(const char_buffer &other)
Copies an existing buffer.
Definition: postings_buffer.h:196
char_buffer & operator=(const char_buffer &rhs)
Definition: postings_buffer.h:214
void write(OutputStream &os) const
Writes all the bytes in this buffer to the output stream.
Definition: postings_buffer.h:273
std::size_t pos_
The current byte position in the buffer.
Definition: postings_buffer.h:284
Represents the postings list for an in-memory chunk associated with a specific PrimaryKey (usually a s...
Definition: postings_buffer.h:64
std::size_t bytes_used() const
Definition: postings_buffer.h:126
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
bool operator==(const postings_buffer &rhs) const
Definition: postings_buffer.h:178
A stream for extracting the postings list for a specific key in a postings file.
Definition: postings_stream.h:32
void write_count(SecondaryKey id, FeatureValue count)
Writes a postings entry to the in-memory byte buffer in compressed format.
Definition: postings_buffer.h:110
PrimaryKey pk_
The primary key for the buffer.
Definition: postings_buffer.h:289
void swap(char_buffer &other)
Swaps the current buffer with the argument.
Definition: postings_buffer.h:228
bool operator<(const postings_buffer &rhs) const
Definition: postings_buffer.h:169
void put(char byte)
Writes a single byte to the buffer, resizing if needed.
Definition: postings_buffer.h:240
uint64_t write_packed(OutputStream &os)
Writes this buffer directly to an output stream.
Definition: postings_buffer.h:145
A simple input stream that reads from a buffer using an iterator.
Definition: postings_buffer.h:72
A simple byte buffer that resizes with a 1.5x policy when full.
Definition: postings_buffer.h:185