ModErn Text Analysis
META Enumerates Textual Applications
chunk_reader.h
Go to the documentation of this file.
1 
10 #ifndef META_INDEX_CHUNK_READER_H_
11 #define META_INDEX_CHUNK_READER_H_
12 
13 #include <algorithm>
14 #include <fstream>
15 #include <memory>
16 #include <numeric>
17 #include <string>
18 
19 #include "meta/config.h"
20 #include "meta/io/filesystem.h"
23 #include "meta/util/progress.h"
24 
25 namespace meta
26 {
27 namespace index
28 {
29 
/**
 * Simple wrapper class to adapt PostingsData to the Record concept for
 * multiway_merge.
 *
 * A record holds one primary key together with the counts associated with
 * that key. Records are ordered and compared by key only, so two records
 * with equal keys can be combined via merge_with().
 *
 * @tparam PostingsData the postings type being adapted; it must provide
 * `primary_key_type`, `count_t`, `read_packed()`, `primary_key()`,
 * `counts()`, `set_counts()` and be constructible from a key.
 */
template <class PostingsData>
class postings_record
{
  public:
    using primary_key_type = typename PostingsData::primary_key_type;
    using count_t = typename PostingsData::count_t;

    /**
     * Constructs an empty record with a value-initialized key.
     */
    postings_record() = default;

    /**
     * Converts an expiring record into a PostingsData. The counts are
     * moved out of this record, which is why the conversion is only
     * available on rvalues.
     */
    operator PostingsData() &&
    {
        PostingsData pdata{key_};
        pdata.set_counts(std::move(counts_));
        return pdata;
    }

    /**
     * Appends the counts of another record to the end of this one.
     * The key of this record is left unchanged and the keys are not
     * compared here.
     *
     * @param other the record to absorb; its counts are empty afterwards
     */
    void merge_with(postings_record&& other)
    {
        std::move(other.counts_.begin(), other.counts_.end(),
                  std::back_inserter(counts_));
        // Swap with a temporary so `other` is left holding a freshly
        // constructed (empty) container rather than moved-from elements.
        count_t{}.swap(other.counts_);
    }

    /**
     * Replaces the contents of this record with the next postings read
     * from the stream.
     *
     * @param in the stream to read from
     * @return whatever PostingsData::read_packed() returns (presumably the
     * number of bytes consumed -- confirm against PostingsData)
     */
    template <class InputStream>
    uint64_t read(InputStream& in)
    {
        PostingsData pdata;
        auto bytes = pdata.read_packed(in);
        key_ = pdata.primary_key();
        counts_ = pdata.counts();
        return bytes;
    }

    /**
     * Orders records by primary key only; counts are ignored.
     */
    bool operator<(const postings_record& other) const
    {
        return key_ < other.key_;
    }

    /**
     * Records are equal when their primary keys are equal; counts are
     * ignored.
     */
    bool operator==(const postings_record& other) const
    {
        return key_ == other.key_;
    }

    /**
     * @return a read-only view of the counts stored in this record
     */
    const count_t& counts() const
    {
        // A const member function can only hand out a const reference to
        // a member; returning `count_t&` here does not compile once the
        // function is instantiated.
        return counts_;
    }

    /**
     * Free-function spelling of read(), defined as a friend so that it is
     * found through argument-dependent lookup on the record.
     *
     * @param is the stream to read from
     * @param record the record to overwrite
     * @return the value returned by record.read(is)
     */
    template <class InputStream>
    friend uint64_t packed_read(InputStream& is, postings_record& record)
    {
        return record.read(is);
    }

  private:
    /// The primary key of this record; value-initialized so that a
    /// default-constructed record can be compared safely.
    primary_key_type key_{};
    /// The counts associated with key_.
    count_t counts_;
};
96 
104 template <class PostingsData>
105 using chunk_reader
107 
121 template <class PostingsData, class ForwardIterator>
122 uint64_t multiway_merge(std::ostream& outstream, ForwardIterator begin,
123  ForwardIterator end)
124 {
125  using input_chunk = chunk_reader<PostingsData>;
126  std::vector<input_chunk> to_merge;
127  to_merge.reserve(static_cast<std::size_t>(std::distance(begin, end)));
128  for (; begin != end; ++begin)
129  to_merge.emplace_back(*begin);
130 
131  return util::multiway_merge(
132  to_merge.begin(), to_merge.end(),
133  [&](PostingsData&& pdata) { pdata.write_packed(outstream); });
134 }
135 }
136 }
137 #endif
Simple wrapper class to adapt PostingsData to the Record concept for multiway_merge.
Definition: chunk_reader.h:35
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, Compare &&record_comp, ShouldMerge &&should_merge, RecordHandler &&output, ProgressTrait=ProgressTrait{})
A generic algorithm for performing an N-way merge on a collection of sorted "chunks".
Definition: multiway_merge.h:100
uint64_t multiway_merge(std::ostream &outstream, ForwardIterator begin, ForwardIterator end)
Performs a multi-way merge sort of all of the provided chunks, writing to the provided output stream.
Definition: chunk_reader.h:122
A simple implementation of the ChunkIterator concept that reads Records from a binary file using io::...
Definition: multiway_merge.h:282