ModErn Text Analysis
META Enumerates Textual Applications
ranker.h
Go to the documentation of this file.
1 
9 #ifndef META_RANKER_H_
10 #define META_RANKER_H_
11 
12 #include <utility>
13 #include <vector>
14 
16 #include "meta/meta.h"
17 
// Forward declarations. This header only names these types through
// references (corpus::document in ranker::score, index::score_data in
// ranking_function::score_one and ::initial_score), so a declaration
// without the full definition is enough here.
18 namespace meta
19 {
20 
21 namespace corpus
22 {
23 class document;
24 }
25 
26 namespace index
27 {
28 struct score_data;
29 }
30 }
31 
32 namespace meta
33 {
34 namespace index
35 {
36 
// NOTE(review): the declaration line that opens this body (listing line 40)
// is absent from this listing. Judging by the constructor name it is
// presumably `struct search_result` -- confirm against the original header.
//
// Holds one scored document: a document id paired with a score.
41 {
// Stores the given document id and score in the two members below.
42  search_result(doc_id id, float s) : d_id{id}, score{s}
43  {
44  // nothing
45  }
// Id of the scored document.
46  doc_id d_id;
// Score associated with d_id.
47  float score;
48 };
49 
53 namespace detail
54 {
// NOTE(review): the declaration line that opens this body (listing line 55)
// is absent from this listing. Judging by the constructor name and by
// `detail::postings_context` being used elsewhere in this header, it is
// presumably `struct postings_context` -- confirm against the original.
//
// Bundles a postings stream for one query term with an iterator pair over
// it and the per-term values read from the stream at construction time.
56 {
// Iterator type used to walk a postings_stream keyed by doc_id.
58  using iterator = postings_stream<doc_id>::iterator;
59 
// NOTE(review): the constructor below initializes a member named `stream`,
// but its declaration (listing line 60) is absent from this listing. Since
// `begin` and `end` are initialized from `stream`, it must be declared
// before them -- confirm against the original header.

// Current position in the stream; starts at stream.begin().
61  iterator begin;
// End of the stream (stream.end()).
62  iterator end;
// Id of the term this context was built for.
63  term_id t_id;
// Weight of the term in the query, as passed to the constructor (qtf).
64  float query_term_weight;
// Taken from stream.size().
65  uint64_t doc_count;
// Taken from stream.total_counts().
66  uint64_t corpus_term_count;
67 
// Takes ownership of `strm` by moving it into `stream`, then derives the
// iterator pair and the two counts from the stored stream.
//
// strm: postings stream for the term
// qtf:  weight of the term in the query
// term: id of the term
68  postings_context(postings_stream<doc_id> strm, float qtf, term_id term)
69  : stream{std::move(strm)},
70  begin{stream.begin()},
71  end{stream.end()},
72  t_id{term},
73  query_term_weight{qtf},
74  doc_count{stream.size()},
75  corpus_term_count{stream.total_counts()}
76  {
77  // nothing
78  }
79 };
80 
// Resolves a term given as a string to its term_id by asking the index.
//
// inv:  index used for the lookup
// term: the term text
// Returns whatever inv.get_term_id(term) returns.
81 inline term_id get_term_id(disk_index& inv, const std::string& term)
82 {
83  return inv.get_term_id(term);
84 }
85 
// Overload for callers that already hold a term_id: the index argument is
// ignored and the id is returned unchanged. Together with the string
// overload this lets the ranker_context constructor call
// detail::get_term_id with either kind of key.
86 inline term_id get_term_id(disk_index&, term_id tid)
87 {
88  return tid;
89 }
90 }
91 
// NOTE(review): the declaration line that opens this body (listing line 103)
// is absent from this listing. Judging by the constructor name it is
// presumably `struct ranker_context` -- confirm against the original header.
//
// Per-query state handed to ranker::rank: one postings_context per query
// term that has a postings stream, the accumulated query length, and the
// smallest document id found at the front of any filtered postings list.
104 {
// Builds the context from a range of query elements.
//
// inv:        index to search; kept by reference in `idx`
// begin, end: range of query elements; each element is read through
//             hashing::kv_traits as a (key, value) pair, where the key is
//             passed to detail::get_term_id and the value is used as the
//             term's weight
// filter:     callable invoked with a document id; documents for which it
//             returns false are skipped at the front of each postings list
105  template <class ForwardIterator, class FilterFunction>
106  ranker_context(inverted_index& inv, ForwardIterator begin,
107  ForwardIterator end, FilterFunction&& filter)
// cur_doc starts at idx.num_docs() and is only ever lowered below.
108  : idx(inv), cur_doc{idx.num_docs()}
109  {
// One slot per query element avoids reallocation while emplacing below.
110  postings.reserve(static_cast<std::size_t>(std::distance(begin, end)));
111 
112  query_length = 0.0;
113  for (; begin != end; ++begin)
114  {
115  const auto& count = *begin;
116 
117  using kv_traits = hashing::kv_traits<
118  typename std::decay<decltype(count)>::type>;
119 
// The weight is added before the stream check, so elements whose term
// has no postings stream still contribute to query_length.
120  query_length += kv_traits::value(count);
121  auto term = detail::get_term_id(inv, kv_traits::key(count));
122  auto pstream = idx.stream_for(term);
// No stream for this term: nothing to add to `postings`.
123  if (!pstream)
124  continue;
125 
126  postings.emplace_back(*pstream, kv_traits::value(count), term);
127 
// Advance past leading documents that the filter rejects.
128  while (postings.back().begin != postings.back().end
129  && !filter(postings.back().begin->first))
130  ++postings.back().begin;
131 
// Track the minimum first document id over all non-exhausted lists.
132  if (postings.back().begin != postings.back().end)
133  {
134  if (postings.back().begin->first < cur_doc)
135  cur_doc = postings.back().begin->first;
136  }
137  }
138  }
139 
// Index being searched (not owned).
140  inverted_index& idx;
// One entry per query element whose term had a postings stream.
141  std::vector<detail::postings_context> postings;
// Sum of the values of all query elements.
142  float query_length;
// Smallest document id at the front of any postings list after filtering,
// or idx.num_docs() if no list had an accepted document.
143  doc_id cur_doc;
144 };
145 
// Exception type for ranker-related errors. It adds nothing to
// std::runtime_error beyond a distinct type; the constructors are inherited.
// NOTE(review): <stdexcept> is not among the includes visible in this
// listing; presumably it arrives through another header -- confirm.
149 class ranker_exception : public std::runtime_error
150 {
151  public:
152  using std::runtime_error::runtime_error;
153 };
154 
// Abstract base class for rankers: scores a query against an inverted
// index and returns a list of search_result. Subclasses implement rank()
// and save().
// NOTE(review): std::function and std::ostream are used below, but
// <functional> and <ostream>/<iosfwd> are not among the includes visible in
// this listing; presumably they arrive through another header -- confirm.
159 class ranker
160 {
161  public:
// Type-erased predicate over document ids used to filter documents.
162  using filter_function_type = std::function<bool(doc_id did)>;
163 
// Default filter: accepts every document.
164  static bool passthrough(doc_id)
165  {
166  return true;
167  }
168 
// Scores a query given as a range of elements.
//
// idx:         index to search
// begin, end:  range of query elements; ranker_context reads each one
//              through hashing::kv_traits as a (key, value) pair
// num_results: forwarded to rank(); presumably the maximum number of
//              results to return -- confirm against the implementations
// filter:      predicate over document ids; it is applied while the
//              ranker_context is built and is also forwarded to rank()
// Returns the result of rank() on the freshly built context.
178  template <class ForwardIterator, class Function = bool (*)(doc_id)>
179  std::vector<search_result>
180  score(inverted_index& idx, ForwardIterator begin, ForwardIterator end,
181  uint64_t num_results = 10, Function&& filter = passthrough)
182  {
183  ranker_context ctx{idx, begin, end, filter};
184  return rank(ctx, num_results, filter);
185  }
186 
// Scores a query given as a document. Declared here only; the definition
// is not part of this listing. The default filter accepts every document,
// like passthrough above.
194  std::vector<search_result>
195  score(inverted_index& idx, const corpus::document& query,
196  uint64_t num_results = 10,
197  const filter_function_type& filter = [](doc_id) { return true; });
198 
// Virtual so that derived rankers are destroyed correctly through a
// pointer or reference to ranker.
202  virtual ~ranker() = default;
203 
// Writes this ranker to `out`; the format is defined by each subclass.
208  virtual void save(std::ostream& out) const = 0;
209 
// Performs the actual ranking on a prepared context; called by the
// range-based score() above.
//
// ctx:         per-query state built by ranker_context's constructor
// num_results: as passed to score()
// filter:      as passed to score()
220  virtual std::vector<search_result> rank(ranker_context& ctx,
221  uint64_t num_results,
222  const filter_function_type& filter)
223  = 0;
224 };
225 
// A ranker whose rank() is fixed (declared final here) and which exposes
// score_one() and initial_score() as the functions for subclasses to
// provide or override. The definitions of rank() and initial_score() are
// not part of this listing.
226 class ranking_function : public ranker
227 {
228  public:
// Computes a score from the given score_data; must be implemented by
// subclasses. Presumably called once per matched (term, document) pair by
// rank() -- confirm against the implementation.
234  virtual float score_one(const score_data& sd) = 0;
235 
// Returns a starting score for the given score_data; subclasses may
// override it.
241  virtual float initial_score(const score_data& sd) const;
242 
// Implements ranker::rank and marks it final, so subclasses cannot
// replace it.
243  virtual std::vector<search_result>
244  rank(ranker_context& ctx, uint64_t num_results,
245  const filter_function_type& filter) override final;
246 };
247 }
248 }
249 #endif
Contains top-level namespace documentation for the META toolkit.
The inverted_index class stores information on a corpus indexed by term_ids.
Definition: inverted_index.h:65
Holds generic data structures and functions that inverted_index and forward_index both use...
Definition: disk_index.h:53
term_id get_term_id(const std::string &term)
Definition: disk_index.cpp:35
A ranker scores a query against all the documents in an inverted index, returning a list of documents...
Definition: ranker.h:159
Stores a list of postings_stream and other relevant information for performing document-at-a-time ran...
Definition: ranker.h:103
uint64_t size() const
Definition: postings_stream.h:83
Definition: ranker.h:226
iterator begin() const
Definition: postings_stream.h:202
A simple struct to hold scored document data.
Definition: ranker.h:40
Exception class for ranker interactions.
Definition: ranker.h:149
iterator end() const
Definition: postings_stream.h:210
FeatureValue total_counts() const
Definition: postings_stream.h:92
Represents an indexable document.
Definition: document.h:34
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
A stream for extracting the postings list for a specific key in a postings file.
Definition: postings_stream.h:32
A class to represent the per-PrimaryKey data in an index's postings file.
Definition: forward_index.h:34
A score_data object contains information needed to evaluate a ranking function.
Definition: score_data.h:40
Definition: hash_storage.h:70
std::vector< search_result > score(inverted_index &idx, ForwardIterator begin, ForwardIterator end, uint64_t num_results=10, Function &&filter=passthrough)
Definition: ranker.h:180