ModErn Text Analysis
META Enumerates Textual Applications
corpus.h
Go to the documentation of this file.
1 
10 #ifndef META_CORPUS_H_
11 #define META_CORPUS_H_
12 
13 #include <memory>
14 #include <mutex>
15 #include <stdexcept>
16 
17 #include "cpptoml.h"
18 #include "meta/config.h"
19 #include "meta/corpus/document.h"
21 #include "meta/meta.h"
23 #include "meta/util/optional.h"
24 #include "meta/util/progress.h"
25 
26 namespace meta
27 {
28 namespace corpus
29 {
30 
58 class corpus
59 {
60  public:
65  corpus(std::string encoding);
66 
70  virtual bool has_next() const = 0;
71 
75  virtual document next() = 0;
76 
80  virtual uint64_t size() const = 0;
81 
85  virtual metadata::schema_type schema() const;
86 
90  virtual ~corpus() = default;
91 
95  const std::string& encoding() const;
96 
101  bool store_full_text() const;
102 
107  void set_store_full_text(bool store_full_text);
108 
109  protected:
114  std::vector<metadata::field> next_metadata();
115 
116  private:
117  friend std::unique_ptr<corpus> make_corpus(const cpptoml::table&);
118 
119  void set_metadata_parser(metadata_parser&& mdparser);
120 
122  std::string encoding_;
127 };
128 
/**
 * Basic exception for corpus interactions.
 *
 * Adds no state of its own; it inherits every std::runtime_error
 * constructor, so it is built from a message string and can be caught
 * either as corpus_exception or as std::runtime_error.
 */
class corpus_exception : public std::runtime_error
{
  public:
    using std::runtime_error::runtime_error;
};
137 
145 template <class LocalStorage, class ConsumeFunction>
147  LocalStorage&& ls_fn, ConsumeFunction&& consume_fn)
148 {
149  std::mutex mutex;
150  auto task = [&]() {
151  auto local_storage = ls_fn();
152  while (true)
153  {
155  {
156  std::lock_guard<std::mutex> lock{mutex};
157 
158  if (!docs.has_next())
159  return;
160 
161  doc = docs.next();
162  }
163 
164  consume_fn(local_storage, *doc);
165  }
166  };
167 
168  std::vector<std::future<void>> futures;
169  futures.reserve(pool.size());
170  for (std::size_t i = 0; i < pool.size(); ++i)
171  {
172  futures.emplace_back(pool.submit_task(task));
173  }
174  for (auto& fut : futures)
175  fut.get();
176 }
177 }
178 }
179 #endif
virtual bool has_next() const =0
Reads metadata from the metadata file of a corpus according to a schema.
Definition: metadata_parser.h:28
Contains top-level namespace documentation for the META toolkit.
A class for representing optional values.
Definition: optional.h:115
corpus(std::string encoding)
Constructs a new corpus with the given encoding.
Definition: corpus.cpp:17
std::future< typename std::result_of< Function()>::type > submit_task(Function func)
Adds a task to the thread_pool.
Definition: thread_pool.h:72
size_t size() const
Definition: thread_pool.h:111
void set_store_full_text(bool store_full_text)
Definition: corpus.cpp:48
Basic exception for corpus interactions.
Definition: corpus.h:132
std::vector< metadata::field > next_metadata()
Helper function to be used by deriving classes in implementing next() to set the metadata for the current document.
Definition: corpus.cpp:23
virtual metadata::schema_type schema() const
Definition: corpus.cpp:28
util::optional< metadata_parser > mdata_parser_
The metadata parser.
Definition: corpus.h:124
Represents an indexable document.
Definition: document.h:34
Provides an interface to multiple corpus input formats.
Definition: corpus.h:58
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
Represents a collection of a fixed number of threads, which tasks can be added to.
Definition: thread_pool.h:35
virtual document next()=0
friend std::unique_ptr< corpus > make_corpus(const cpptoml::table &)
Convenience method for creating a corpus using the factory.
Definition: corpus_factory.cpp:29
std::string encoding_
The type of encoding this document uses.
Definition: corpus.h:122
bool store_full_text() const
Definition: corpus.cpp:53
virtual ~corpus()=default
Destructor.
bool store_full_text_
Whether to store the original document text.
Definition: corpus.h:126
virtual uint64_t size() const =0
const std::string & encoding() const
Definition: corpus.cpp:38
void parallel_consume(corpus &docs, parallel::thread_pool &pool, LocalStorage &&ls_fn, ConsumeFunction &&consume_fn)
Consumes each document in a corpus using a pool of threads.
Definition: corpus.h:146