ModErn Text Analysis
META Enumerates Textual Applications
analyzer.h
Go to the documentation of this file.
1 
11 #ifndef META_ANALYZER_H_
12 #define META_ANALYZER_H_
13 
14 #include <memory>
15 #include <stdexcept>
16 
18 #include "meta/config.h"
19 
/// Forward declaration of cpptoml::table so this header can name it in
/// `const cpptoml::table&` parameters without a definition of the class.
20 namespace cpptoml
21 {
22 class table;
23 }
24 
25 namespace meta
26 {
27 
/// Forward declaration of corpus::document, which represents an indexable
/// document. Only references to it are used in this header.
28 namespace corpus
29 {
30 class document;
31 }
32 
33 namespace analyzers
34 {
35 
/// Forward declaration of token_stream, the base class that represents a
/// stream of tokens that have been extracted from a document.
36 class token_stream;
37 
/// Forward declaration of multi_analyzer (a class that contains more than one
/// analyzer), needed so that analyzer can name it as a friend.
38 class multi_analyzer;
39 
/// A class that provides a framework to produce token counts from documents.
///
/// Derived classes implement the private tokenize() hook and clone();
/// callers obtain counts through the public analyze() template.
51 class analyzer
52 {
53  public:
/// Virtual destructor: analyzers are handed around as
/// std::unique_ptr<analyzer>, so they are destroyed through the base type.
57  virtual ~analyzer() = default;
58 
/// Tokenizes a document.
///
/// Builds an empty feature_map<T>, wraps it in a featurizer, passes the
/// document and the featurizer to tokenize(), and returns the map.
///
/// @tparam T value type stored in the returned feature_map
/// @param doc the document to tokenize
/// @return the feature_map<T> that was handed to tokenize() via the featurizer
65  template <class T>
66  feature_map<T> analyze(const corpus::document& doc)
67  {
68  feature_map<T> counts;
69  featurizer feats{counts};
70  tokenize(doc, feats);
71  return counts;
72  }
73 
/// @return an owning pointer to a copy of this analyzer; each derived class
/// must provide its own implementation.
77  virtual std::unique_ptr<analyzer> clone() const = 0;
78 
/// multi_analyzer is granted access to the private tokenize() hook.
79  friend multi_analyzer;
80 
81  private:
/// Hook implemented by each concrete analyzer and called by analyze().
///
/// @param doc the document being analyzed
/// @param counts the featurizer built by analyze() around the result map
89  virtual void tokenize(const corpus::document& doc, featurizer& counts) = 0;
90 };
91 
/// Basic exception for analyzer interactions.
///
/// Adds no members of its own: it inherits every std::runtime_error
/// constructor, so it is constructed with a message exactly like its base.
95 class analyzer_exception : public std::runtime_error
96 {
97  public:
98  using std::runtime_error::runtime_error;
99 };
100 
/// Returns an owning pointer to an analyzer for the given configuration table.
/// NOTE(review): which configuration keys are read, and what happens on a bad
/// configuration, is decided in analyzer.cpp and is not visible here.
///
/// @param config the configuration table to read from
105 std::unique_ptr<analyzer> load(const cpptoml::table& config);
106 
/// Returns an owning pointer to a token_stream built from `config`.
/// NOTE(review): by its name this is the default filter chain; the filters it
/// contains are chosen in analyzer.cpp — confirm there.
///
/// @param config the configuration table to read from
112 std::unique_ptr<token_stream>
113 default_filter_chain(const cpptoml::table& config);
114 
/// Returns an owning pointer to a token_stream built from `config`.
/// NOTE(review): by its name this is the default chain for unigram analysis;
/// how it differs from default_filter_chain is defined in analyzer.cpp —
/// confirm there.
///
/// @param config the configuration table to read from
120 std::unique_ptr<token_stream>
121 default_unigram_chain(const cpptoml::table& config);
122 
/// Returns an owning pointer to a token_stream built from two configuration
/// tables.
/// NOTE(review): presumably `global` is the top-level configuration and
/// `config` the analyzer-specific group — verify against analyzer.cpp.
///
/// @param global the first configuration table
/// @param config the second configuration table
128 std::unique_ptr<token_stream> load_filters(const cpptoml::table& global,
129  const cpptoml::table& config);
130 
/// Takes ownership of an existing token_stream and returns an owning pointer
/// to a token_stream.
/// NOTE(review): presumably the result is the filter described by `config`
/// reading from `src` — confirm in analyzer.cpp.
///
/// @param src the source stream; passed by value, so the caller gives up
/// ownership
/// @param config the configuration table to read from
136 std::unique_ptr<token_stream> load_filter(std::unique_ptr<token_stream> src,
137  const cpptoml::table& config);
138 
/// Returns a std::string obtained from the given document.
/// NOTE(review): by its name this is the document's textual content; where
/// that content comes from is decided in analyzer.cpp — confirm there.
///
/// @param doc the document to read
143 std::string get_content(const corpus::document& doc);
144 }
145 }
146 #endif
counts_t counts(const std::string &text, bool contains_label=true)
Definition: libsvm_parser.cpp:34
The multi_analyzer class contains more than one analyzer.
Definition: multi_analyzer.h:33
std::unique_ptr< token_stream > load_filters(const cpptoml::table &global, const cpptoml::table &config)
Definition: analyzer.cpp:80
std::unique_ptr< token_stream > load_filter(std::unique_ptr< token_stream > src, const cpptoml::table &config)
Definition: analyzer.cpp:71
std::unique_ptr< analyzer > load(const cpptoml::table &config)
Definition: analyzer.cpp:104
Basic exception for analyzer interactions.
Definition: analyzer.h:95
Represents an indexable document.
Definition: document.h:34
Provides an interface to multiple corpus input formats.
Definition: corpus.h:58
Used by analyzers to increment feature values in feature_maps generically.
Definition: featurizer.h:42
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
std::unique_ptr< token_stream > default_unigram_chain(const cpptoml::table &config)
Definition: analyzer.cpp:64
A class that provides a framework to produce token counts from documents.
Definition: analyzer.h:51
std::unique_ptr< token_stream > default_filter_chain(const cpptoml::table &config)
Definition: analyzer.cpp:55
std::string get_content(const corpus::document &doc)
Definition: analyzer.cpp:27
An insert-only probing hash table.
Definition: probe_map.h:40
Base class that represents a stream of tokens that have been extracted from a document.
Definition: token_stream.h:29
feature_map< T > analyze(const corpus::document &doc)
Tokenizes a document.
Definition: analyzer.h:66
Definition: analyzer.h:20