ModErn Text Analysis
META Enumerates Textual Applications
libsvm_corpus.h
Go to the documentation of this file.
1 
10 #ifndef META_CORPUS_LIBSVM_CORPUS_H_
11 #define META_CORPUS_LIBSVM_CORPUS_H_
12 
13 #include <fstream>
14 
15 #include "meta/config.h"
16 #include "meta/corpus/corpus.h"
18 
19 namespace meta
20 {
21 namespace corpus
22 {
23 
29 class libsvm_corpus : public corpus
30 {
31  public:
33  const static util::string_view id;
34 
36  enum class label_type
37  {
38  CLASSIFICATION,
39  REGRESSION
40  };
41 
50  libsvm_corpus(const std::string& file,
51  label_type type = label_type::CLASSIFICATION,
52  uint64_t num_docs = 0);
53 
54  bool has_next() const override;
55 
56  document next() override;
57 
58  uint64_t size() const override;
59 
60  metadata::schema_type schema() const override;
61 
62  private:
64  doc_id cur_id_;
65 
68 
70  uint64_t num_lines_;
71 
73  std::string next_content_;
74 
76  std::ifstream input_;
77 };
78 
83 template <>
84 std::unique_ptr<corpus>
86  const cpptoml::table& config);
87 }
88 }
89 #endif
std::ifstream input_
The stream being read from.
Definition: libsvm_corpus.h:76
std::string next_content_
The next document.
Definition: libsvm_corpus.h:73
label_type
The label type for the corpus.
Definition: libsvm_corpus.h:36
uint64_t size() const override
Definition: libsvm_corpus.cpp:80
bool has_next() const override
Definition: libsvm_corpus.cpp:36
document next() override
Definition: libsvm_corpus.cpp:41
static const util::string_view id
The identifier for this corpus.
Definition: libsvm_corpus.h:33
A non-owning reference to a string.
Definition: string_view.h:51
label_type lbl_type_
The label type.
Definition: libsvm_corpus.h:67
Represents an indexable document.
Definition: document.h:34
Provides an interface to multiple corpus input formats.
Definition: corpus.h:58
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
metadata::schema_type schema() const override
Definition: libsvm_corpus.cpp:68
uint64_t num_lines_
The number of lines in the file.
Definition: libsvm_corpus.h:70
Fills document objects with content line-by-line from a libsvm-formatted input file.
Definition: libsvm_corpus.h:29
std::unique_ptr< corpus > make_corpus< libsvm_corpus >(util::string_view prefix, util::string_view dataset, const cpptoml::table &config)
Specialization of the factory method used to create libsvm_corpus instances.
Definition: libsvm_corpus.cpp:86
doc_id cur_id_
The current document we are on.
Definition: libsvm_corpus.h:64
libsvm_corpus(const std::string &file, label_type type=label_type::CLASSIFICATION, uint64_t num_docs=0)
Definition: libsvm_corpus.cpp:18