ModErn Text Analysis
META Enumerates Textual Applications
disk_index_impl.h
Go to the documentation of this file.
1 
10 #ifndef META_INDEX_DISK_INDEX_IMPL_H_
11 #define META_INDEX_DISK_INDEX_IMPL_H_
12 
13 #include <mutex>
14 
15 #include "meta/config.h"
16 #include "meta/index/disk_index.h"
18 #include "meta/index/string_list.h"
20 #include "meta/util/disk_vector.h"
22 #include "meta/util/optional.h"
23 
24 namespace meta
25 {
26 namespace index
27 {
28 
29 class string_list_writer;
30 
34 enum index_file
35 {
36  DOC_LABELS,
37  LABEL_IDS_MAPPING,
38  POSTINGS,
39  POSTINGS_INDEX,
40  TERM_IDS_MAPPING,
41  TERM_IDS_MAPPING_INVERSE,
42  METADATA_DB,
43  METADATA_INDEX
44 };
45 
49 class disk_index::disk_index_impl
50 {
51  public:
53  friend disk_index;
54 
58  const static std::vector<const char*> files;
59 
63  void initialize_metadata();
64 
69  void load_labels(uint64_t num_docs = 0);
70 
74  void load_term_id_mapping();
75 
79  void load_label_id_mapping();
80 
84  void save_label_id_mapping();
85 
91  void set_label(doc_id id, const class_label& label);
92 
96  uint64_t total_unique_terms() const;
97 
102  label_id doc_label_id(doc_id id) const;
103 
107  std::vector<class_label> class_labels() const;
108 
109  private:
115  label_id get_label_id(const class_label& lbl);
116 
118  std::string index_name_;
119 
124  util::optional<util::disk_vector<label_id>> labels_;
125 
127  util::optional<metadata_file> metadata_;
128 
130  util::optional<vocabulary_map> term_id_mapping_;
131 
133  util::invertible_map<class_label, label_id> label_ids_;
134 
136  mutable std::mutex mutex_;
137 };
138 }
139 }
140 #endif
static const std::vector< const char * > files
Filenames used in the index.
Definition: disk_index_impl.h:58
index_file
Collection of all the files that comprise a disk_index.
Definition: disk_index_impl.h:34
meta::util::optional
A class for representing optional values.
Definition: optional.h:115
void initialize_metadata()
Loads the metadata file.
Definition: disk_index.cpp:147
util::optional< vocabulary_map > term_id_mapping_
Maps string terms to term_ids.
Definition: disk_index_impl.h:130
void save_label_id_mapping()
Saves the label_id mapping.
Definition: disk_index.cpp:173
std::mutex mutex_
mutex for thread-safe operations
Definition: disk_index_impl.h:136
uint64_t num_docs() const
Definition: disk_index.cpp:101
meta::index::disk_index::disk_index_impl
The implementation of a disk_index.
Definition: disk_index_impl.h:49
util::invertible_map< class_label, label_id > label_ids_
Assigns an integer to each class label (used for liblinear mappings)
Definition: disk_index_impl.h:133
class_label label(doc_id d_id) const
Definition: disk_index.cpp:47
friend disk_index
friend the interface
Definition: disk_index_impl.h:53
void load_labels(uint64_t num_docs=0)
Loads the doc labels.
Definition: disk_index.cpp:152
meta
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:25
void load_term_id_mapping()
Loads the term_id mapping.
Definition: disk_index.cpp:163
std::string index_name_
the location of this index
Definition: disk_index_impl.h:118
uint64_t total_unique_terms() const
Definition: disk_index.cpp:183
label_id doc_label_id(doc_id id) const
Definition: disk_index.cpp:188
void set_label(doc_id id, const class_label &label)
Sets the label for a document.
Definition: disk_index.cpp:178
util::optional< metadata_file > metadata_
Stores additional metadata for each document.
Definition: disk_index_impl.h:127
void load_label_id_mapping()
Loads the label_id mapping.
Definition: disk_index.cpp:168
std::vector< class_label > class_labels() const
Definition: disk_index.cpp:193
label_id get_label_id(const class_label &lbl)
Definition: disk_index.cpp:133
util::optional< util::disk_vector< label_id > > labels_
Maps which class a document belongs to (if any).
Definition: disk_index_impl.h:124