Learning Curve Plus Plus (LCPP)
dataset.h
Go to the documentation of this file.
1 /**
2  * @file dataset.h
3  * @author Ozgur Taylan Turan
4  *
5  * Here lie the definitions of the Dataset containers for regression,
6  * classification, functional and OpenML-related data.
7  */
8 
9 #ifndef DATASET_H
10 #define DATASET_H
11 
12 namespace data {
13 //-----------------------------------------------------------------------------
14 // data::Dataset: This is a general Dataset container whose label type is
15 // chosen at compile time, giving a classification or regression dataset.
16 //-----------------------------------------------------------------------------
17 template<class LABEL=arma::Row<DTYPE>,class T=DTYPE>
18 class Dataset
19 {
20 public:
21  size_t size_; // size of the dataset
22  size_t dimension_; // dimension of the dataset
23  // If an optional value for number of classes for classification problems
24  std::optional<size_t> num_class_;
25  std::optional<size_t> seed_; // seed for reproduction purposes
26 
27  arma::Mat<T> inputs_;
28  LABEL labels_;
29 
30  /* Dataset Empty Constructor */
31  Dataset ( ) { };
32 
33  /* Dataset initializer
34  *
35  * @param dim : dimension of the dataset
36  *
37  */
38  Dataset ( const size_t dim, const size_t seed = SEED );
39 
40  /* Dataset initializer
41  *
42  * @param inputs : input of the dataset
43  * @param labels : output of the dataset
44  *
45  */
46  Dataset ( const arma::Mat<T>& inputs,
47  const LABEL& labels );
48 
49  /* Dataset update with the given inputs and labels
50  *
51  * @param inputs : input of the dataset
52  * @param labels : output of the dataset
53  *
54  */
55  void Update ( const arma::Mat<T>& inputs, const LABEL& labels );
56 
57  void Update ( const LABEL& labels );
58 
59  /* Generate linear data for regression with Gaussian noise assumption
60  *
61  * @param N : number of samples
62  * @param noise_std : standard deviation of the Gaussian noise
63  *
64  */
65  void Linear ( const size_t N=10, const T noise_std=T(1.));
66 
67  /* Generate sinusoidal data for regression with Gaussian noise assumption
68  *
69  * @param N : number of samples
70  * @param noise_std : standard deviation of the Gaussian noise
71  *
72  */
73  void Sine ( const size_t N=10, const T noise_std=T(1.));
74 
75  /* Generate banana dataset for classification a.k.a the moon dataset in
76  * sklearn.
77  *
78  * @param N : number of samples for each class
79  * @param delta : distance between bananas
80  *
81  */
82  void Banana ( const size_t N=10, const T delta=0. );
83 
84  /* Generate dipping dataset for classification
85  * Loog, M., & Duin, R. P. W. (2012). The dipping phenomenon.
86  *
87  * @param N : number of samples for each class
88  * @param r : radius of the covering circle
89  * @param noise_std : noise of the circle
90  *
91  */
92  void Dipping ( const size_t N=10, const T r=1, const T noise_std=0.1 );
93 
94  /* Create Gaussian blobs with number of blobs equal to the means provided
95  * all the blobs have spherical covariance.
96  *
97  * @param N : number of samples for each class
98  * @param means : means of the Gaussian blobs
99  * @param stds : standard deviations of the Gaussian blobs
100  *
101  */
102  void Gaussian ( const size_t N=10,
103  const arma::Row<T>& means = {-1,1} );
104 
105  /* Serliazation with cereal for the class. */
106  template <class Archive>
107  void serialize(Archive& ar)
108  {
109  ar( CEREAL_NVP(size_),
110  CEREAL_NVP(inputs_),
111  CEREAL_NVP(labels_),
112  CEREAL_NVP(num_class_),
113  CEREAL_NVP(dimension_) );
114  }
115 
116  void Save( const std::string& filename );
117 
118 private:
119  void _update_info ( );
120 
121 };
122 
123 namespace oml {
124 
125 //-----------------------------------------------------------------------------
126 // oml::Dataset -> Downloads data from OpenML with given id
127 //-----------------------------------------------------------------------------
128 template<class LTYPE = size_t, class T = DTYPE>
129 class Dataset
130 {
131 public:
132  size_t id_; // Dataset ID
133  size_t size_; // Number of samples
134  size_t dimension_; // Feature dimension
135  std::optional<size_t> num_class_; // Number of classes (if categorical)
136  std::filesystem::path path_; // Dataset path
137 
138  arma::Mat<T> inputs_; // Input data
139  arma::Row<LTYPE> labels_; // Labels
140 
141  Dataset() { };
142  Dataset(const size_t& id, const std::filesystem::path& path);
143  Dataset(const size_t& id);
144 
145  // Update dataset content
146  void Update(const arma::Mat<T>& input, const arma::Row<LTYPE>& labels);
147 
148  /* Serialization with cereal */
149  template <class Archive>
150  void serialize(Archive& ar)
151  {
152  ar(CEREAL_NVP(size_),
153  CEREAL_NVP(id_),
154  CEREAL_NVP(path_.string()),
155  CEREAL_NVP(num_class_),
156  CEREAL_NVP(meta_url_),
157  CEREAL_NVP(down_url_),
158  CEREAL_NVP(file_),
159  CEREAL_NVP(metafile_),
160  CEREAL_NVP(inputs_),
161  CEREAL_NVP(labels_),
162  CEREAL_NVP(dimension_));
163  }
164 
165  // Save/load dataset in binary format
166  void Save(const std::string& filename);
167  static std::shared_ptr<Dataset<LTYPE, T>> Load(const std::string& filename);
168 
169 private:
170  std::filesystem::path filepath_ = path_ / "datasets"; // Data folder
171  std::filesystem::path metapath_ = path_ / "meta"; // Metadata folder
172 
173  bool _download(); // Download dataset
174  void _update_info(); // Update metadata
175  void _load(); // Load from disk
176 
177  bool _iscateg(const arma::Row<T>& row); // Check if row is categorical
178  arma::Row<size_t> _convcateg(const arma::Row<T>& row); // Convert to categs
179  arma::Row<size_t> _procrow(const arma::Row<T>& row); // Process row
180 
181  std::string _gettargetname(const std::string& metadata);// target name
182  std::string _getdownurl(const std::string& metadata); // download URL
183  int _findlabel(const std::string& targetname); // Find label index
184 
185  std::string _fetchmetadata(); // Fetch metadata from source
186  std::string _readmetadata(); // Read metadata from file
187 
188  std::string meta_url_; // Metadata URL
189  std::string down_url_; // Download URL
190  std::string file_; // Dataset file name
191  std::string metafile_; // Metadata file name
192 };
193 
194 //-----------------------------------------------------------------------------
195 // Collect : This is for collection of datasets through OpenML servers
196 //-----------------------------------------------------------------------------
197 template<class T=size_t>
198 class Collect
199 {
200 public:
201  /*
202  * Const
203  * @param id : id of the study
204  */
205  Collect ( const size_t& id );
206 
207  /*
208  * @param ids : ids of datasets
209  */
210  Collect ( const arma::Row<size_t>& ids );
211 
212  /*
213  * @param id : id of the study
214  * @param paht : path to save the collection
215  */
216  Collect ( const size_t& id, const std::filesystem::path& path );
217 
218  Dataset<T> GetNext ( );
219 
220  Dataset<T> GetID ( const size_t& id );
221 
222  size_t GetSize ( ) const {return size_;}
223  size_t GetCounter ( ) const {return counter_;}
224  arma::Row<size_t> GetKeys ( ) const {return keys_;}
225 
226 private:
227  size_t id_;
228  size_t size_;
229  size_t counter_ = 0;
230 
231  std::string url_;
232 
233  arma::Row<size_t> _getkeys ( );
234 
235  arma::Row<size_t> keys_;
236 
237  std::filesystem::path path_;
238  std::filesystem::path filespath_ = path_ / "collect";
239  std::filesystem::path metapath_ = path_ / "collect";
240  std::filesystem::path metafile_ = metapath_ / (std::to_string(id_)+".meta");
241 
242 };
243 
244 } // namespace oml
245 
246 } // namespace data
247 
248 
249 #include "dataset_impl.h"
250 
251 #endif