Learning Curve Plus Plus (LCPP)
data.h
Go to the documentation of this file.
1 /**
2  * @file data.h
3  * @author Ozgur Taylan Turan
4  *
5  */
6 
7 #ifndef DATA_H
8 #define DATA_H
9 
10 
#include <type_traits>
#include <utility>

#include "dataset.h"
#include "manip.h"
#include "sample.h"
14 
15 namespace data {
16 
17 //-----------------------------------------------------------------------------
18 // Transform
19 //-----------------------------------------------------------------------------
20 template<class T = mlpack::data::StandardScaler,
23 {
24 private:
25  T inp_; // Transformer for inputs
26  T lab_; // Transformer for labels
27 public:
28  /**
29  * @brief Construct and fit transformers for both inputs and labels.
30  * @param data Dataset to fit the transformers on.
31  */
32  Transformer( const D& dataset )
33  {
34  inp_.Fit(arma::conv_to<arma::mat>::from(dataset.inputs_));
35  if constexpr (std::is_same<D, data::Dataset<arma::Row<DTYPE>>>::value ||
36  std::is_same<D, data::Dataset<arma::Mat<DTYPE>>>::value ||
37  std::is_same<D, data::oml::Dataset<DTYPE>>::value )
38  lab_.Fit(arma::conv_to<arma::rowvec>::from(dataset.labels_));
39  };
40 
41  /**
42  * @brief Transform only the inputs of a dataset.
43  * @param data Dataset to transform.
44  * @return Transformed dataset (inputs only).
45  */
46  D TransInp( const D& dataset )
47  {
48  D tdataset = dataset;
49  inp_.Transform( dataset.inputs_, tdataset.inputs_);
50  return tdataset;
51  };
52 
53  /**
54  * @brief Transform only the labels of a dataset.
55  * @param data Dataset to transform.
56  * @return Transformed dataset (labels only).
57  */
58  D TransLab ( const D& dataset )
59  {
60  if constexpr (std::is_same<D, data::Dataset<arma::Row<DTYPE>>>::value ||
61  std::is_same<D, data::Dataset<arma::Mat<DTYPE>>>::value ||
62  std::is_same<D, data::oml::Dataset<DTYPE>>::value )
63  {
64  D tdataset = dataset;
65  lab_.Transform( dataset.labels_, tdataset.labels_);
66  return tdataset;
67  }
68  else
69  {
70  WARNING("Unfortunately I will not let you transform the labels if you \
71  donot have a regression dataset!");
72  return dataset;
73  }
74  };
75 
76  /**
77  * @brief Transform both inputs and labels of a dataset.
78  * @param data Dataset to transform.
79  * @return Fully transformed dataset.
80  */
81  D Trans ( const D& dataset )
82  {
83  D tdataset = TransInp(dataset);
84  if constexpr (std::is_same<D, data::Dataset<arma::Row<DTYPE>>>::value ||
85  std::is_same<D, data::Dataset<arma::Mat<DTYPE>>>::value ||
86  std::is_same<D, data::oml::Dataset<DTYPE>>::value )
87  tdataset = TransLab(tdataset);
88  return tdataset;
89  };
90 
91  /**
92  * @brief Inverse transform both inputs
93  * @param data Dataset to inverse transform.
94  * @return Original-scale dataset.
95  */
96 
97  D InvTransInp( const D& dataset )
98  {
99  D tdataset = dataset;
100  inp_.InverseTransform( dataset.inputs_, tdataset.inputs_);
101  return tdataset;
102  };
103 
104  /**
105  * @brief Inverse transform only the labels.
106  * @param data Dataset to inverse transform.
107  * @return Dataset with original-scale labels.
108  */
109  D InvTransLab ( const D& dataset )
110  {
111  D tdataset = dataset;
112  lab_.InverseTransform( dataset.labels_, tdataset.labels_);
113  return tdataset;
114  };
115 
116  /**
117  * @brief Inverse transform both inputs and labels.
118  * @param data Dataset to inverse transform.
119  * @return Original-scale dataset.
120  */
121  D InvTrans ( const D& dataset )
122  {
123  D tdataset = InvTransInp(dataset);
124  if constexpr (std::is_same<D, data::Dataset<arma::Row<DTYPE>>>::value ||
125  std::is_same<D, data::Dataset<arma::Mat<DTYPE>>>::value ||
126  std::is_same<D, data::oml::Dataset<DTYPE>>::value )
127  tdataset = InvTransLab(tdataset);
128  return tdataset;
129  };
130 };
131 
132 //-----------------------------------------------------------------------------
133 // Gram
134 //-----------------------------------------------------------------------------
135 template<class KERNEL, class T = DTYPE>
136 struct Gram
137 {
138  /// Default constructor.
139  Gram() {}
140 
141  /**
142  * @brief Construct and initialize kernel with arbitrary arguments.
143  * @tparam Ts Argument types for the kernel constructor.
144  * @param args Arguments to forward to the kernel constructor.
145  */
146  template<typename... Ts>
147  Gram(Ts&&... args) : kernel_(args...) {}
148 
149  // Kernel instance used for computing Gram matrices.
150  KERNEL kernel_;
151 
152  /**
153  * @brief Compute Gram matrix for row-major ordered data.
154  * @param input1 First dataset (rows are samples).
155  * @param input2 Second dataset (rows are samples).
156  * @return Gram matrix of size input1.n_rows × input2.n_rows.
157  */
158  arma::Mat<T> GetMatrix2(const arma::Mat<T>& input1,
159  const arma::Mat<T>& input2) const
160  {
161  arma::Mat<T> matrix(input1.n_rows, input2.n_rows);
162 
163  #pragma omp parallel for collapse(2)
164  for (int i = 0; i < int(input1.n_rows); i++)
165  for (int j = 0; j < int(input2.n_rows); j++)
166  matrix(i,j) = kernel_.Evaluate(input1.row(i).eval(),
167  input2.row(j).eval());
168 
169  return matrix;
170  }
171 
172  /**
173  * @brief Compute Gram matrix for column-major ordered data.
174  * @param input1 First dataset (columns are samples).
175  * @param input2 Second dataset (columns are samples).
176  * @return Gram matrix of size input1.n_cols × input2.n_cols.
177  */
178  arma::Mat<T> GetMatrix(const arma::Mat<T>& input1,
179  const arma::Mat<T>& input2) const
180  {
181  arma::Mat<T> matrix(input1.n_cols, input2.n_cols);
182 
183  #pragma omp parallel for collapse(2)
184  for (int i = 0; i < int(input1.n_cols); i++)
185  for (int j = 0; j < int(input2.n_cols); j++)
186  matrix(i,j) = kernel_.Evaluate(input1.col(i).eval(),
187  input2.col(j).eval());
188 
189  return matrix;
190  }
191 
192  /**
193  * @brief Compute an approximate Gram matrix using the Nyström method.
194  *
195  * Selects k random landmark points, computes their kernel matrix W,
196  * and uses it to approximate the full kernel matrix:
197  * K_approx = C * pinv(W) * C^T
198  *
199  * @param input1 First dataset (columns are samples).
200  * @param input2 Second dataset (columns are samples).
201  * @param k Number of landmark points to sample.
202  * @return Approximated Gram matrix.
203  */
204  arma::Mat<T> GetApprox(const arma::Mat<T>& input1,
205  const arma::Mat<T>& input2,
206  size_t k) const
207  {
208  size_t n_samples = input1.n_rows;
209  arma::uvec indices = arma::randi<arma::uvec>(k, arma::distr_param(0, n_samples - 1));
210  arma::Mat<T> landmarks = input1.cols(indices);
211 
212  arma::Mat<T> W = this->GetMatrix(landmarks, landmarks);
213  arma::Mat<T> C = this->GetMatrix(input1, landmarks);
214  arma::Mat<T> W_pinv = arma::pinv(W);
215 
216  return C * W_pinv * C.t();
217  }
218 
219  /**
220  * @brief Compute Gram matrix of a dataset with itself (column-major).
221  * @param input1 Dataset (columns are samples).
222  * @return Symmetric Gram matrix of size input1.n_cols × input1.n_cols.
223  */
224  arma::Mat<T> GetMatrix(const arma::Mat<T>& input1) const
225  {
226  return GetMatrix(input1, input1);
227  }
228  /**
229  * Serialize the model.
230  */
231  template<typename Archive>
232  void serialize ( Archive& ar, const unsigned int )
233  {
234  ar ( cereal::make_nvp("kernel",kernel_) );
235  }
236 };
237 
238 //-----------------------------------------------------------------------------
239 // Report : This just summarizes some general information about the dataset
240 //-----------------------------------------------------------------------------
241 template<class Dataset,class O=DTYPE>
242 void report( const Dataset& dataset )
243 {
244  PRINT("### DATASET INFORMATION ###");
245  PRINT("features : " << dataset.dimension_ );
246  PRINT("size : " << dataset.size_ );
247 
248  PRINT("### FEATURE INFORMATION ###");
249  PRINT("Mean : \n" << arma::mean(dataset.inputs_,1) );
250  PRINT("Median : \n" << arma::median(dataset.inputs_,1) );
251  PRINT("Variance : \n" << arma::var(dataset.inputs_.t()) );
252  PRINT("Min : \n" << arma::min(dataset.inputs_,1) );
253  PRINT("Max : \n" << arma::max(dataset.inputs_,1) );
254  PRINT("Covariance : \n" << arma::cov(dataset.inputs_.t()) );
255 
256  PRINT("### LABEL INFORMATION ###");
257  PRINT("Unique : \n" << arma::unique(dataset.labels_) );
258  PRINT("Counts : \n" << arma::hist(dataset.labels_,arma::unique(dataset.labels_)) );
259 }
260 
261 //-----------------------------------------------------------------------
262 // Load : wrapper for mlpack::Load
263 //-----------------------------------------------------------------------
264 template<class T, class O=DTYPE>
265 arma::Mat<O> Load ( const T& filename,
266  const bool& transpose,
267  const bool& count = false )
268 {
269  arma::Mat<O> matrix;
270  mlpack::data::DatasetInfo info;
271  if ( count )
272  {
273  mlpack::data::Load(filename,matrix,info,true,transpose);
274  }
275  else
276  mlpack::data::Load(filename,matrix,true,transpose);
277  return matrix;
278 }
279 //-----------------------------------------------------------------------------
280 // Save : Easy saving to a file even with directory creation
281 //-----------------------------------------------------------------------------
282 template<class T>
283 void Save ( const std::filesystem::path& filename,
284  const T& data,
285  const bool transpose=true )
286 {
287  T temp;
288 
289  if (transpose)
290  temp = data.t();
291  else
292  temp = data;
293 
294  std::string ext = filename.extension();
295 
296  std::filesystem::create_directories(filename.parent_path());
297 
298  if (ext == "csv")
299  {
300  temp.save(filename,arma::csv_ascii);
301  }
302  else if (ext == "bin")
303  {
304  temp.save(filename,arma::arma_binary);
305  }
306  else if (ext == "arma")
307  {
308  temp.save(filename,arma::arma_ascii);
309  }
310  else if (ext == "txt")
311  {
312  temp.save(filename,arma::raw_ascii);
313  }
314  else
315  throw std::runtime_error("Not Implemented save extension!");
316 
317 }
318 
319 };
320 #endif
D TransInp(const D &dataset)
Transform only the inputs of a dataset.
Definition: data.h:46
D TransLab(const D &dataset)
Transform only the labels of a dataset.
Definition: data.h:58
D Trans(const D &dataset)
Transform both inputs and labels of a dataset.
Definition: data.h:81
D InvTransInp(const D &dataset)
Inverse transform only the inputs.
Definition: data.h:97
D InvTransLab(const D &dataset)
Inverse transform only the labels.
Definition: data.h:109
D InvTrans(const D &dataset)
Inverse transform both inputs and labels.
Definition: data.h:121
Transformer(const D &dataset)
Construct and fit transformers for both inputs and labels.
Definition: data.h:32
void serialize(Archive &ar, const unsigned int)
Definition: data.h:232
arma::Mat< T > GetApprox(const arma::Mat< T > &input1, const arma::Mat< T > &input2, size_t k) const
Compute an approximate Gram matrix using the Nyström method.
Definition: data.h:204
arma::Mat< T > GetMatrix(const arma::Mat< T > &input1, const arma::Mat< T > &input2) const
Compute Gram matrix for column-major ordered data.
Definition: data.h:178
Gram(Ts &&... args)
Construct and initialize kernel with arbitrary arguments.
Definition: data.h:147
arma::Mat< T > GetMatrix(const arma::Mat< T > &input1) const
Compute Gram matrix of a dataset with itself (column-major).
Definition: data.h:224
arma::Mat< T > GetMatrix2(const arma::Mat< T > &input1, const arma::Mat< T > &input2) const
Compute Gram matrix for row-major ordered data.
Definition: data.h:158
Gram()
Default constructor.
Definition: data.h:139