Learning Curve Plus Plus (LCPP)
manip.h
Go to the documentation of this file.
1 /**
2  * @file manip.h
3  * @author Ozgur Taylan Turan
4  *
5  * Data manipulation related stuff
6  *
7  */
8 
9 #ifndef MANIP_H
10 #define MANIP_H
11 
12 
13 namespace data {
14 
15 //-----------------------------------------------------------------------------
16 // SetDiff : Set difference between two sorted vectors
17 //-----------------------------------------------------------------------------
/**
 * Computes the set difference of two sorted containers: every element of
 * check that has no matching element in with.
 *
 * Both inputs are walked once in a merge-style pass, so they must already be
 * sorted (this is asserted in debug builds).
 *
 * @param check : The vector to be compared
 * @param with : Comparison is made with this vector
 * @return elements of check that are not matched by an element of with, in
 *         the order they appear in check
 */
template<class T>
T SetDiff( const T& check, const T& with )
{
  assert ( check.is_sorted() && with.is_sorted() &&
           "For this method I assumed you sorted your stuff...");

  // The result can never be longer than check, so allocate it once and trim
  // it at the end instead of re-allocating for every kept element.
  T result;
  result.set_size(check.n_elem);

  size_t i = 0, j = 0, kept = 0;

  while (i < check.n_elem && j < with.n_elem)
  {
    if (check[i] < with[j])
      result[kept++] = check[i++];   // only in check -> keep it
    else if (check[i] > with[j])
      ++j;                           // only in with -> skip it
    else
    {
      // Present in both: drop this pair and move on.
      ++i;
      ++j;
    }
  }

  // Whatever is left in check has nothing to be compared with anymore.
  while (i < check.n_elem)
    result[kept++] = check[i++];

  result.resize(kept);
  return result;
}
58 
59 //-----------------------------------------------------------------------------
60 // Migrate : Move N random data points from the test set to the train set
61 //-----------------------------------------------------------------------------
62 /**
63  * @param train_inp : inputs of the training set
64  * @param train_lab : labels of the training set
65  * @param test_inp : inputs of the testing set
66  * @param test_lab : labels of the testing set
67  * @param N : number of points that to be migrated (test->train)
68  */
69 template<typename T, typename U>
70 void Migrate ( arma::Mat<T>& train_inp,
71  arma::Row<U>& train_lab,
72  arma::Mat<T>& test_inp,
73  arma::Row<U>& test_lab,
74  const size_t N )
75 {
76  assert ( ( train_inp.n_cols == train_lab.n_elem &&
77  train_inp.n_rows == test_inp.n_rows &&
78  test_inp.n_cols == test_lab.n_elem &&
79  test_lab.n_elem >= N) &&
80  "Requested element number is bigger than what you have.");
81 
82  train_inp.resize(train_inp.n_rows, train_inp.n_cols+N);
83  train_lab.resize(train_lab.n_cols+N);
84  arma::uvec idx = arma::randperm(test_inp.n_cols, N);
85  train_inp.tail_cols(N) = test_inp.cols(idx);
86  train_lab.tail_cols(N) = test_lab.cols(idx);
87  test_lab.shed_cols(idx);
88  test_inp.shed_cols(idx);
89 }
90 
91 template<typename T, typename U>
92 void Migrate ( arma::Mat<T>& train_inp,
93  arma::Mat<U>& train_lab,
94  arma::Mat<T>& test_inp,
95  arma::Mat<U>& test_lab,
96  const size_t N )
97 {
98  assert ( ( test_inp.n_cols == test_lab.n_elem &&
99  train_inp.n_rows == test_inp.n_rows &&
100  test_lab.n_rows == train_lab.n_rows &&
101  test_lab.n_elem >= N ) &&
102  "Requested element number is bigger than what you have.");
103 
104  train_inp.resize(train_inp.n_rows, train_inp.n_cols+N);
105  train_lab.resize(train_lab.n_rows, train_lab.n_cols+N);
106  arma::uvec idx = arma::randperm(test_inp.n_cols, N);
107  train_inp.tail_cols(N) = test_inp.cols(idx);
108  train_lab.tail_cols(N) = test_lab.cols(idx);
109  test_lab.shed_cols(idx);
110  test_inp.shed_cols(idx);
111 }
112 /**
113  * @param trainset : training dataset
114  * @param testset : testing dataset
115  * @param N : number of points that to be migrated (test->train)
116  */
117 template<typename T>
118 void Migrate ( T& trainset,
119  T& testset,
120  const size_t N )
121 {
122  Migrate(trainset.inputs_,trainset.labels_,testset.inputs_,testset.labels_,N);
123 
124  trainset.size_ = trainset.inputs_.n_cols;
125  testset.size_ = testset.inputs_.n_cols;
126 
127  if constexpr (std::is_same_v<T, data::oml::Dataset<size_t>> ||
128  std::is_same_v<T, data::Dataset<arma::Row<size_t>>> )
129  {
130  trainset.num_class_ = arma::unique(trainset.labels_).eval().n_cols;
131  testset.num_class_ = arma::unique(testset.labels_).eval().n_cols;
132  }
133 }
134 
135 template<typename T=arma::uword>
136 void Migrate ( arma::Col<T>& trainset,
137  arma::Col<T>& testset,
138  const size_t N )
139 {
140  assert ( testset.n_elem >= N &&
141  "Requested element number is bigger than what you have.");
142 
143  trainset.resize(trainset.n_elem+N);
144  arma::uvec idx = arma::randperm(testset.n_elem, N);
145  trainset.tail(N) = testset.rows(idx);
146  testset.shed_rows(idx);
147 }
148 
149 //-----------------------------------------------------------------------------
150 // Split : Split datasets for a given number of training points
151 //-----------------------------------------------------------------------------
152 /**
153  * @param input Input dataset to split.
154  * @param label Input labels to split.
155  * @param trainData Matrix to store training data into.
156  * @param testData Matrix to store test data into.
157  * @param trainLabel Vector to store training labels into.
158  * @param testLabel Vector to store test labels into.
159  * @param trainNum number of training points desired.
160  */
161 template<typename T, typename U>
162 void Split ( const arma::Mat<T>& input,
163  const arma::Row<U>& inputLabel,
164  arma::Mat<T>& trainData,
165  arma::Mat<T>& testData,
166  arma::Row<U>& trainLabel,
167  arma::Row<U>& testLabel,
168  const size_t trainNum )
169 {
170  const arma::uvec order =
171  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
172 
173  trainData = input.cols(order.rows(0,trainNum-1));
174  trainLabel = inputLabel.cols(order.rows(0,trainNum-1));
175 
176  testData = input.cols(order.rows(trainNum,input.n_cols-1));
177  testLabel = inputLabel.cols(order.rows(trainNum,input.n_cols-1));
178 }
179 
180 
181 template<typename T, typename U>
182 void Split ( const arma::Mat<T>& input,
183  const arma::Mat<U>& inputLabel,
184  arma::Mat<T>& trainData,
185  arma::Mat<T>& testData,
186  arma::Mat<U>& trainLabel,
187  arma::Mat<U>& testLabel,
188  const size_t trainNum )
189 {
190  // I am going to solve this problem with a tinier bit of code
191  const arma::uvec order =
192  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
193 
194  trainData = input.cols(order.rows(0,trainNum-1));
195  trainLabel = inputLabel.cols(order.rows(0,trainNum-1));
196 
197  testData = input.cols(order.rows(trainNum,input.n_cols-1));
198  testLabel = inputLabel.cols(order.rows(trainNum,input.n_cols-1));
199 }
200 
201 /**
202  * @param input Input dataset to split.
203  * @param trainData Matrix to store training data into.
204  * @param testData Matrix to store test data into.
205  * @param trainNum number of training points desired.
206  */
207 template<typename T>
208 void Split ( const arma::Mat<T>& input,
209  arma::Mat<T>& trainData,
210  arma::Mat<T>& testData,
211  const size_t trainNum )
212 {
213  const arma::uvec order =
214  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
215 
216  trainData = input.cols( order.head(trainNum) );
217  testData = input.cols( order.tail(input.n_cols-trainNum) );
218 
219 }
220 
221 template<typename T>
222 void Split ( const arma::Row<T>& input,
223  arma::Row<T>& trainData,
224  arma::Row<T>& testData,
225  const size_t trainNum )
226 {
227  const arma::uvec order =
228  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
229 
230  trainData = input.cols( order.head(trainNum) );
231  testData = input.cols( order.tail(input.n_cols-trainNum) );
232 
233 }
234 
235 template<typename T>
236 void Split ( const arma::Col<T>& input,
237  arma::Col<T>& trainData,
238  arma::Col<T>& testData,
239  const size_t trainNum )
240 {
241  const arma::uvec order =
242  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_rows- 1));
243 
244  trainData = input.rows( order.head(trainNum) );
245  testData = input.rows( order.tail(input.n_rows-trainNum) );
246 }
247 
248 /**
249  * @param input Input dataset to split.
250  * @param label Input labels to split.
251  * @param trainNum number of training points desired.
252  * @return std::tuple containing trainData (arma::Mat<T>), testData
253  * (arma::Mat<T>), trainLabel (arma::Row<U>), and testLabel (arma::Row<U>).
254  */
255 template<typename T, typename U>
256 std::tuple<arma::Mat<T>, arma::Mat<T>, arma::Row<U>, arma::Row<U>>
257 Split ( const arma::Mat<T>& input,
258  const arma::Row<U>& inputLabel,
259  const size_t trainNum)
260 {
261  arma::Mat<T> trainData;
262  arma::Mat<T> testData;
263  arma::Row<U> trainLabel;
264  arma::Row<U> testLabel;
265 
266  Split(input, inputLabel, trainData, testData, trainLabel, testLabel,
267  trainNum);
268 
269  return std::make_tuple(std::move(trainData),
270  std::move(testData),
271  std::move(trainLabel),
272  std::move(testLabel));
273 }
274 
275 template<typename T, typename U>
276 std::tuple<arma::Mat<T>, arma::Mat<T>, arma::Mat<U>, arma::Mat<U>>
277 Split ( const arma::Mat<T>& input,
278  const arma::Mat<U>& inputLabel,
279  const size_t trainNum )
280 {
281  arma::Mat<T> trainData;
282  arma::Mat<T> testData;
283  arma::Mat<U> trainLabel;
284  arma::Mat<U> testLabel;
285 
286  Split(input, inputLabel, trainData, testData, trainLabel, testLabel,
287  trainNum);
288 
289  return std::make_tuple(std::move(trainData),
290  std::move(testData),
291  std::move(trainLabel),
292  std::move(testLabel));
293 }
294 /**
295  * @param input Input dataset to split.
296  * @param trainNum number of training points desired.
297  * @return std::tuple containing trainData (arma::Mat<T>)
298  * and testData (arma::Mat<T>).
299  */
300 template<typename T>
301 std::tuple<arma::Mat<T>, arma::Mat<T>>
302 Split ( const arma::Mat<T>& input,
303  const size_t trainNum)
304 {
305  arma::Mat<T> trainData;
306  arma::Mat<T> testData;
307  Split(input, trainData, testData, trainNum);
308 
309  return std::make_tuple(std::move(trainData),
310  std::move(testData));
311 }
312 
/**
 * Dataset-level split by number of training points. Both outputs start as
 * copies of the source dataset, then their inputs and labels are replaced by
 * the respective part and Update() is called on each of them.
 *
 * @param dataset to be split
 * @param trainset receives the training part
 * @param testset receives the test part
 * @param trainNum number of training points
 */
template<typename T>
void Split ( const T& dataset,
             T& trainset,
             T& testset,
             const size_t trainNum )
{
  // Start from full copies so every other member is carried over.
  trainset = dataset;
  testset = dataset;

  Split(dataset.inputs_, dataset.labels_,
        trainset.inputs_, testset.inputs_,
        trainset.labels_, testset.labels_,
        trainNum);

  // Let each dataset refresh itself for its new inputs and labels.
  trainset.Update(trainset.inputs_, trainset.labels_);
  testset.Update(testset.inputs_, testset.labels_);
}
336 
337 /**
338  * @param dataset to be splitted
339  * @param trainset to be filled
340  * @param testset to be filled
341  * @param testRatio percentage of test set
342  */
343 template<typename T,class O=DTYPE>
344 void Split ( const T& dataset,
345  T& trainset,
346  T& testset,
347  const O testRatio )
348 {
349 
350  trainset = dataset; testset = dataset;
351 
352  mlpack::data::Split(dataset.inputs_, dataset.labels_,
353  trainset.inputs_, testset.inputs_,
354  trainset.labels_, testset.labels_, testRatio);
355 
356  trainset.Update(trainset.inputs_,trainset.labels_);
357  testset.Update(testset.inputs_,testset.labels_);
358 }
359 
360 //-----------------------------------------------------------------------------
361 // StratifiedSplit : Split datasets for a given number of training points
362 // in a stratified manner
363 //-----------------------------------------------------------------------------
364 /**
365  * @param input Input dataset to stratify.
366  * @param inputLabel Input labels to stratify.
367  * @param trainData Matrix to store training data into.
368  * @param testData Matrix to store test data into.
369  * @param trainLabel Vector to store training labels into.
370  * @param testLabel Vector to store test labels into.
371  * @param trainNum number of training points of dataset to use for test set
372  * @param shuffleData If true, the sample order is shuffled; otherwise, each
373  * sample is visited in linear order. (Default true.)
374  */
375 template<typename T, typename LabelsType,
376  typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value> >
377 void StratifiedSplit(const arma::Mat<T>& input,
378  const LabelsType& inputLabel,
379  arma::Mat<T>& trainData,
380  arma::Mat<T>& testData,
381  LabelsType& trainLabel,
382  LabelsType& testLabel,
383  const size_t trainNum,
384  const bool shuffleData = true)
385 {
386  const bool typeCheck = (arma::is_Row<LabelsType>::value)
387  || (arma::is_Col<LabelsType>::value);
388  if (!typeCheck)
389  throw std::runtime_error("data::Split(): when stratified sampling is done, "
390  "labels must have type `arma::Row<>`!");
391  mlpack::util::CheckSameSizes(input, inputLabel, "data::Split()");
392 
393  double testRatio = double(1) - double(trainNum)/double(inputLabel.n_elem);
394  size_t trainIdx = 0;
395  size_t testIdx = 0;
396  size_t trainSize = 0;
397  size_t testSize = 0;
398  arma::uvec labelCounts;
399  arma::uvec testLabelCounts;
400  typename LabelsType::elem_type maxLabel = inputLabel.max();
401 
402  labelCounts.zeros(maxLabel+1);
403  testLabelCounts.zeros(maxLabel+1);
404 
405  for (typename LabelsType::elem_type label : inputLabel)
406  ++labelCounts[label];
407 
408  for (arma::uword labelCount : labelCounts)
409  {
410  testSize += floor(labelCount * testRatio+1e-6);
411  trainSize += labelCount - floor(labelCount * testRatio+1e-6);
412  }
413 
414  trainData.set_size(input.n_rows, trainSize);
415  testData.set_size(input.n_rows, testSize);
416  trainLabel.set_size(inputLabel.n_rows, trainSize);
417  testLabel.set_size(inputLabel.n_rows, testSize);
418 
419  if (shuffleData)
420  {
421  arma::uvec order = arma::shuffle(
422  arma::linspace<arma::uvec>(0, input.n_cols - 1, input.n_cols));
423 
424  for (arma::uword i : order)
425  {
426  typename LabelsType::elem_type label = inputLabel[i];
427  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
428  {
429  testLabelCounts[label] += 1;
430  testData.col(testIdx) = input.col(i);
431  testLabel[testIdx] = inputLabel[i];
432  testIdx += 1;
433  }
434  else
435  {
436  trainData.col(trainIdx) = input.col(i);
437  trainLabel[trainIdx] = inputLabel[i];
438  trainIdx += 1;
439  }
440  }
441  }
442  else
443  {
444  for (arma::uword i = 0; i < input.n_cols; i++)
445  {
446  typename LabelsType::elem_type label = inputLabel[i];
447  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
448  {
449  testLabelCounts[label] += 1;
450  testData.col(testIdx) = input.col(i);
451  testLabel[testIdx] = inputLabel[i];
452  testIdx += 1;
453  }
454  else
455  {
456  trainData.col(trainIdx) = input.col(i);
457  trainLabel[trainIdx] = inputLabel[i];
458  trainIdx += 1;
459  }
460  }
461  }
462 }
463 
464 /**
465  * Unfortunately mlpack has an issue if you have a balanced dataset, so got it
466  * from there with a minor fix
467  *
468  * @param input Input dataset to stratify.
469  * @param inputLabel Input labels to stratify.
470  * @param trainData Matrix to store training data into.
471  * @param testData Matrix to store test data into.
472  * @param trainLabel Vector to store training labels into.
473  * @param testLabel Vector to store test labels into.
474  * @param testRatio ratio of test set
475  * @param shuffleData If true, the sample order is shuffled; otherwise, each
476  * sample is visited in linear order. (Default true.)
477  */
478 template<typename T, typename LabelsType,
479  typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value> >
480 void StratifiedSplit(const arma::Mat<T>& input,
481  const LabelsType& inputLabel,
482  arma::Mat<T>& trainData,
483  arma::Mat<T>& testData,
484  LabelsType& trainLabel,
485  LabelsType& testLabel,
486  const double testRatio,
487  const bool shuffleData = true)
488 {
489  const bool typeCheck = (arma::is_Row<LabelsType>::value)
490  || (arma::is_Col<LabelsType>::value);
491  if (!typeCheck)
492  throw std::runtime_error("data::Split(): when stratified sampling is done, "
493  "labels must have type `arma::Row<>`!");
494  mlpack::util::CheckSameSizes(input, inputLabel, "data::Split()");
495 
496  size_t trainIdx = 0;
497  size_t testIdx = 0;
498  size_t trainSize = 0;
499  size_t testSize = 0;
500  arma::uvec labelCounts;
501  arma::uvec testLabelCounts;
502  typename LabelsType::elem_type maxLabel = inputLabel.max();
503 
504  labelCounts.zeros(maxLabel+1);
505  testLabelCounts.zeros(maxLabel+1);
506 
507  for (typename LabelsType::elem_type label : inputLabel)
508  ++labelCounts[label];
509 
510  for (arma::uword labelCount : labelCounts)
511  {
512  testSize += floor(labelCount * testRatio+1e-6);
513  trainSize += labelCount - floor(labelCount * testRatio+1e-6);
514  }
515 
516  trainData.set_size(input.n_rows, trainSize);
517  testData.set_size(input.n_rows, testSize);
518  trainLabel.set_size(inputLabel.n_rows, trainSize);
519  testLabel.set_size(inputLabel.n_rows, testSize);
520 
521  if (shuffleData)
522  {
523  arma::uvec order = arma::shuffle(
524  arma::linspace<arma::uvec>(0, input.n_cols - 1, input.n_cols));
525 
526  for (arma::uword i : order)
527  {
528  typename LabelsType::elem_type label = inputLabel[i];
529  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
530  {
531  testLabelCounts[label] += 1;
532  testData.col(testIdx) = input.col(i);
533  testLabel[testIdx] = inputLabel[i];
534  testIdx += 1;
535  }
536  else
537  {
538  trainData.col(trainIdx) = input.col(i);
539  trainLabel[trainIdx] = inputLabel[i];
540  trainIdx += 1;
541  }
542  }
543  }
544  else
545  {
546  for (arma::uword i = 0; i < input.n_cols; i++)
547  {
548  typename LabelsType::elem_type label = inputLabel[i];
549  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
550  {
551  testLabelCounts[label] += 1;
552  testData.col(testIdx) = input.col(i);
553  testLabel[testIdx] = inputLabel[i];
554  testIdx += 1;
555  }
556  else
557  {
558  trainData.col(trainIdx) = input.col(i);
559  trainLabel[trainIdx] = inputLabel[i];
560  trainIdx += 1;
561  }
562  }
563  }
564 }
565 
566 /**
567  * Given a dataset, split into a training set and test set with stratification
568  *
569  * @param dataset to be splitted
570  * @param trainset to be splitted
571  * @param testset to be splitted
572  * @param trainNum number of training points
573  */
574 template<typename T>
575 void StratifiedSplit ( const T& dataset,
576  T& trainset,
577  T& testset,
578  const size_t trainNum )
579 {
580  assert ( ( typeid(T) == typeid(Dataset<arma::Row<size_t>>) ||
581  typeid(T) == typeid(oml::Dataset<size_t>)) &&
582  "StratifiedSplit can only be used for classification dataset type...");
583 
584  trainset = dataset; testset = dataset;
585 
586  StratifiedSplit(dataset.inputs_, dataset.labels_,
587  trainset.inputs_, testset.inputs_,
588  trainset.labels_, testset.labels_, trainNum);
589 
590  trainset.Update(trainset.inputs_,trainset.labels_);
591  testset.Update(testset.inputs_,testset.labels_);
592 }
593 
594 /**
595  * Given a dataset, split into a training set and test set with stratification
596  *
597  * @param dataset to be splitted
598  * @param trainset to be filled
599  * @param testset to be filled
600  * @param testRatio percentage of test set
601  */
602 template<typename T>
603 void StratifiedSplit ( const T& dataset,
604  T& trainset,
605  T& testset,
606  const double testRatio )
607 {
608 
609  assert ( (typeid(T) == typeid(Dataset<arma::Row<size_t>>) ||
610  typeid(T) == typeid(oml::Dataset<size_t>)) &&
611  "StratifiedSplit can only be used for classification dataset type...");
612 
613  trainset = dataset; testset = dataset;
614 
615  mlpack::data::StratifiedSplit(dataset.inputs_, dataset.labels_,
616  trainset.inputs_, testset.inputs_,
617  trainset.labels_, testset.labels_, testRatio);
618 
619  trainset.Update(trainset.inputs_,trainset.labels_);
620  testset.Update(testset.inputs_,testset.labels_);
621 }
622 
623 template<typename T, typename U>
624 std::tuple<arma::Mat<T>, arma::Mat<T>, arma::Row<U>, arma::Row<U>>
625 StratifiedSplit ( const arma::Mat<T>& input,
626  const arma::Row<U>& inputLabel,
627  const size_t trainNum)
628 {
629  arma::Mat<T> trainData;
630  arma::Mat<T> testData;
631  arma::Row<U> trainLabel;
632  arma::Row<U> testLabel;
633 
634  StratifiedSplit(input, inputLabel, trainData, testData, trainLabel, testLabel,
635  trainNum);
636 
637  return std::make_tuple(std::move(trainData),
638  std::move(testData),
639  std::move(trainLabel),
640  std::move(testLabel));
641 }
642 
643 } // namespace data
644 #endif
645 
void Split(const arma::Mat< T > &input, const arma::Row< U > &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, arma::Row< U > &trainLabel, arma::Row< U > &testLabel, const size_t trainNum)
Definition: manip.h:162
void StratifiedSplit(const arma::Mat< T > &input, const LabelsType &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, LabelsType &trainLabel, LabelsType &testLabel, const size_t trainNum, const bool shuffleData=true)
Definition: manip.h:377
T SetDiff(const T &check, const T &with)
Definition: manip.h:23
void Migrate(arma::Mat< T > &train_inp, arma::Row< U > &train_lab, arma::Mat< T > &test_inp, arma::Row< U > &test_lab, const size_t N)
Definition: manip.h:70