Learning Curve Plus Plus (LCPP)
manip.h File Reference
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

template<class T >
T data::SetDiff (const T &check, const T &with)
 
template<typename T , typename U >
void data::Migrate (arma::Mat< T > &train_inp, arma::Row< U > &train_lab, arma::Mat< T > &test_inp, arma::Row< U > &test_lab, const size_t N)
 
template<typename T , typename U >
void data::Migrate (arma::Mat< T > &train_inp, arma::Mat< U > &train_lab, arma::Mat< T > &test_inp, arma::Mat< U > &test_lab, const size_t N)
 
template<typename T >
void data::Migrate (T &trainset, T &testset, const size_t N)
 
template<typename T = arma::uword>
void data::Migrate (arma::Col< T > &trainset, arma::Col< T > &testset, const size_t N)
 
template<typename T , typename U >
void data::Split (const arma::Mat< T > &input, const arma::Row< U > &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, arma::Row< U > &trainLabel, arma::Row< U > &testLabel, const size_t trainNum)
 
template<typename T , typename U >
void data::Split (const arma::Mat< T > &input, const arma::Mat< U > &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, arma::Mat< U > &trainLabel, arma::Mat< U > &testLabel, const size_t trainNum)
 
template<typename T >
void data::Split (const arma::Mat< T > &input, arma::Mat< T > &trainData, arma::Mat< T > &testData, const size_t trainNum)
 
template<typename T >
void data::Split (const arma::Row< T > &input, arma::Row< T > &trainData, arma::Row< T > &testData, const size_t trainNum)
 
template<typename T >
void data::Split (const arma::Col< T > &input, arma::Col< T > &trainData, arma::Col< T > &testData, const size_t trainNum)
 
template<typename T , typename U >
std::tuple< arma::Mat< T >, arma::Mat< T >, arma::Row< U >, arma::Row< U > > data::Split (const arma::Mat< T > &input, const arma::Row< U > &inputLabel, const size_t trainNum)
 
template<typename T , typename U >
std::tuple< arma::Mat< T >, arma::Mat< T >, arma::Mat< U >, arma::Mat< U > > data::Split (const arma::Mat< T > &input, const arma::Mat< U > &inputLabel, const size_t trainNum)
 
template<typename T >
std::tuple< arma::Mat< T >, arma::Mat< T > > data::Split (const arma::Mat< T > &input, const size_t trainNum)
 
template<typename T >
void data::Split (const T &dataset, T &trainset, T &testset, const size_t trainNum)
 
template<typename T , class O = DTYPE>
void data::Split (const T &dataset, T &trainset, T &testset, const O testRatio)
 
template<typename T , typename LabelsType , typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value>>
void data::StratifiedSplit (const arma::Mat< T > &input, const LabelsType &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, LabelsType &trainLabel, LabelsType &testLabel, const size_t trainNum, const bool shuffleData=true)
 
template<typename T , typename LabelsType , typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value>>
void data::StratifiedSplit (const arma::Mat< T > &input, const LabelsType &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, LabelsType &trainLabel, LabelsType &testLabel, const double testRatio, const bool shuffleData=true)
 
template<typename T >
void data::StratifiedSplit (const T &dataset, T &trainset, T &testset, const size_t trainNum)
 
template<typename T >
void data::StratifiedSplit (const T &dataset, T &trainset, T &testset, const double testRatio)
 
template<typename T , typename U >
std::tuple< arma::Mat< T >, arma::Mat< T >, arma::Row< U >, arma::Row< U > > data::StratifiedSplit (const arma::Mat< T > &input, const arma::Row< U > &inputLabel, const size_t trainNum)
 

Detailed Description

Author
Ozgur Taylan Turan

Data manipulation related stuff

Definition in file manip.h.

Function Documentation

◆ Migrate() [1/2]

template<typename T , typename U >
void data::Migrate ( arma::Mat< T > &  train_inp,
arma::Row< U > &  train_lab,
arma::Mat< T > &  test_inp,
arma::Row< U > &  test_lab,
const size_t  N 
)
Parameters
train_inp: inputs of the training set
train_lab: labels of the training set
test_inp: inputs of the testing set
test_lab: labels of the testing set
N: number of points to be migrated (test->train)

Definition at line 70 of file manip.h.

75 {
76  assert ( ( train_inp.n_cols == train_lab.n_elem &&
77  train_inp.n_rows == test_inp.n_rows &&
78  test_inp.n_cols == test_lab.n_elem &&
79  test_lab.n_elem >= N) &&
80  "Requested element number is bigger than what you have.");
81 
82  train_inp.resize(train_inp.n_rows, train_inp.n_cols+N);
83  train_lab.resize(train_lab.n_cols+N);
84  arma::uvec idx = arma::randperm(test_inp.n_cols, N);
85  train_inp.tail_cols(N) = test_inp.cols(idx);
86  train_lab.tail_cols(N) = test_lab.cols(idx);
87  test_lab.shed_cols(idx);
88  test_inp.shed_cols(idx);
89 }

◆ Migrate() [2/2]

template<typename T >
void data::Migrate ( T &  trainset,
T &  testset,
const size_t  N 
)
Parameters
trainset: training dataset
testset: testing dataset
N: number of points to be migrated (test->train)

Definition at line 118 of file manip.h.

121 {
122  Migrate(trainset.inputs_,trainset.labels_,testset.inputs_,testset.labels_,N);
123 
124  trainset.size_ = trainset.inputs_.n_cols;
125  testset.size_ = testset.inputs_.n_cols;
126 
127  if constexpr (std::is_same_v<T, data::oml::Dataset<size_t>> ||
128  std::is_same_v<T, data::Dataset<arma::Row<size_t>>> )
129  {
130  trainset.num_class_ = arma::unique(trainset.labels_).eval().n_cols;
131  testset.num_class_ = arma::unique(testset.labels_).eval().n_cols;
132  }
133 }
void Migrate(arma::Mat< T > &train_inp, arma::Row< U > &train_lab, arma::Mat< T > &test_inp, arma::Row< U > &test_lab, const size_t N)
Definition: manip.h:70

◆ SetDiff()

template<class T >
T data::SetDiff ( const T &  check,
const T &  with 
)
Parameters
check: The vector to be compared
with: Comparison is made with this vector

Definition at line 23 of file manip.h.

24 {
25  assert ( check.is_sorted() && with.is_sorted() &&
26  "For this method I assumed you sorted your stuff...");
27 
28  T result;
29  size_t i = 0, j = 0;
30 
31  while (i < check.n_elem && j < with.n_elem)
32  {
33  if (check[i] < with[j])
34  {
35  result.resize(result.n_elem+1);
36  result[result.n_elem-1] = check[i];
37  i++;
38  }
39  else if (check[i] > with[j])
40  j++;
41  else
42  {
43  i++;
44  j++;
45  }
46  }
47 
48  // Append remaining elements from check
49  while (i < check.n_elem)
50  {
51  result.resize(result.n_elem+1);
52  result[result.n_elem-1] = check[i];
53  i++;
54  }
55 
56  return result;
57 }

References data::SetDiff().

Referenced by data::SetDiff().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ Split() [1/6]

template<typename T >
void data::Split ( const arma::Mat< T > &  input,
arma::Mat< T > &  trainData,
arma::Mat< T > &  testData,
const size_t  trainNum 
)
Parameters
input: Input dataset to split.
trainData: Matrix to store training data into.
testData: Matrix to store test data into.
trainNum: number of training points desired.

Definition at line 208 of file manip.h.

212 {
213  const arma::uvec order =
214  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
215 
216  trainData = input.cols( order.head(trainNum) );
217  testData = input.cols( order.tail(input.n_cols-trainNum) );
218 
219 }

References data::Split().

+ Here is the call graph for this function:

◆ Split() [2/6]

template<typename T , typename U >
void data::Split ( const arma::Mat< T > &  input,
const arma::Row< U > &  inputLabel,
arma::Mat< T > &  trainData,
arma::Mat< T > &  testData,
arma::Row< U > &  trainLabel,
arma::Row< U > &  testLabel,
const size_t  trainNum 
)
Parameters
input: Input dataset to split.
inputLabel: Input labels to split.
trainData: Matrix to store training data into.
testData: Matrix to store test data into.
trainLabel: Vector to store training labels into.
testLabel: Vector to store test labels into.
trainNum: number of training points desired.

Definition at line 162 of file manip.h.

169 {
170  const arma::uvec order =
171  arma::shuffle(arma::regspace<arma::uvec>(0, input.n_cols - 1));
172 
173  trainData = input.cols(order.rows(0,trainNum-1));
174  trainLabel = inputLabel.cols(order.rows(0,trainNum-1));
175 
176  testData = input.cols(order.rows(trainNum,input.n_cols-1));
177  testLabel = inputLabel.cols(order.rows(trainNum,input.n_cols-1));
178 }

References data::Split().

Referenced by data::Split().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ Split() [3/6]

template<typename T , typename U >
std::tuple<arma::Mat<T>, arma::Mat<T>, arma::Row<U>, arma::Row<U> > data::Split ( const arma::Mat< T > &  input,
const arma::Row< U > &  inputLabel,
const size_t  trainNum 
)
Parameters
input: Input dataset to split.
inputLabel: Input labels to split.
trainNum: number of training points desired.
Returns
std::tuple containing trainData (arma::Mat<T>), testData (arma::Mat<T>), trainLabel (arma::Row<U>), and testLabel (arma::Row<U>).

Definition at line 257 of file manip.h.

260 {
261  arma::Mat<T> trainData;
262  arma::Mat<T> testData;
263  arma::Row<U> trainLabel;
264  arma::Row<U> testLabel;
265 
266  Split(input, inputLabel, trainData, testData, trainLabel, testLabel,
267  trainNum);
268 
269  return std::make_tuple(std::move(trainData),
270  std::move(testData),
271  std::move(trainLabel),
272  std::move(testLabel));
273 }
void Split(const arma::Mat< T > &input, const arma::Row< U > &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, arma::Row< U > &trainLabel, arma::Row< U > &testLabel, const size_t trainNum)
Definition: manip.h:162

References data::Split().

+ Here is the call graph for this function:

◆ Split() [4/6]

template<typename T >
std::tuple<arma::Mat<T>, arma::Mat<T> > data::Split ( const arma::Mat< T > &  input,
const size_t  trainNum 
)
Parameters
input: Input dataset to split.
trainNum: number of training points desired.
Returns
std::tuple containing trainData (arma::Mat<T>) and testData (arma::Mat<T>).

Definition at line 302 of file manip.h.

304 {
305  arma::Mat<T> trainData;
306  arma::Mat<T> testData;
307  Split(input, trainData, testData, trainNum);
308 
309  return std::make_tuple(std::move(trainData),
310  std::move(testData));
311 }

References data::Split().

+ Here is the call graph for this function:

◆ Split() [5/6]

template<typename T , class O = DTYPE>
void data::Split ( const T &  dataset,
T &  trainset,
T &  testset,
const O  testRatio 
)
Parameters
dataset: dataset to be split
trainset: training set to be filled
testset: test set to be filled
testRatio: percentage of test set

Definition at line 344 of file manip.h.

348 {
349 
350  trainset = dataset; testset = dataset;
351 
352  mlpack::data::Split(dataset.inputs_, dataset.labels_,
353  trainset.inputs_, testset.inputs_,
354  trainset.labels_, testset.labels_, testRatio);
355 
356  trainset.Update(trainset.inputs_,trainset.labels_);
357  testset.Update(testset.inputs_,testset.labels_);
358 }

References data::Split().

+ Here is the call graph for this function:

◆ Split() [6/6]

template<typename T >
void data::Split ( const T &  dataset,
T &  trainset,
T &  testset,
const size_t  trainNum 
)
Parameters
dataset: dataset to be split
trainset: training set to be filled
testset: test set to be filled
trainNum: number of training points

Definition at line 320 of file manip.h.

324 {
325 
326  trainset = dataset; testset = dataset;
327 
328  Split(dataset.inputs_, dataset.labels_,
329  trainset.inputs_, testset.inputs_,
330  trainset.labels_, testset.labels_, trainNum);
331 
332  trainset.Update(trainset.inputs_,trainset.labels_);
333  testset.Update(testset.inputs_,testset.labels_);
334 
335 }

References data::Split().

+ Here is the call graph for this function:

◆ StratifiedSplit() [1/4]

template<typename T , typename LabelsType , typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value>>
void data::StratifiedSplit ( const arma::Mat< T > &  input,
const LabelsType &  inputLabel,
arma::Mat< T > &  trainData,
arma::Mat< T > &  testData,
LabelsType &  trainLabel,
LabelsType &  testLabel,
const double  testRatio,
const bool  shuffleData = true 
)

Unfortunately, mlpack's stratified split has an issue when the dataset is balanced, so this implementation is taken from mlpack with a minor fix.

Parameters
input: Input dataset to stratify.
inputLabel: Input labels to stratify.
trainData: Matrix to store training data into.
testData: Matrix to store test data into.
trainLabel: Vector to store training labels into.
testLabel: Vector to store test labels into.
testRatio: ratio of test set
shuffleData: If true, the sample order is shuffled; otherwise, each sample is visited in linear order. (Default true.)

Definition at line 480 of file manip.h.

488 {
489  const bool typeCheck = (arma::is_Row<LabelsType>::value)
490  || (arma::is_Col<LabelsType>::value);
491  if (!typeCheck)
492  throw std::runtime_error("data::Split(): when stratified sampling is done, "
493  "labels must have type `arma::Row<>`!");
494  mlpack::util::CheckSameSizes(input, inputLabel, "data::Split()");
495 
496  size_t trainIdx = 0;
497  size_t testIdx = 0;
498  size_t trainSize = 0;
499  size_t testSize = 0;
500  arma::uvec labelCounts;
501  arma::uvec testLabelCounts;
502  typename LabelsType::elem_type maxLabel = inputLabel.max();
503 
504  labelCounts.zeros(maxLabel+1);
505  testLabelCounts.zeros(maxLabel+1);
506 
507  for (typename LabelsType::elem_type label : inputLabel)
508  ++labelCounts[label];
509 
510  for (arma::uword labelCount : labelCounts)
511  {
512  testSize += floor(labelCount * testRatio+1e-6);
513  trainSize += labelCount - floor(labelCount * testRatio+1e-6);
514  }
515 
516  trainData.set_size(input.n_rows, trainSize);
517  testData.set_size(input.n_rows, testSize);
518  trainLabel.set_size(inputLabel.n_rows, trainSize);
519  testLabel.set_size(inputLabel.n_rows, testSize);
520 
521  if (shuffleData)
522  {
523  arma::uvec order = arma::shuffle(
524  arma::linspace<arma::uvec>(0, input.n_cols - 1, input.n_cols));
525 
526  for (arma::uword i : order)
527  {
528  typename LabelsType::elem_type label = inputLabel[i];
529  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
530  {
531  testLabelCounts[label] += 1;
532  testData.col(testIdx) = input.col(i);
533  testLabel[testIdx] = inputLabel[i];
534  testIdx += 1;
535  }
536  else
537  {
538  trainData.col(trainIdx) = input.col(i);
539  trainLabel[trainIdx] = inputLabel[i];
540  trainIdx += 1;
541  }
542  }
543  }
544  else
545  {
546  for (arma::uword i = 0; i < input.n_cols; i++)
547  {
548  typename LabelsType::elem_type label = inputLabel[i];
549  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
550  {
551  testLabelCounts[label] += 1;
552  testData.col(testIdx) = input.col(i);
553  testLabel[testIdx] = inputLabel[i];
554  testIdx += 1;
555  }
556  else
557  {
558  trainData.col(trainIdx) = input.col(i);
559  trainLabel[trainIdx] = inputLabel[i];
560  trainIdx += 1;
561  }
562  }
563  }
564 }

◆ StratifiedSplit() [2/4]

template<typename T , typename LabelsType , typename = std::enable_if_t<arma::is_arma_type<LabelsType>::value>>
void data::StratifiedSplit ( const arma::Mat< T > &  input,
const LabelsType &  inputLabel,
arma::Mat< T > &  trainData,
arma::Mat< T > &  testData,
LabelsType &  trainLabel,
LabelsType &  testLabel,
const size_t  trainNum,
const bool  shuffleData = true 
)
Parameters
input: Input dataset to stratify.
inputLabel: Input labels to stratify.
trainData: Matrix to store training data into.
testData: Matrix to store test data into.
trainLabel: Vector to store training labels into.
testLabel: Vector to store test labels into.
trainNum: number of training points desired; the remaining points are used for the test set
shuffleData: If true, the sample order is shuffled; otherwise, each sample is visited in linear order. (Default true.)

Definition at line 377 of file manip.h.

385 {
386  const bool typeCheck = (arma::is_Row<LabelsType>::value)
387  || (arma::is_Col<LabelsType>::value);
388  if (!typeCheck)
389  throw std::runtime_error("data::Split(): when stratified sampling is done, "
390  "labels must have type `arma::Row<>`!");
391  mlpack::util::CheckSameSizes(input, inputLabel, "data::Split()");
392 
393  double testRatio = double(1) - double(trainNum)/double(inputLabel.n_elem);
394  size_t trainIdx = 0;
395  size_t testIdx = 0;
396  size_t trainSize = 0;
397  size_t testSize = 0;
398  arma::uvec labelCounts;
399  arma::uvec testLabelCounts;
400  typename LabelsType::elem_type maxLabel = inputLabel.max();
401 
402  labelCounts.zeros(maxLabel+1);
403  testLabelCounts.zeros(maxLabel+1);
404 
405  for (typename LabelsType::elem_type label : inputLabel)
406  ++labelCounts[label];
407 
408  for (arma::uword labelCount : labelCounts)
409  {
410  testSize += floor(labelCount * testRatio+1e-6);
411  trainSize += labelCount - floor(labelCount * testRatio+1e-6);
412  }
413 
414  trainData.set_size(input.n_rows, trainSize);
415  testData.set_size(input.n_rows, testSize);
416  trainLabel.set_size(inputLabel.n_rows, trainSize);
417  testLabel.set_size(inputLabel.n_rows, testSize);
418 
419  if (shuffleData)
420  {
421  arma::uvec order = arma::shuffle(
422  arma::linspace<arma::uvec>(0, input.n_cols - 1, input.n_cols));
423 
424  for (arma::uword i : order)
425  {
426  typename LabelsType::elem_type label = inputLabel[i];
427  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
428  {
429  testLabelCounts[label] += 1;
430  testData.col(testIdx) = input.col(i);
431  testLabel[testIdx] = inputLabel[i];
432  testIdx += 1;
433  }
434  else
435  {
436  trainData.col(trainIdx) = input.col(i);
437  trainLabel[trainIdx] = inputLabel[i];
438  trainIdx += 1;
439  }
440  }
441  }
442  else
443  {
444  for (arma::uword i = 0; i < input.n_cols; i++)
445  {
446  typename LabelsType::elem_type label = inputLabel[i];
447  if (testLabelCounts[label] < floor(labelCounts[label] * testRatio+1e-6))
448  {
449  testLabelCounts[label] += 1;
450  testData.col(testIdx) = input.col(i);
451  testLabel[testIdx] = inputLabel[i];
452  testIdx += 1;
453  }
454  else
455  {
456  trainData.col(trainIdx) = input.col(i);
457  trainLabel[trainIdx] = inputLabel[i];
458  trainIdx += 1;
459  }
460  }
461  }
462 }

◆ StratifiedSplit() [3/4]

template<typename T >
void data::StratifiedSplit ( const T &  dataset,
T &  trainset,
T &  testset,
const double  testRatio 
)

Given a dataset, split into a training set and test set with stratification

Parameters
dataset: dataset to be split
trainset: training set to be filled
testset: test set to be filled
testRatio: percentage of test set

Definition at line 603 of file manip.h.

607 {
608 
609  assert ( (typeid(T) == typeid(Dataset<arma::Row<size_t>>) ||
610  typeid(T) == typeid(oml::Dataset<size_t>)) &&
611  "StratifiedSplit can only be used for classification dataset type...");
612 
613  trainset = dataset; testset = dataset;
614 
615  mlpack::data::StratifiedSplit(dataset.inputs_, dataset.labels_,
616  trainset.inputs_, testset.inputs_,
617  trainset.labels_, testset.labels_, testRatio);
618 
619  trainset.Update(trainset.inputs_,trainset.labels_);
620  testset.Update(testset.inputs_,testset.labels_);
621 }

◆ StratifiedSplit() [4/4]

template<typename T >
void data::StratifiedSplit ( const T &  dataset,
T &  trainset,
T &  testset,
const size_t  trainNum 
)

Given a dataset, split into a training set and test set with stratification

Parameters
dataset: dataset to be split
trainset: training set to be filled
testset: test set to be filled
trainNum: number of training points

Definition at line 575 of file manip.h.

579 {
580  assert ( ( typeid(T) == typeid(Dataset<arma::Row<size_t>>) ||
581  typeid(T) == typeid(oml::Dataset<size_t>)) &&
582  "StratifiedSplit can only be used for classification dataset type...");
583 
584  trainset = dataset; testset = dataset;
585 
586  StratifiedSplit(dataset.inputs_, dataset.labels_,
587  trainset.inputs_, testset.inputs_,
588  trainset.labels_, testset.labels_, trainNum);
589 
590  trainset.Update(trainset.inputs_,trainset.labels_);
591  testset.Update(testset.inputs_,testset.labels_);
592 }
void StratifiedSplit(const arma::Mat< T > &input, const LabelsType &inputLabel, arma::Mat< T > &trainData, arma::Mat< T > &testData, LabelsType &trainLabel, LabelsType &testLabel, const size_t trainNum, const bool shuffleData=true)
Definition: manip.h:377