You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/03/17 22:34:39 UTC
[systemds] branch master updated: [MINOR] Added support for
categorical features in SMOTE
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 797ab88 [MINOR] Added support for categorical features in SMOTE
797ab88 is described below
commit 797ab881507ad2389aa947430411e04256fc1801
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Wed Mar 17 22:48:11 2021 +0100
[MINOR] Added support for categorical features in SMOTE
---
scripts/builtin/smote.dml | 56 ++++++++++++++++++----
.../builtin/BuiltinGaussianClassifierTest.java | 3 +-
.../test/functions/builtin/BuiltinSmoteTest.java | 41 +++++++++++-----
src/test/scripts/functions/builtin/smote.dml | 3 +-
4 files changed, 82 insertions(+), 21 deletions(-)
diff --git a/scripts/builtin/smote.dml b/scripts/builtin/smote.dml
index 1a416ea..c6fc751 100644
--- a/scripts/builtin/smote.dml
+++ b/scripts/builtin/smote.dml
@@ -21,12 +21,14 @@
# Builtin function for handling class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
+# by Nitesh V. Chawla et al., Journal of Artificial Intelligence Research 16 (2002), 321–357
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Matrix of minority class samples
+# mask Double --- 0/1 mask vector where 0 represents a numeric column and 1 represents a categorical column
# s Integer 200 Amount of SMOTE (percentage of oversampling), integral multiple of 100
# k Integer 1 Number of nearest neighbour
# ---------------------------------------------------------------------------------------------
@@ -38,7 +40,7 @@
# ---------------------------------------------------------------------------------------------
# Y Double --- Matrix of (N/100)-1 * nrow(X) synthetic minority class samples
-m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
+m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
return (Matrix[Double] Y) {
if(s < 100 | (s%%100) != 0)
@@ -46,18 +48,19 @@ return (Matrix[Double] Y) {
print("the number of samples should be an integral multiple of 100. Setting s = 100")
s = 100
}
-
if(k < 1) {
print("k should not be less than 1. Setting k value to default k = 1.")
k = 1
}
+ if(ncol(mask) != ncol(X))
+ stop("column mismatch: no. of columns in mask vector should be equal to no. of columns in data matrix")
# matrix to keep the index of KNN for each minority sample
knn_index = matrix(0,k,nrow(X))
# find nearest neighbour
for(i in 1:nrow(X))
{
- knn = nn(X, X[i, ], k)
+ knn = nn(X, X[i, ], mask, k)
knn_index[, i] = knn
}
@@ -79,13 +82,28 @@ return (Matrix[Double] Y) {
# pick the random NN
knn_sample = knn_index[as.scalar(rand_index[iter+1]),]
# generate sample
- for(i in 1:ncol(knn_index))
- {
+ for(i in 1:ncol(knn_index)) {
index = as.scalar(knn_sample[1,i])
+
X_diff = X[index,] - X[i, ]
gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
+ # generate synthetic sample
X_sys = X[i, ] + (gap*X_diff)
+ # for nominal features replace their value with majority voting
+ if(sum(mask) > 0) {
+ categorical = X_sys * mask
+ # get all nn values
+ computation_matrix = table(knn_index[,i], knn_index[, i], nrow(X), nrow(X))
+ nn_X = computation_matrix %*% X
+ nn_X = removeEmpty(target=nn_X, margin = "rows")
+ nn_X = nn_X * mask
+ freq = getFrequentValue(nn_X)
+ categorical = (categorical > 0) * freq
+ X_sys = X_sys * (mask == 0)
+ X_sys = X_sys + categorical
+ }
synthetic_samples[iter*ncol(knn_index)+i,] = X_sys;
+
}
iter = iter + 1
}
@@ -97,19 +115,41 @@ return (Matrix[Double] Y) {
}
+# as described in the paper, for categorical columns the difference is computed by replacing each
+# non-zero categorical difference with the median of the standard deviations of the numerical columns
-
-nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
+nn = function(Matrix[Double] X, Matrix[Double] instance, Matrix[Double] mask, Integer k )
return (Matrix[Double] knn_)
{
+ # Returns the indices of the rows of X that are nearest to `instance`, taken from the
+ # ascending distance order with its first (closest) entry skipped - normally that entry
+ # is the instance itself when it is a row of X.
+ # mask is a 0/1 row vector: 1 marks a categorical column, 0 marks a numeric column.
if(nrow(X) < k)
stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")
- # compute the euclidean distance
diff = X - instance
+ diff_nominal = diff * mask
+ # check for ANY non-zero categorical difference; summing the signed differences could
+ # cancel out to 0 and silently skip the categorical handling below
+ if(sum(diff_nominal != 0) > 0) {
+ only_number = removeEmpty(target=X, margin="cols", select=(mask==0))
+ num_std = colSds(only_number)
+ num_std_median = median(t(num_std))
+ # every differing categorical cell contributes the median std. dev. of the numeric columns
+ diff_nominal = (diff_nominal != 0)
+ diff_nominal = diff_nominal * num_std_median
+ diff = diff_nominal + (diff * (mask==0))
+ }
+ # euclidean distance of every row of X to the instance
square_diff = diff^2
distance = sqrt(rowSums(square_diff))
sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return = TRUE)
knn_ = sort_dist[2:k+1,]
}
+getFrequentValue = function(Matrix[Double] X)
+return (Matrix[Double] freq)
+{
+ # Column-wise mode of X: freq[1,i] receives the most frequent value of column i.
+ # Columns whose sum is 0 are skipped and keep the initial 0.
+ freq = matrix(0, rows=1, cols=ncol(X))
+ for(i in 1:ncol(X))
+ {
+ if(sum(X[, i]) != 0) {
+ # size the count vector by the largest category code instead of nrow(X), so that
+ # codes larger than the number of rows are not left out of the count
+ cat_counts = table(X[, i], 1, as.integer(max(X[, i])), 1); # counts for each category
+ freq[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode
+ }
+ }
+}
+
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
index 38ac980..c1a277b 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
@@ -85,6 +85,7 @@ public class BuiltinGaussianClassifierTest extends AutomatedTestBase
public void testGaussianClassifier(int rows, int cols, double sparsity, int classes)
{
+ setOutputBuffering(true);
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
@@ -136,7 +137,7 @@ public class BuiltinGaussianClassifierTest extends AutomatedTestBase
double[][] invcovsSYSTEMDS = TestUtils.convertHashMapToDoubleArray(invcovsSYSTEMDStemp);
TestUtils.compareMatrices(priorR, priorSYSTEMDS, Math.pow(10, -5.0), "priorR", "priorSYSTEMDS");
- TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 5L,5L, this.toString());
+ TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 10L,10L, this.toString());
TestUtils.compareMatricesBitAvgDistance(determinantsR, determinantsSYSTEMDS, (long)2E+12,(long)2E+12, this.toString());
TestUtils.compareMatricesBitAvgDistance(invcovsR, invcovsSYSTEMDS, (long)2E+20,(long)2E+20, this.toString());
}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
index 0c1fd77..c750719 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
@@ -49,29 +49,45 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
@Test
public void testSmote0CP() {
- runSmoteTest(100, 1, LopProperties.ExecType.CP);
+ double[][] mask = new double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(100, 3, mask, LopProperties.ExecType.CP);
}
@Test
public void testSmote1CP() {
- runSmoteTest(300, 10, LopProperties.ExecType.CP);
+ double[][] mask = new double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}};
+ runSmoteTest(300, 10, mask, LopProperties.ExecType.CP);
}
@Test
public void testSmote2CP() {
- runSmoteTest(400, 5, LopProperties.ExecType.CP);
+ double[][] mask = new double[][]{{1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(400, 5, mask, LopProperties.ExecType.CP);
}
@Test
- public void testSmote1Spark() {
- runSmoteTest(300, 3, LopProperties.ExecType.SPARK);
+ public void testSmote3CP() {
+ double[][] mask = new double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(300, 3, mask, LopProperties.ExecType.CP);
}
@Test
- public void testSmote2Spark() { runSmoteTest(400, 5, LopProperties.ExecType.SPARK); }
+ public void testSmote4CP() {
+ double[][] mask = new double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(400, 5, mask, LopProperties.ExecType.CP); }
+
+ // Spark run with categorical columns 1, 8, 9 and 10; needs @Test or JUnit never executes it
+ @Test
+ public void testSmote3Spark() {
+ double[][] mask = new double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(300, 3, mask, LopProperties.ExecType.SPARK);
+ }
+
+ @Test
+ public void testSmote4Spark() {
+ double[][] mask = new double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+ runSmoteTest(400, 5, mask, LopProperties.ExecType.SPARK); }
+
- private void runSmoteTest(int sample, int nn, LopProperties.ExecType instType) {
+ private void runSmoteTest(int sample, int nn, double[][] mask, LopProperties.ExecType instType) {
Types.ExecMode platformOld = setExecMode(instType);
boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -81,13 +97,16 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
- programArgs = new String[] {"-nvargs", "X=" + input("X"), "S=" + sample, "K=" + nn , "Z="+output("Sum"), "T="+input("T")};
-
- double[][] X = getRandomMatrix(rows, colsX, 0, 1, 0.3, 1);
+ programArgs = new String[] {"-nvargs", "X=" + input("X"), "S=" + sample, "M="+input("M"),
+ "K=" + nn , "Z="+output("Sum"), "T="+input("T")};
+ double[][] X = getRandomMatrix(rows, colsX, 1, 10, 1, 1);
+ X = TestUtils.round(X);
writeInputMatrixWithMTD("X", X, true);
+ writeInputMatrixWithMTD("M", mask, true);
- double[][] T = getRandomMatrix(rows, colsX, 2, 3.0, 0.3, 3);
+ double[][] T = getRandomMatrix(rows, colsX, 20, 30, 1, 3);
+ T = TestUtils.round(T);
writeInputMatrixWithMTD("T", T, true);
diff --git a/src/test/scripts/functions/builtin/smote.dml b/src/test/scripts/functions/builtin/smote.dml
index 5a8d5d6..8385f1b 100644
--- a/src/test/scripts/functions/builtin/smote.dml
+++ b/src/test/scripts/functions/builtin/smote.dml
@@ -21,7 +21,8 @@
A = read($X);
-B = smote(X = A, s=$S, k=$K, verbose=TRUE);
+M = read($M)
+B = smote(X = A, mask=M, s=$S, k=$K, verbose=TRUE);
# test if all points fall in the same cluster (close to each other)
# read some new data T != A