You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/03/17 22:34:39 UTC

[systemds] branch master updated: [MINOR] Added support for categorical features in SMOTE

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 797ab88  [MINOR] Added support for categorical features in SMOTE
797ab88 is described below

commit 797ab881507ad2389aa947430411e04256fc1801
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Wed Mar 17 22:48:11 2021 +0100

    [MINOR] Added support for categorical features in SMOTE
---
 scripts/builtin/smote.dml                          | 56 ++++++++++++++++++----
 .../builtin/BuiltinGaussianClassifierTest.java     |  3 +-
 .../test/functions/builtin/BuiltinSmoteTest.java   | 41 +++++++++++-----
 src/test/scripts/functions/builtin/smote.dml       |  3 +-
 4 files changed, 82 insertions(+), 21 deletions(-)

diff --git a/scripts/builtin/smote.dml b/scripts/builtin/smote.dml
index 1a416ea..c6fc751 100644
--- a/scripts/builtin/smote.dml
+++ b/scripts/builtin/smote.dml
@@ -21,12 +21,14 @@
 
 
 # Builtin function for handing class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
+# by Nitesh V. Chawla et al., Journal of Artificial Intelligence Research 16 (2002), 321–357
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double   ---       Matrix of minority class samples 
+# mask             Double   ---       0/1 mask vector where 0 represents a numeric value and 1 represents a categorical value
 # s               Integer   25       Amount of SMOTE (percentage of oversampling), integral multiple of 100
 # k               Integer   1        Number of nearest neighbour
 # ---------------------------------------------------------------------------------------------
@@ -38,7 +40,7 @@
 # ---------------------------------------------------------------------------------------------
 # Y               Double   ---       Matrix of (N/100)-1 * nrow(X) synthetic minority class samples 
 
-m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean verbose = FALSE) 
+m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, Integer k = 1, Boolean verbose = FALSE) 
 return (Matrix[Double] Y) {
 
   if(s < 100 | (s%%100) != 0)
@@ -46,18 +48,19 @@ return (Matrix[Double] Y) {
     print("the number of samples should be an integral multiple of 100. Setting s = 100")
     s = 100
   }
-  
   if(k < 1) {
     print("k should not be less than 1. Setting k value to default k = 1.")
     k = 1
   }
+  if(ncol(mask) != ncol(X))
+    stop("column mismatch: no. of columns in mask vector should be equal to no. of columns in data matrix")
   
   # matrix to keep the index of KNN for each minority sample
   knn_index = matrix(0,k,nrow(X))
   # find nearest neighbour
   for(i in 1:nrow(X))
   {
-    knn = nn(X, X[i, ], k)
+    knn = nn(X, X[i, ], mask, k)
     knn_index[, i] = knn
   }
   
@@ -79,13 +82,28 @@ return (Matrix[Double] Y) {
     # pick the random NN
     knn_sample = knn_index[as.scalar(rand_index[iter+1]),] 
     # generate sample    
-    for(i in 1:ncol(knn_index))
-    {
+    for(i in 1:ncol(knn_index)) {
       index = as.scalar(knn_sample[1,i])
+
       X_diff = X[index,] - X[i, ]
       gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
+      # generate synthetic sample
       X_sys = X[i, ] + (gap*X_diff)
+      # for nominal features replace their value with majority voting
+      if(sum(mask) > 0) {
+        categorical = X_sys * mask
+        # get all nn values
+        computation_matrix = table(knn_index[,i], knn_index[, i], nrow(X), nrow(X))
+        nn_X = computation_matrix %*% X
+        nn_X = removeEmpty(target=nn_X, margin = "rows")
+        nn_X = nn_X * mask
+        freq = getFrequentValue(nn_X)
+        categorical = (categorical > 0) * freq
+        X_sys = X_sys * (mask == 0)
+        X_sys = X_sys + categorical
+      }
       synthetic_samples[iter*ncol(knn_index)+i,] = X_sys;
+
     }
     iter = iter + 1
   }
@@ -97,19 +115,41 @@ return (Matrix[Double] Y) {
 
 }
   
+# as described in the paper, for categorical columns compute the difference by replacing the 
+# categorical values with the median of the standard deviations of the numerical columns
 
-
-nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
+nn = function(Matrix[Double] X, Matrix[Double] instance, Matrix[Double] mask, Integer k )
 return (Matrix[Double] knn_)
 {
   if(nrow(X) < k)
     stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")
 
-  # compute the euclidean distance
   diff = X - instance
+  diff_nominal  = diff * mask
+  if(sum(diff_nominal) != 0) {
+    only_number = removeEmpty(target=X, margin="cols", select=(mask==0))
+    num_std = colSds(only_number)
+    num_std_median = median(t(num_std))
+    diff_nominal = (diff_nominal != 0)
+    diff_nominal = diff_nominal * num_std_median 
+    diff = diff_nominal + (diff * (mask==0))  
+  }
   square_diff = diff^2
   distance = sqrt(rowSums(square_diff))
   sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return =  TRUE)
   knn_ = sort_dist[2:k+1,]
 }
 
+getFrequentValue = function(Matrix[Double] X)
+return (Matrix[Double] freq)
+{
+  freq = matrix(0, rows=1, cols=ncol(X))
+  for(i in 1:ncol(X))
+  {
+    if(sum(X[, i]) != 0) {
+      cat_counts = table(X[, i], 1, nrow(X), 1);  # counts for each category
+      freq[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode
+    }
+  }
+}
+
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
index 38ac980..c1a277b 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
@@ -85,6 +85,7 @@ public class BuiltinGaussianClassifierTest extends AutomatedTestBase
 
 	public void testGaussianClassifier(int rows, int cols, double sparsity, int classes)
 	{
+		setOutputBuffering(true);
 		loadTestConfiguration(getTestConfiguration(TEST_NAME));
 		String HOME = SCRIPT_DIR + TEST_DIR;
 		fullDMLScriptName = HOME + TEST_NAME + ".dml";
@@ -136,7 +137,7 @@ public class BuiltinGaussianClassifierTest extends AutomatedTestBase
 		double[][] invcovsSYSTEMDS = TestUtils.convertHashMapToDoubleArray(invcovsSYSTEMDStemp);
 
 		TestUtils.compareMatrices(priorR, priorSYSTEMDS, Math.pow(10, -5.0), "priorR", "priorSYSTEMDS");
-		TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 5L,5L, this.toString());
+		TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 10L,10L, this.toString());
 		TestUtils.compareMatricesBitAvgDistance(determinantsR, determinantsSYSTEMDS, (long)2E+12,(long)2E+12, this.toString());
 		TestUtils.compareMatricesBitAvgDistance(invcovsR, invcovsSYSTEMDS, (long)2E+20,(long)2E+20, this.toString());
 	}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
index 0c1fd77..c750719 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
@@ -49,29 +49,45 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
 
 	@Test
 	public void testSmote0CP() {
-		runSmoteTest(100, 1, LopProperties.ExecType.CP);
+		double[][] mask =  new double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(100, 3, mask, LopProperties.ExecType.CP);
 	}
 
 	@Test
 	public void testSmote1CP() {
-		runSmoteTest(300, 10, LopProperties.ExecType.CP);
+		double[][] mask =  new double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}};
+		runSmoteTest(300, 10, mask, LopProperties.ExecType.CP);
 	}
 
 	@Test
 	public void testSmote2CP() {
-		runSmoteTest(400, 5, LopProperties.ExecType.CP);
+		double[][] mask =  new double[][]{{1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(400, 5, mask, LopProperties.ExecType.CP);
 	}
 
 	@Test
-	public void testSmote1Spark() {
-		runSmoteTest(300, 3, LopProperties.ExecType.SPARK);
+	public void testSmote3CP() {
+		double[][] mask =  new double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(300, 3, mask, LopProperties.ExecType.CP);
 	}
 
 	@Test
-	public void testSmote2Spark() { runSmoteTest(400, 5, LopProperties.ExecType.SPARK);	}
+	public void testSmote4CP() {
+		double[][] mask =  new double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(400, 5, mask, LopProperties.ExecType.CP);	}
 
+	public void testSmote3Spark() {
+		double[][] mask =  new double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(300, 3, mask, LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void testSmote4Spark() {
+		double[][] mask =  new double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+		runSmoteTest(400, 5, mask, LopProperties.ExecType.SPARK);	}
+		
 
-	private void runSmoteTest(int sample, int nn, LopProperties.ExecType instType) {
+	private void runSmoteTest(int sample, int nn, double[][] mask, LopProperties.ExecType instType) {
 		Types.ExecMode platformOld = setExecMode(instType);
 
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -81,13 +97,16 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
 			loadTestConfiguration(getTestConfiguration(TEST_NAME));
 			String HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = HOME + TEST_NAME + ".dml";
-			programArgs = new String[] {"-nvargs", "X=" + input("X"), "S=" + sample, "K=" + nn , "Z="+output("Sum"), "T="+input("T")};
-
-			double[][] X = getRandomMatrix(rows, colsX, 0, 1, 0.3, 1);
+			programArgs = new String[] {"-nvargs", "X=" + input("X"), "S=" + sample, "M="+input("M"),
+				"K=" + nn , "Z="+output("Sum"), "T="+input("T")};
 
+			double[][] X = getRandomMatrix(rows, colsX, 1, 10, 1, 1);
+			X = TestUtils.round(X);
 			writeInputMatrixWithMTD("X", X, true);
+			writeInputMatrixWithMTD("M", mask, true);
 
-			double[][] T = getRandomMatrix(rows, colsX, 2, 3.0, 0.3, 3);
+			double[][] T = getRandomMatrix(rows, colsX, 20, 30, 1, 3);
+			T = TestUtils.round(T);
 
 			writeInputMatrixWithMTD("T", T, true);
 
diff --git a/src/test/scripts/functions/builtin/smote.dml b/src/test/scripts/functions/builtin/smote.dml
index 5a8d5d6..8385f1b 100644
--- a/src/test/scripts/functions/builtin/smote.dml
+++ b/src/test/scripts/functions/builtin/smote.dml
@@ -21,7 +21,8 @@
 
 
 A = read($X);
-B = smote(X = A, s=$S, k=$K, verbose=TRUE);
+M = read($M)
+B = smote(X = A, mask=M, s=$S, k=$K, verbose=TRUE);
 
 # test if all point fall in same cluster (closed to each other)
 # read some new data T != A