You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2022/08/19 18:52:13 UTC
[systemds] branch main updated: [MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 6d84711dce [MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)
6d84711dce is described below

commit 6d84711dce9142f3e2542356ab328a122025b622
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Fri Aug 19 12:34:53 2022 +0200

    [MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)
---
 scripts/builtin/bandit.dml                         |  1 +
 scripts/builtin/executePipeline.dml                | 69 +++++++++++-----------
 scripts/builtin/mice.dml                           |  2 +-
 scripts/builtin/outlierByIQR.dml                   |  3 +-
 scripts/builtin/outlierBySd.dml                    | 43 ++++----------
 scripts/builtin/underSampling.dml                  | 22 +++----
 scripts/pipelines/scripts/utils.dml                | 27 +++++----
 .../spark/ParameterizedBuiltinSPInstruction.java   |  2 +
 .../builtin/part2/BuiltinUnderSamplingTest.java    |  2 +-
 .../functions/builtin/underSamplingTest.dml        |  5 +-
 10 files changed, 81 insertions(+), 95 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 43e3a1a9c5..d27a7d1058 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -640,6 +640,7 @@ run_with_hyperparamNested = function(Frame[Unknown] ph_pip, Integer r_i = 1, Mat
 
   parfor(i in 1:nrow(ph_pip), check = 0) # , opt=CONSTRAINED, mode=REMOTE_SPARK
   {
+    evalFunOutput = as.matrix(0)
     # execute configurations with r resources
     op = removeEmpty(target=ph_pip[i], margin="cols")
     # print("PIPELINE EXECUTION START ... "+toString(op))
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 38f110be6c..a9cd918bc2 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,7 +57,6 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
 {
   internalStates = list()
   mask=as.matrix(metaList['mask'])
-  FD = as.matrix(metaList['fd'])
   applyFunc = as.frame(metaList['applyFunc'])
   changesAll = 0.0
   d = ncol(Xtrain)
@@ -74,7 +73,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
     applyOp = toString(as.scalar(applyFunc[1,i]))
     Xclone = Xtrain
     XtestClone = Xtest
-    [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op)
+    [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, as.matrix(metaList['fd']), hyperParameters[i,], flagsCount, op)
     if(executeFlag == 1) {
       L = evalList(op, hp)
       [L, O] = remove(L, 1);
@@ -96,19 +95,21 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
         [L, Y] =  remove(L, 1);
         Ytrain = as.matrix(Y)
       }
-      Xtrain = confirmMeta(Xtrain, mask)
-      Xtest = confirmMeta(Xtest, mask)
+      # Xtrain = confirmMeta(Xtrain, mask)
+      # Xtest = confirmMeta(Xtest, mask)
     }
     else {
-      print("not applying "+op+" executeFlag = 0")
+      print("not applying operation executeFlag = 0")
     }
     
-    if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone)) {
+    if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone) & ncol(hpForPruning) > 1) {
       changesSingle = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0))  > 0.001 )
       changesAll  = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=Xorig, pattern=NaN, replacement=0))  > 0.001 )
     
       if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1, i]) == "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
-        [hpForPruning, changesByOp] = storeDataForPrunning(pipeline, hyperParameters, hpForPruning,  changesByOp, changesSingle, i)
+        
+        hpForPruning[1, i] = hyperParameters[i, 2]
+        changesByOp[1, i] = changesSingle
       }
     }
   }
@@ -191,33 +192,33 @@ return(Matrix[Double] X,Integer executeFlag)
   else X = X
 }
 
-confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] X)
-{
-  if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
-  {
-    # get  the max + 1 for nan replacement
-    nanMask = is.na(X)
-    # replace nan
-    X = replace(target = X, pattern = NaN, replacement = 9999)
-    # take categorical out
-    cat = removeEmpty(target=X, margin="cols", select = mask)
-    # round categorical (if there is any floating  point)
-    cat = round(cat)
-    less_than_1_mask = cat < 1
-    less_than_1 = less_than_1_mask * 9999
-    cat = (cat * (less_than_1_mask == 0)) +  less_than_1
-    # reconstruct original X
-    X = X * (mask == 0)
-    q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
-      select=t(mask)), ncol(cat), ncol(X))
-    X = (cat %*% q) + X 
-
-    # put nan back
-    nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
-    X = X + nanMask
-  }
-}
+# confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
+# return (Matrix[Double] X)
+# {
+  # if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
+  # {
+    # # get  the max + 1 for nan replacement
+    # nanMask = is.na(X)
+    # # replace nan
+    # X = replace(target = X, pattern = NaN, replacement = 9999)
+    # # take categorical out
+    # cat = removeEmpty(target=X, margin="cols", select = mask)
+    # # round categorical (if there is any floating  point)
+    # cat = round(cat)
+    # less_than_1_mask = cat < 1
+    # less_than_1 = less_than_1_mask * 9999
+    # cat = (cat * (less_than_1_mask == 0)) +  less_than_1
+    # # reconstruct original X
+    # X = X * (mask == 0)
+    # q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+      # select=t(mask)), ncol(cat), ncol(X))
+    # X = (cat %*% q) + X 
+
+    # # put nan back
+    # nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
+    # X = X + nanMask
+  # }
+# }
 
 
 confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 24ffaccf5b..ca2c4592e6 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -180,7 +180,7 @@ return (Matrix[Double] dist){
  
   dist = matrix(1, 1, ncol(X))
   X = replace(target=X, pattern=0, replacement=max(X)+1)
-  parfor(i in 1:ncol(X))
+  for(i in 1:ncol(X))
   {
     if(as.scalar(mask[,i]) == 1)
     {
diff --git a/scripts/builtin/outlierByIQR.dml b/scripts/builtin/outlierByIQR.dml
index 7abf43c065..bd15bb0f65 100644
--- a/scripts/builtin/outlierByIQR.dml
+++ b/scripts/builtin/outlierByIQR.dml
@@ -114,7 +114,8 @@ compute_quartiles = function(Matrix[Double] X)
   if(nrow(X) > 1) {
     parfor(i in 1:cols) {
       isNull = is.na(X[, i])
-      Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
+      sel = (isNull == 0)
+      Xt = removeEmpty(target=X[, i], margin="rows", select=sel)
       colQ1[,i] = quantile(Xt, 0.25)
       colQ3[,i] = quantile(Xt, 0.75)
     }
diff --git a/scripts/builtin/outlierBySd.dml b/scripts/builtin/outlierBySd.dml
index 0e7a192f3f..c53ed1cfef 100644
--- a/scripts/builtin/outlierBySd.dml
+++ b/scripts/builtin/outlierBySd.dml
@@ -51,9 +51,8 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod =
 
   while( max_iterations == 0 | counter < max_iterations )
   {
-    colSD = getColSd(X)
-    colMean = getColMean(X)
-
+    [colMean, colSD] = getColMean_Sd(X)
+    
     upperBound = colMean + k * colSD
     lowerBound = colMean - k * colSD
 
@@ -74,27 +73,19 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod =
   }
   out = X
   if(verbose) {
-    print("last outlier filter:\n"+ toString(outlierFilter))
-    print("Total executed iterations = "+counter)
     print("Upper-bound of data was calculated using Mean + k * Standard Deviation")
     print("lower-bound of data was calculated using Mean - k * Standard Deviation")
     print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier")
     if(sum(out) == 0)
       print("output is a zero matrix due to iterative evaluation of outliers ")
-    print("output:\n"+ toString(out))
   }
-  bounds = rbind(lowerBound, upperBound)
 }
 
 fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 2)
-  return(Matrix[Double] fixed_X)
+  return(Matrix[Double] X)
 {
-  rows = nrow(X)
-  cols = ncol(X)
-  if(repairMethod == 0) {
-    sel = (rowMaxs(outlierFilter) == 0)
-    X = removeEmpty(target = X, margin = "rows", select = sel)
-  }
+  if(repairMethod == 0) 
+    X = removeEmpty(target = X, margin = "rows", select = (rowMaxs(outlierFilter) == 0))
   else if(repairMethod == 1)
     X = (outlierFilter == 0) * X
   else if (repairMethod == 2) {    
@@ -103,31 +94,19 @@ fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integ
   }
   else
     stop("outlierBySd: invalid argument - repair required 0-1 found: "+repairMethod)
-
-  fixed_X = X
 }
 
-getColSd = function(Matrix[Double] X)
-return(Matrix[Double] colSd)
-{
-  colSd = matrix(0, 1, ncol(X))
-  for(i in 1:ncol(X))
-  {
-    isNull = is.na(X[, i])
-    Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
-    colSd[1, i] = sd(Xt)
-  }
-}
-
-getColMean = function(Matrix[Double] X)
-return(Matrix[Double] colMean)
+getColMean_Sd = function(Matrix[Double] X)
+return(Matrix[Double] colMean, Matrix[Double] colSd)
 {
   colMean = matrix(0, 1, ncol(X))
+  colSd = matrix(0, 1, ncol(X))
   for(i in 1:ncol(X))
   {
-    isNull = is.na(X[, i])
-    Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
+    Xt = replace(target=X[, i], pattern=NaN, replacement=0)
+    Xt = removeEmpty(target=Xt, margin="rows")
     colMean[1, i] = mean(Xt)
+    colSd[1, i] = sd(Xt)
   }
 }
 
diff --git a/scripts/builtin/underSampling.dml b/scripts/builtin/underSampling.dml
index 48d601ed73..8256ffd485 100644
--- a/scripts/builtin/underSampling.dml
+++ b/scripts/builtin/underSampling.dml
@@ -44,19 +44,19 @@ return(Matrix[Double] X, Matrix[Double] Y)
   # # get the minority class
   classes = table(Y, 1)
   # # # get the minority class
-  minority = as.scalar(rowIndexMin(t(classes)))
+  maxClass = as.scalar(rowIndexMax(t(classes)))
   # # # separate the minority class
-  notMin = (Y != matrix(minority, rows=nrow(Y), cols=1))
-  dX = cbind(seq(1, nrow(X)), X)
+  notMin = (Y == maxClass)
+  dX = seq(1, nrow(X))
   majority = removeEmpty(target=dX, margin="rows", select=notMin)
   # # # formulate the undersampling ratio
-  u_ratio = floor(nrow(majority) * ratio)
-  # take the samples for oversampling
-  u_sample = sample(nrow(majority), u_ratio)
-  u_select = table(u_sample, 1, 1, nrow(majority), 1)
-  u_select = u_select * majority[, 1]
-  u_select = removeEmpty(target = u_select, margin = "rows")
+  # take the samples for undersampling
+  u_select = rand(rows=nrow(majority), cols=1, min=1, max=2, sparsity=(ratio), seed=1)
+  u_select = u_select > 0
+  u_select = u_select * majority
+  u_select = removeEmpty(target = u_select, margin = "rows")  
   u_select1 = table(u_select, 1, 1, nrow(X), 1)
-  X = removeEmpty(target=X, margin="rows", select = (u_select1 == 0))
-  Y = removeEmpty(target=Y, margin="rows", select = (u_select1 == 0))
+  sel = (u_select1 == 0)
+  X = removeEmpty(target=X, margin="rows", select = sel)
+  Y = removeEmpty(target=Y, margin="rows", select = sel)
 }
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 7fb95297df..3f9378c9d2 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -78,6 +78,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
       sampledY = eY 
     }
   }
+  print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
 }
 
 # #######################################################################
@@ -138,16 +139,16 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
   print(prefix+" convert strings to lower case");
   data = map(data, "x -> x.toLowerCase()")
   # step 2 fix invalid lengths
-  # q0 = 0.05
-  # q1 = 0.95
-  # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
+  q0 = 0.05
+  q1 = 0.95
+  print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
 
-  # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+  [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
 
   
     # # step 3 fix swap values
-  # print(prefix+" value swap fixing");
-  # data = valueSwap(data, schema)
+  print(prefix+" value swap fixing");
+  data = valueSwap(data, schema)
   
   # step 3 drop invalid types
   print(prefix+" drop values with type mismatch");
@@ -155,8 +156,8 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
 
 
   # step 5 porter stemming on all features
-  # print(prefix+" porter-stemming on all features");
-  # data = map(data, "x -> PorterStemmer.stem(x)", 0)
+  print(prefix+" porter-stemming on all features");
+  data = map(data, "x -> PorterStemmer.stem(x)", 0)
 
   # step 6 typo correction  
   if(CorrectTypos)
@@ -202,20 +203,20 @@ return(Frame[Unknown] data)
   data = map(data, "x -> x.toLowerCase()")
   # step 2 fix invalid lengths
 
-  # q0 = 0.05
-  # q1 = 0.95
+  q0 = 0.05
+  q1 = 0.95
 
-  # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+  [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
 
   # # step 3 fix swap values
-  # data = valueSwap(data, schema)
+  data = valueSwap(data, schema)
 
   # step 3 drop invalid types
   data = dropInvalidType(data, schema)
 
 
   # step 5 porter stemming on all features
-  # data = map(data, "x -> PorterStemmer.stem(x)", 0)
+  data = map(data, "x -> PorterStemmer.stem(x)", 0)
 
   
   # step 6 typo correction  
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
index 7135f141c6..7bb547673c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
@@ -686,6 +686,8 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction
 					_off.getBlock(1, (int) arg0._1().getColumnIndex()));
 
 			// execute remove empty operations
+			System.out.println("offset: "+offsets.getValue().getNumRows());
+			System.out.println("_rmRows: "+_rmRows);
 			ArrayList<IndexedMatrixValue> out = new ArrayList<>();
 			LibMatrixReorg.rmempty(data, offsets, _rmRows, _len, _blen, out);
 
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
index ea9ca8da8f..45e025aae8 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
@@ -47,7 +47,7 @@ public class BuiltinUnderSamplingTest extends AutomatedTestBase {
 	@Test
 	public void test_CP2() {
 
-		runUnderSamplingTest(0.5, Types.ExecType.CP);
+		runUnderSamplingTest(0.4, Types.ExecType.CP);
 
 	}
 
diff --git a/src/test/scripts/functions/builtin/underSamplingTest.dml b/src/test/scripts/functions/builtin/underSamplingTest.dml
index d59d2832c3..ad1bb5fabb 100644
--- a/src/test/scripts/functions/builtin/underSamplingTest.dml
+++ b/src/test/scripts/functions/builtin/underSamplingTest.dml
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 ratio = as.double($1)
-X = rand(rows=20, cols=4, min=1, max =100)
+X = rand(rows=20, cols=4, min=1, max =100, seed=1)
 Y = rbind(matrix(1, rows=15, cols=1), matrix(2, rows=5, cols=1))
 classesUnBalanced = table(Y[, ncol(Y)], 1)
 # # # randomize the data
@@ -31,7 +31,8 @@ Y = P %*% Y
 
 [balancedX, balancedY] = underSampling(X, Y, ratio)
 classesBalanced = table(balancedY, 1)
-out = as.scalar(classesUnBalanced[1] - classesBalanced[1]) == floor(15.0*ratio)
+out = as.scalar(classesUnBalanced[1] - classesBalanced[1]) >= (floor(15.0*ratio) - 1) &
+      as.scalar(classesUnBalanced[1] - classesBalanced[1]) <= (floor(15.0*ratio) + 1)
 print(out)