You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/09/27 08:43:22 UTC

[systemds] branch master updated: [MINOR] Cleanups in cleaning pipelines (validation conditions, typos etc.) Closes #1396.

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 9673336  [MINOR] Cleanups in cleaning pipelines (validation conditions, typos etc.) Closes #1396.
9673336 is described below

commit 96733360c8f600355d5600f2edb8960ba1d47861
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Mon Sep 27 10:41:40 2021 +0200

    [MINOR] Cleanups in cleaning pipelines (validation conditions, typos etc.)
    Closes #1396.
---
 scripts/builtin/executePipeline.dml                              | 9 +++++----
 .../pipelines/BuiltinTopkCleaningClassificationTest.java         | 1 +
 .../sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java   | 2 +-
 .../functions/pipelines/intermediates/classification/bestAcc.csv | 6 +++---
 .../pipelines/intermediates/classification/dirtyScore.csv        | 2 +-
 .../functions/pipelines/intermediates/classification/evalHp.csv  | 2 +-
 .../functions/pipelines/intermediates/classification/hp.csv      | 6 +++---
 .../functions/pipelines/intermediates/classification/lp.csv      | 2 +-
 .../functions/pipelines/intermediates/classification/pip.csv     | 6 +++---
 9 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 304feda..d21037f 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -199,7 +199,7 @@ return (Matrix[Double] X)
     nX = nX[, 1: ncol(nX) - 1]
   
   }
-  if(dataFlag == 0 & (sum(mask) > 0))
+  if(dataFlag == 0 & (sum(mask) > 0) & (sum(mask) != ncol(nX)))
   {
     maxDummy = max(nX) + 1
     nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
@@ -219,7 +219,7 @@ return (Matrix[Double] X)
     X = replace(target = X, pattern = maxDummy, replacement = NaN)
     X = replace(target = X, pattern = -1111, replacement = NaN)
   }
-  else if(dataFlag == 1 & (sum(mask) > 0))
+  else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(nX)))
   {
     maxDummy = max(nX) + 1
     nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
@@ -287,7 +287,8 @@ return (Matrix[Double] X_filled)
       for(i in 1: nrow(FD))
       {
         for(j in 1:ncol(FD)) {
-          if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) & (sum(FD[, j]) != nrow(FD)))
+          if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) & (sum(FD[, j]) != nrow(FD))
+            & (as.scalar(fdMask[1, j]) != 0) & (as.scalar(fdMask[1, i]) != 0))
             X = imputeByFD(X, i, j, threshold, FALSE)
         }
       }
@@ -364,7 +365,7 @@ fillDefault = function(Matrix[Double] X)
 return(Matrix[Double] X){
   defaullt = round(colMaxs(X) - colMins(X))
   Mask = is.na(X)
-  X = replace(target=X, pattern=NaN, replacement=max(X))
+  X = replace(target=X, pattern=NaN, replacement=0)
   Mask = Mask * defaullt
   X = X + Mask
  # print("fillDefault: no of NaNs "+sum(is.na(X)))
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index 47e1347..007fade 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -45,6 +45,7 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
 		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"R"}));
 	}
 
+	@Test
 	public void testFindBestPipelineCompany() {
 		runtopkCleaning(DATA_DIR+ "company.csv", RESOURCE+ "meta/meta_company.csv", 1.0, 3,5,
 			"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
index 5818128..7e02647 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
@@ -50,7 +50,7 @@ public class BuiltinTopkLogicalTest extends AutomatedTestBase {
 
 	@Test
 	public void testLogical1() {
-		runTestLogical(10, 5, 2, ExecMode.SINGLE_NODE);
+		runTestLogical(4, 5, 2, ExecMode.SINGLE_NODE);
 	}
 
 	@Test
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index 746303d..41e60e1 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-93.69369369369369
-93.69369369369369
-93.69369369369369
+64.88439306358381
+64.73988439306359
+64.73988439306359
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
index 69535a4..11f9760 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
@@ -1 +1 @@
-90.09009009009009
\ No newline at end of file
+63.72832369942196
\ No newline at end of file
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
index faeeae7..7404df9 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
@@ -1 +1 @@
-2.0,0.001,1.0,1000.0
+0,1.0,0.001,10.0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 668c597..1d75c70 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-32.0,2.0,0.0203644573130835,0.9538010240498609,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,1.0,0.6367394902267174,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.04436413689764156,0.9601592761408282,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,1.0,0.6541009026313958,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,3.0,0.0418452608516319,0.9715979748926613,1.0,0,0,1.0,1.0,0,0,0,1.0,0,1.0,0,2.0,0,2.0,1.0,0.6003640116471959,0,1.0,0,2.0,1.0,1.0,2.0,1.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+54.0,3.0,7.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,0,1.0,0.7053074081820746,0,0,0,0,1.0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+54.0,3.0,6.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,1.0,0.7784943734333777,0,0,0,0,1.0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+54.0,3.0,4.0,1.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,1.0,0.7567266322372848,0,0,0,0,1.0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
index fc7d67f..d4015e9 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
@@ -1 +1 @@
-OTLR,MVI,CI,DUMMY
+ED,MVI,EC,SCALE,CI,DUMMY
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index b88ec19..5e77238 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-winsorize,imputeByMedian,wtomeklink,dummycoding
-winsorize,imputeByMedian,wtomeklink,dummycoding
-outlierBySd,imputeByMean,abstain,dummycoding
+outlierBySd,imputeByMedian,imputeByMean,scale,abstain,dummycoding
+outlierBySd,imputeByMean,imputeByMedian,scale,abstain,dummycoding
+outlierBySd,imputeByMean,imputeByMedian,scale,abstain,dummycoding