You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/09/14 17:33:43 UTC

[systemds] 01/02: fixing bug

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit e3ee3720c03307b447e69e0c949b0c9ae5eb3c2d
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Mon Sep 6 14:30:18 2021 +0200

    fixing bug
    
    for winsorizing
    
    whatever
    
    I am sleepy
    
    crossV fix
---
 scripts/builtin/bandit.dml                         | 19 ++++----
 scripts/builtin/tomeklink.dml                      | 51 ++++++++++++++++++++++
 scripts/builtin/topk_cleaning.dml                  |  2 +-
 .../functions/builtin/BuiltinTomeklinkTest.java    |  4 +-
 .../functions/pipelines/applyEvaluateTest.dml      |  6 +--
 .../pipelines/intermediates/classification/hp.csv  |  6 +--
 .../pipelines/intermediates/classification/lp.csv  |  2 +-
 .../pipelines/intermediates/classification/pip.csv |  6 +--
 8 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 28aa909..91abc08 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
   rows = 1, cols = NUM_FEATURES + 4 )
   frameList = list()
   
-  for(s in s_max:0) { # TODO convert to parfor
+  parfor(s in s_max:0, check=0) { # TODO convert to parfor
     
    # result variables
     bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH)
@@ -272,12 +272,11 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
         {
           pipList = list(lp = lp, ph = ph_pip[i], hp = hp_matrix, flags = no_of_flag_vars)
           [evalFunOutput, hpForPruning, changesByOp] = crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, hpForPruning=hpForPruning, 
-          changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = FALSE)
-      
+          changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = 0)
         }
         else 
         {
-          [eXtrain, eYtrain, eXtest, eYtest, Tr] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
+          [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
             hyperParameters=hp_matrix, hpForPruning=hpForPruning, changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
           if(max(eYtrain) == min(eYtrain)) 
             print("Y contains only one class")
@@ -603,8 +602,8 @@ return (String s)
 }
 
 crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList,
-  Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Boolean trainML = FALSE) 
-return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+  Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Integer trainML = 0) 
+return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
 {
   accuracyMatrix = matrix(0, cvk, 1)
   dataList = list()
@@ -652,10 +651,12 @@ return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] cha
         changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
     }
     # print("test out: "+nrow(testy))
-    res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 0))
-    accuracyMatrix[i] = res
+    res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = trainML))
+    accuracyMatrix[i] = res[1, 1]
+    evalFunHp = res[, 2:ncol(res)]
   }
   accuracy = as.matrix(mean(accuracyMatrix))
+  output = cbind(accuracy, evalFunHp)
 }
 
 pruningSignal = function(Frame[Unknown] ph_pip, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -680,5 +681,3 @@ return(Boolean execute)
   }
   execute = !(changeCount > 0)
 }
-
-
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 6169dbf..a48f0b7 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -74,6 +74,8 @@ return (Matrix[Double] nn) {
     dists = rowSums((X - X[i,])^2) 
     dists[i,] = NaN; # mask out self-ref
     nn[i, 1] = rowIndexMin(t(dists))
+    # res = naiveKNNsearch(X, X[i], 2)
+    # nn[i,  1] = res[1,2]
   }
 }
 
@@ -86,3 +88,52 @@ return (Matrix[Double] tomek_links) {
   links = (y != majority_label) & (nn_labels == majority_label)
   tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
 }
+
+
+# Naive (brute-force) kNN search: for every row of Q, return the row indices in P of its K nearest rows.
+naiveKNNsearch = function(
+    Matrix[Double] P,  # candidate points, one per row
+    Matrix[Double] Q,  # query points, one per row; needs ncol(P) columns for P %*% t(Q) below
+    Integer K          # number of neighbours to return per query
+)return(
+    Matrix[Double] O   # one row per query holding row indices into P: K columns in the else-branch, rowIndexMin output when K == 1
+){
+  num_records = nrow (P);   # NOTE(review): assigned but never read in this function
+  num_features = ncol (P);  # NOTE(review): assigned but never read in this function
+  num_queries = nrow (Q);
+  Qt = t(Q);
+  PQt = P %*% Qt;           # pairwise dot products: entry (r, c) pairs P row r with Q row c
+  P2 = rowSums (P ^ 2);     # squared norm of every row of P, as a column vector
+  D = -2 * PQt + P2;        # ||p||^2 - 2*p.q: squared distance minus ||q||^2, which is constant per column so per-query ranking is unaffected
+  if (K == 1) {
+    Dt = t(D);              # transpose so that each row of Dt belongs to one query
+    O = rowIndexMin (Dt);   # position of the smallest score in each query's row
+  } else {
+    O = matrix (0, rows = num_queries, cols = K);
+    parfor (i in 1:num_queries) {  # each iteration writes only its own row O[i,]
+      D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE);  # ascending sort of query i's scores; index.return=TRUE presumably yields row indices rather than values - confirm against the DML reference
+      O[i,] = t(D_sorted[1:K,1]);  # keep the first K entries, laid out as one row
+    }
+  }
+}
+
+
+
+# # approximate knn search via k-means (unfinished draft, kept commented out)
+# KNNApprox = function(
+    # Matrix[Double] P,
+    # Matrix[Double] Q,
+    # Integer K
+# )return(
+    # Matrix[Double] O
+# ){
+  
+# [C, Y] = kmeans(X, nrow(X)/ncol(X), 25, 50, 0.0001, TRUE, 50, 1324)
+# clusX = cbind(Y, X)
+# clusX = order(target=X, by=1, decreasing=FALSE, index.return=FALSE);
+# clus = table(Y, 1)
+
+
+# Y_1 = kmeansPredict(X, C)
+# }
+
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index 75ee184..1b32e3c 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -106,7 +106,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
                    
   tab = table(eYtrain, 1)
   dist = nrow(tab)
-  if((nrow(eYtrain) > 0 & dist < 10))
+  if(FALSE) #(nrow(eYtrain) > 0 & dist < 10)
     logical = logicalSeedCI
   else 
     logical = logicalSeedNoCI
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
index 00f0b36..411be29 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
@@ -36,8 +36,8 @@ public class BuiltinTomeklinkTest extends AutomatedTestBase
 	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinTomeklinkTest.class.getSimpleName() + "/";
 
 	private final static double eps = 1e-3;
-	private final static int rows = 53;
-	private final static int cols = 6;
+	private final static int rows = 50000;
+	private final static int cols = 60;
 
 	@Override
 	public void setUp() {
diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
index 813ef94..6edd239 100644
--- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
+++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
@@ -82,8 +82,8 @@ return(Matrix[Double] accuracy)
 
   beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), 
     maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE);
-  [prob, yhat, a] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
-  accuracy = getAccuracy(Ytest, yhat, TRUE)
-  print("accuracy weighted: "+accuracy)
+  [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
+  a = getAccuracy(Ytest, yhat, TRUE)
+  print("accuracy: "+ accuracy+", accuracy weighted: "+a)
   accuracy = as.matrix(accuracy)
 }
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 3d5c3ff..f92bc2f 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-40.0,2.0,0.01816863223655686,0.9565161479438591,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6515164788504212,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-40.0,2.0,0.03510876761722913,0.9673791862807241,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6149768032146687,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-40.0,2.0,0.014861839294898092,0.9595626659056867,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6274449265973082,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,5.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,7.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,2.0,2.0,1.0,0,0,0,1.0,0,1.0,0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
index 52b30dc..5824d76 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
@@ -1 +1 @@
-OTLR,EC,EC,CI,DUMMY
+ED,MVI,DUMMY
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 2feb234..f2d0efb 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
+outlierBySd,forward_fill,dummycoding
+outlierBySd,forward_fill,dummycoding
+outlierBySd,forward_fill,dummycoding