You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/09/14 17:33:43 UTC
[systemds] 01/02: fixing bug
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit e3ee3720c03307b447e69e0c949b0c9ae5eb3c2d
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Mon Sep 6 14:30:18 2021 +0200
fixing bug
for winsorizing
whatever
I am sleepy
crossV fix
---
scripts/builtin/bandit.dml | 19 ++++----
scripts/builtin/tomeklink.dml | 51 ++++++++++++++++++++++
scripts/builtin/topk_cleaning.dml | 2 +-
.../functions/builtin/BuiltinTomeklinkTest.java | 4 +-
.../functions/pipelines/applyEvaluateTest.dml | 6 +--
.../pipelines/intermediates/classification/hp.csv | 6 +--
.../pipelines/intermediates/classification/lp.csv | 2 +-
.../pipelines/intermediates/classification/pip.csv | 6 +--
8 files changed, 73 insertions(+), 23 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 28aa909..91abc08 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
rows = 1, cols = NUM_FEATURES + 4 )
frameList = list()
- for(s in s_max:0) { # TODO convert to parfor
+ parfor(s in s_max:0, check=0) { # converted to parfor; check=0 skips the loop dependency analysis
# result variables
bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH)
@@ -272,12 +272,11 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
{
pipList = list(lp = lp, ph = ph_pip[i], hp = hp_matrix, flags = no_of_flag_vars)
[evalFunOutput, hpForPruning, changesByOp] = crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, hpForPruning=hpForPruning,
- changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = FALSE)
-
+ changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = 0)
}
else
{
- [eXtrain, eYtrain, eXtest, eYtest, Tr] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
+ [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
hyperParameters=hp_matrix, hpForPruning=hpForPruning, changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
@@ -603,8 +602,8 @@ return (String s)
}
crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList,
- Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Boolean trainML = FALSE)
-return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+ Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Integer trainML = 0)
+return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
{
accuracyMatrix = matrix(0, cvk, 1)
dataList = list()
@@ -652,10 +651,12 @@ return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] cha
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
}
# print("test out: "+nrow(testy))
- res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 0))
- accuracyMatrix[i] = res
+ res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = trainML))
+ accuracyMatrix[i] = res[1, 1]
+ evalFunHp = res[, 2:ncol(res)]
}
accuracy = as.matrix(mean(accuracyMatrix))
+ output = cbind(accuracy, evalFunHp)
}
pruningSignal = function(Frame[Unknown] ph_pip, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -680,5 +681,3 @@ return(Boolean execute)
}
execute = !(changeCount > 0)
}
-
-
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 6169dbf..a48f0b7 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -74,6 +74,8 @@ return (Matrix[Double] nn) {
dists = rowSums((X - X[i,])^2)
dists[i,] = NaN; # mask out self-ref
nn[i, 1] = rowIndexMin(t(dists))
+ # res = naiveKNNsearch(X, X[i], 2)
+ # nn[i, 1] = res[1,2]
}
}
@@ -86,3 +88,52 @@ return (Matrix[Double] tomek_links) {
links = (y != majority_label) & (nn_labels == majority_label)
tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
}
+
+
+# Naive (brute-force) KNN search: for each row of Q, returns the row indices of its K nearest rows in P.
+naiveKNNsearch = function(
+ Matrix[Double] P,  # reference points, one per row
+ Matrix[Double] Q,  # query points, one per row; must have as many columns as P for P %*% t(Q)
+ Integer K  # number of neighbours to return per query
+)return(
+ Matrix[Double] O  # num_queries x K matrix of row indices into P, nearest first
+){
+ num_records = nrow (P);  # not used below
+ num_features = ncol (P);  # not used below
+ num_queries = nrow (Q);
+ Qt = t(Q);
+ PQt = P %*% Qt;  # pairwise dot products, one row per record and one column per query
+ P2 = rowSums (P ^ 2);  # squared norm of each row of P
+ D = -2 * PQt + P2;  # squared Euclidean distance minus ||q||^2; that term is constant within a column, so the per-query ranking is unchanged
+ if (K == 1) {
+ Dt = t(D);  # one row per query
+ O = rowIndexMin (Dt);  # index of the closest record for each query
+ } else {
+ O = matrix (0, rows = num_queries, cols = K);
+ parfor (i in 1:num_queries) {  # each iteration reads column i of D and writes only row i of O
+ D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE);  # record indices sorted by ascending distance to query i
+ O[i,] = t(D_sorted[1:K,1]);  # keep the K closest
+ }
+ }
+}
+
+
+
+# # approximate knn search via kmeans (unfinished draft, left commented out)
+# KNNApprox = function(
+ # Matrix[Double] P,
+ # Matrix[Double] Q,
+ # Integer K
+# )return(
+ # Matrix[Double] O
+# ){
+
+# [C, Y] = kmeans(X, nrow(X)/ncol(X), 25, 50, 0.0001, TRUE, 50, 1324)
+# clusX = cbind(Y, X)
+# clusX = order(target=X, by=1, decreasing=FALSE, index.return=FALSE);
+# clus = table(Y, 1)
+
+
+# Y_1 = kmeansPredict(X, C)
+# }
+
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index 75ee184..1b32e3c 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -106,7 +106,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
tab = table(eYtrain, 1)
dist = nrow(tab)
- if((nrow(eYtrain) > 0 & dist < 10))
+ if(FALSE) #(nrow(eYtrain) > 0 & dist < 10)
logical = logicalSeedCI
else
logical = logicalSeedNoCI
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
index 00f0b36..411be29 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java
@@ -36,8 +36,8 @@ public class BuiltinTomeklinkTest extends AutomatedTestBase
private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinTomeklinkTest.class.getSimpleName() + "/";
private final static double eps = 1e-3;
- private final static int rows = 53;
- private final static int cols = 6;
+ private final static int rows = 50000;
+ private final static int cols = 60;
@Override
public void setUp() {
diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
index 813ef94..6edd239 100644
--- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
+++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
@@ -82,8 +82,8 @@ return(Matrix[Double] accuracy)
beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE);
- [prob, yhat, a] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
- accuracy = getAccuracy(Ytest, yhat, TRUE)
- print("accuracy weighted: "+accuracy)
+ [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
+ a = getAccuracy(Ytest, yhat, TRUE)
+ print("accuracy: "+ accuracy+", accuracy weighted: "+a)
accuracy = as.matrix(accuracy)
}
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 3d5c3ff..f92bc2f 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-40.0,2.0,0.01816863223655686,0.9565161479438591,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6515164788504212,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-40.0,2.0,0.03510876761722913,0.9673791862807241,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6149768032146687,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-40.0,2.0,0.014861839294898092,0.9595626659056867,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6274449265973082,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,5.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,7.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+27.0,3.0,2.0,2.0,1.0,0,0,0,1.0,0,1.0,0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
index 52b30dc..5824d76 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
@@ -1 +1 @@
-OTLR,EC,EC,CI,DUMMY
+ED,MVI,DUMMY
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 2feb234..f2d0efb 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
-winsorize,imputeByMean,imputeByMedian,abstain,dummycoding
+outlierBySd,forward_fill,dummycoding
+outlierBySd,forward_fill,dummycoding
+outlierBySd,forward_fill,dummycoding