You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@systemds.apache.org by ss...@apache.org on 2022/08/19 18:52:13 UTC
[systemds] branch main updated: [MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6d84711dce [MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)
6d84711dce is described below
commit 6d84711dce9142f3e2542356ab328a122025b622
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Fri Aug 19 12:34:53 2022 +0200
[MINOR] Cleanups in builtin scripts (i.e., removing unnecessary variables and branches)
---
scripts/builtin/bandit.dml | 1 +
scripts/builtin/executePipeline.dml | 69 +++++++++++-----------
scripts/builtin/mice.dml | 2 +-
scripts/builtin/outlierByIQR.dml | 3 +-
scripts/builtin/outlierBySd.dml | 43 ++++----------
scripts/builtin/underSampling.dml | 22 +++----
scripts/pipelines/scripts/utils.dml | 27 +++++----
.../spark/ParameterizedBuiltinSPInstruction.java | 2 +
.../builtin/part2/BuiltinUnderSamplingTest.java | 2 +-
.../functions/builtin/underSamplingTest.dml | 5 +-
10 files changed, 81 insertions(+), 95 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 43e3a1a9c5..d27a7d1058 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -640,6 +640,7 @@ run_with_hyperparamNested = function(Frame[Unknown] ph_pip, Integer r_i = 1, Mat
parfor(i in 1:nrow(ph_pip), check = 0) # , opt=CONSTRAINED, mode=REMOTE_SPARK
{
+ evalFunOutput = as.matrix(0)
# execute configurations with r resources
op = removeEmpty(target=ph_pip[i], margin="cols")
# print("PIPELINE EXECUTION START ... "+toString(op))
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 38f110be6c..a9cd918bc2 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,7 +57,6 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
{
internalStates = list()
mask=as.matrix(metaList['mask'])
- FD = as.matrix(metaList['fd'])
applyFunc = as.frame(metaList['applyFunc'])
changesAll = 0.0
d = ncol(Xtrain)
@@ -74,7 +73,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
applyOp = toString(as.scalar(applyFunc[1,i]))
Xclone = Xtrain
XtestClone = Xtest
- [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op)
+ [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, as.matrix(metaList['fd']), hyperParameters[i,], flagsCount, op)
if(executeFlag == 1) {
L = evalList(op, hp)
[L, O] = remove(L, 1);
@@ -96,19 +95,21 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
[L, Y] = remove(L, 1);
Ytrain = as.matrix(Y)
}
- Xtrain = confirmMeta(Xtrain, mask)
- Xtest = confirmMeta(Xtest, mask)
+ # Xtrain = confirmMeta(Xtrain, mask)
+ # Xtest = confirmMeta(Xtest, mask)
}
else {
- print("not applying "+op+" executeFlag = 0")
+ print("not applying operation executeFlag = 0")
}
- if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone)) {
+ if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone) & ncol(hpForPruning) > 1) {
changesSingle = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) > 0.001 )
changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 )
if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1, i]) == "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
- [hpForPruning, changesByOp] = storeDataForPrunning(pipeline, hyperParameters, hpForPruning, changesByOp, changesSingle, i)
+
+ hpForPruning[1, i] = hyperParameters[i, 2]
+ changesByOp[1, i] = changesSingle
}
}
}
@@ -191,33 +192,33 @@ return(Matrix[Double] X,Integer executeFlag)
else X = X
}
-confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] X)
-{
- if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
- {
- # get the max + 1 for nan replacement
- nanMask = is.na(X)
- # replace nan
- X = replace(target = X, pattern = NaN, replacement = 9999)
- # take categorical out
- cat = removeEmpty(target=X, margin="cols", select = mask)
- # round categorical (if there is any floating point)
- cat = round(cat)
- less_than_1_mask = cat < 1
- less_than_1 = less_than_1_mask * 9999
- cat = (cat * (less_than_1_mask == 0)) + less_than_1
- # reconstruct original X
- X = X * (mask == 0)
- q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
- select=t(mask)), ncol(cat), ncol(X))
- X = (cat %*% q) + X
-
- # put nan back
- nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
- X = X + nanMask
- }
-}
+# confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
+# return (Matrix[Double] X)
+# {
+ # if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
+ # {
+ # # get the max + 1 for nan replacement
+ # nanMask = is.na(X)
+ # # replace nan
+ # X = replace(target = X, pattern = NaN, replacement = 9999)
+ # # take categorical out
+ # cat = removeEmpty(target=X, margin="cols", select = mask)
+ # # round categorical (if there is any floating point)
+ # cat = round(cat)
+ # less_than_1_mask = cat < 1
+ # less_than_1 = less_than_1_mask * 9999
+ # cat = (cat * (less_than_1_mask == 0)) + less_than_1
+ # # reconstruct original X
+ # X = X * (mask == 0)
+ # q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
+ # select=t(mask)), ncol(cat), ncol(X))
+ # X = (cat %*% q) + X
+
+ # # put nan back
+ # nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
+ # X = X + nanMask
+ # }
+# }
confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 24ffaccf5b..ca2c4592e6 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -180,7 +180,7 @@ return (Matrix[Double] dist){
dist = matrix(1, 1, ncol(X))
X = replace(target=X, pattern=0, replacement=max(X)+1)
- parfor(i in 1:ncol(X))
+ for(i in 1:ncol(X))
{
if(as.scalar(mask[,i]) == 1)
{
diff --git a/scripts/builtin/outlierByIQR.dml b/scripts/builtin/outlierByIQR.dml
index 7abf43c065..bd15bb0f65 100644
--- a/scripts/builtin/outlierByIQR.dml
+++ b/scripts/builtin/outlierByIQR.dml
@@ -114,7 +114,8 @@ compute_quartiles = function(Matrix[Double] X)
if(nrow(X) > 1) {
parfor(i in 1:cols) {
isNull = is.na(X[, i])
- Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
+ sel = (isNull == 0)
+ Xt = removeEmpty(target=X[, i], margin="rows", select=sel)
colQ1[,i] = quantile(Xt, 0.25)
colQ3[,i] = quantile(Xt, 0.75)
}
diff --git a/scripts/builtin/outlierBySd.dml b/scripts/builtin/outlierBySd.dml
index 0e7a192f3f..c53ed1cfef 100644
--- a/scripts/builtin/outlierBySd.dml
+++ b/scripts/builtin/outlierBySd.dml
@@ -51,9 +51,8 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod =
while( max_iterations == 0 | counter < max_iterations )
{
- colSD = getColSd(X)
- colMean = getColMean(X)
-
+ [colMean, colSD] = getColMean_Sd(X)
+
upperBound = colMean + k * colSD
lowerBound = colMean - k * colSD
@@ -74,27 +73,19 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod =
}
out = X
if(verbose) {
- print("last outlier filter:\n"+ toString(outlierFilter))
- print("Total executed iterations = "+counter)
print("Upper-bound of data was calculated using Mean + k * Standard Deviation")
print("lower-bound of data was calculated using Mean - k * Standard Deviation")
print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier")
if(sum(out) == 0)
print("output is a zero matrix due to iterative evaluation of outliers ")
- print("output:\n"+ toString(out))
}
- bounds = rbind(lowerBound, upperBound)
}
fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 2)
- return(Matrix[Double] fixed_X)
+ return(Matrix[Double] X)
{
- rows = nrow(X)
- cols = ncol(X)
- if(repairMethod == 0) {
- sel = (rowMaxs(outlierFilter) == 0)
- X = removeEmpty(target = X, margin = "rows", select = sel)
- }
+ if(repairMethod == 0)
+ X = removeEmpty(target = X, margin = "rows", select = (rowMaxs(outlierFilter) == 0))
else if(repairMethod == 1)
X = (outlierFilter == 0) * X
else if (repairMethod == 2) {
@@ -103,31 +94,19 @@ fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integ
}
else
stop("outlierBySd: invalid argument - repair required 0-1 found: "+repairMethod)
-
- fixed_X = X
}
-getColSd = function(Matrix[Double] X)
-return(Matrix[Double] colSd)
-{
- colSd = matrix(0, 1, ncol(X))
- for(i in 1:ncol(X))
- {
- isNull = is.na(X[, i])
- Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
- colSd[1, i] = sd(Xt)
- }
-}
-
-getColMean = function(Matrix[Double] X)
-return(Matrix[Double] colMean)
+getColMean_Sd = function(Matrix[Double] X)
+return(Matrix[Double] colMean, Matrix[Double] colSd)
{
colMean = matrix(0, 1, ncol(X))
+ colSd = matrix(0, 1, ncol(X))
for(i in 1:ncol(X))
{
- isNull = is.na(X[, i])
- Xt = removeEmpty(target=X[, i], margin="rows", select=(isNull == 0))
+ Xt = replace(target=X[, i], pattern=NaN, replacement=0)
+ Xt = removeEmpty(target=Xt, margin="rows")
colMean[1, i] = mean(Xt)
+ colSd[1, i] = sd(Xt)
}
}
diff --git a/scripts/builtin/underSampling.dml b/scripts/builtin/underSampling.dml
index 48d601ed73..8256ffd485 100644
--- a/scripts/builtin/underSampling.dml
+++ b/scripts/builtin/underSampling.dml
@@ -44,19 +44,19 @@ return(Matrix[Double] X, Matrix[Double] Y)
# # get the minority class
classes = table(Y, 1)
# # # get the minority class
- minority = as.scalar(rowIndexMin(t(classes)))
+ maxClass = as.scalar(rowIndexMax(t(classes)))
# # # separate the minority class
- notMin = (Y != matrix(minority, rows=nrow(Y), cols=1))
- dX = cbind(seq(1, nrow(X)), X)
+ notMin = (Y == maxClass)
+ dX = seq(1, nrow(X))
majority = removeEmpty(target=dX, margin="rows", select=notMin)
# # # formulate the undersampling ratio
- u_ratio = floor(nrow(majority) * ratio)
- # take the samples for oversampling
- u_sample = sample(nrow(majority), u_ratio)
- u_select = table(u_sample, 1, 1, nrow(majority), 1)
- u_select = u_select * majority[, 1]
- u_select = removeEmpty(target = u_select, margin = "rows")
+ # take the samples for undersampling
+ u_select = rand(rows=nrow(majority), cols=1, min=1, max=2, sparsity=(ratio), seed=1)
+ u_select = u_select > 0
+ u_select = u_select * majority
+ u_select = removeEmpty(target = u_select, margin = "rows")
u_select1 = table(u_select, 1, 1, nrow(X), 1)
- X = removeEmpty(target=X, margin="rows", select = (u_select1 == 0))
- Y = removeEmpty(target=Y, margin="rows", select = (u_select1 == 0))
+ sel = (u_select1 == 0)
+ X = removeEmpty(target=X, margin="rows", select = sel)
+ Y = removeEmpty(target=Y, margin="rows", select = sel)
}
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 7fb95297df..3f9378c9d2 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -78,6 +78,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
sampledY = eY
}
}
+ print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
# #######################################################################
@@ -138,16 +139,16 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
print(prefix+" convert strings to lower case");
data = map(data, "x -> x.toLowerCase()")
# step 2 fix invalid lengths
- # q0 = 0.05
- # q1 = 0.95
- # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
+ q0 = 0.05
+ q1 = 0.95
+ print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
- # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+ [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
# # step 3 fix swap values
- # print(prefix+" value swap fixing");
- # data = valueSwap(data, schema)
+ print(prefix+" value swap fixing");
+ data = valueSwap(data, schema)
# step 3 drop invalid types
print(prefix+" drop values with type mismatch");
@@ -155,8 +156,8 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
# step 5 porter stemming on all features
- # print(prefix+" porter-stemming on all features");
- # data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ print(prefix+" porter-stemming on all features");
+ data = map(data, "x -> PorterStemmer.stem(x)", 0)
# step 6 typo correction
if(CorrectTypos)
@@ -202,20 +203,20 @@ return(Frame[Unknown] data)
data = map(data, "x -> x.toLowerCase()")
# step 2 fix invalid lengths
- # q0 = 0.05
- # q1 = 0.95
+ q0 = 0.05
+ q1 = 0.95
- # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+ [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
# # step 3 fix swap values
- # data = valueSwap(data, schema)
+ data = valueSwap(data, schema)
# step 3 drop invalid types
data = dropInvalidType(data, schema)
# step 5 porter stemming on all features
- # data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ data = map(data, "x -> PorterStemmer.stem(x)", 0)
# step 6 typo correction
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
index 7135f141c6..7bb547673c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
@@ -686,6 +686,8 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction
_off.getBlock(1, (int) arg0._1().getColumnIndex()));
// execute remove empty operations
+ System.out.println("offset: "+offsets.getValue().getNumRows());
+ System.out.println("_rmRows: "+_rmRows);
ArrayList<IndexedMatrixValue> out = new ArrayList<>();
LibMatrixReorg.rmempty(data, offsets, _rmRows, _len, _blen, out);
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
index ea9ca8da8f..45e025aae8 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinUnderSamplingTest.java
@@ -47,7 +47,7 @@ public class BuiltinUnderSamplingTest extends AutomatedTestBase {
@Test
public void test_CP2() {
- runUnderSamplingTest(0.5, Types.ExecType.CP);
+ runUnderSamplingTest(0.4, Types.ExecType.CP);
}
diff --git a/src/test/scripts/functions/builtin/underSamplingTest.dml b/src/test/scripts/functions/builtin/underSamplingTest.dml
index d59d2832c3..ad1bb5fabb 100644
--- a/src/test/scripts/functions/builtin/underSamplingTest.dml
+++ b/src/test/scripts/functions/builtin/underSamplingTest.dml
@@ -20,7 +20,7 @@
#-------------------------------------------------------------
ratio = as.double($1)
-X = rand(rows=20, cols=4, min=1, max =100)
+X = rand(rows=20, cols=4, min=1, max =100, seed=1)
Y = rbind(matrix(1, rows=15, cols=1), matrix(2, rows=5, cols=1))
classesUnBalanced = table(Y[, ncol(Y)], 1)
# # # randomize the data
@@ -31,7 +31,8 @@ Y = P %*% Y
[balancedX, balancedY] = underSampling(X, Y, ratio)
classesBalanced = table(balancedY, 1)
-out = as.scalar(classesUnBalanced[1] - classesBalanced[1]) == floor(15.0*ratio)
+out = as.scalar(classesUnBalanced[1] - classesBalanced[1]) >= (floor(15.0*ratio) - 1) &
+ as.scalar(classesUnBalanced[1] - classesBalanced[1]) <= (floor(15.0*ratio) + 1)
print(out)