You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2022/01/12 15:38:07 UTC
[systemds] branch main updated: [MINOR] Minor changes in logical enumeration - This commit introduces the default parameter values for the cleaning primitives - The logical enumeration now only uses the default parameter values to evaluate the primitives and iteratively adds the categories to logical pipelines until convergence. - The output is a single best logical pipeline
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 08944a7 [MINOR] Minor changes in logical enumeration - This commit introduces the default parameter values for the cleaning primitives - The logical enumeration now only uses the default parameter values to evaluate the primitives and iteratively adds the categories to logical pipelines until convergence. - The output is a single best logical pipeline
08944a7 is described below
commit 08944a7305cbc4f4d9cbbd4565efa8bcc93b82e3
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Wed Jan 12 16:36:58 2022 +0100
[MINOR] Minor changes in logical enumeration
- This commit introduces the default parameter values for the cleaning primitives
- The logical enumeration now only uses the default parameter values to evaluate the
primitives and iteratively adds the categories to logical pipelines until convergence.
- The output is a single best logical pipeline
---
scripts/builtin/applyAndEvaluate.dml | 19 +--
scripts/builtin/bandit.dml | 157 ++++++++++++---------
scripts/builtin/executePipeline.dml | 5 +-
scripts/builtin/fixInvalidLengthsApply.dml | 2 +-
scripts/builtin/frameSort.dml | 12 +-
scripts/builtin/frequencyEncodeApply.dml | 7 +-
scripts/builtin/topk_cleaning.dml | 49 +++----
scripts/pipelines/properties/param.csv | 44 +++---
scripts/pipelines/properties/properties.csv | 2 -
scripts/pipelines/scripts/enumerateLogical.dml | 128 +++++++----------
scripts/pipelines/scripts/utils.dml | 31 ++--
.../BuiltinTopkCleaningClassificationTest.java | 6 +-
.../pipelines/BuiltinTopkEvaluateTest.java | 1 +
.../pipelines/BuiltinTopkLogicalTest.java | 2 +-
.../intermediates/classification/bestAcc.csv | 6 +-
.../pipelines/intermediates/classification/hp.csv | 6 +-
.../pipelines/intermediates/classification/lp.csv | 2 +-
.../pipelines/intermediates/classification/pip.csv | 6 +-
.../functions/pipelines/topkLogicalTest.dml | 28 ++--
.../pipelines/topkcleaningClassificationTest.dml | 7 +-
.../pipelines/topkcleaningRegressionTest.dml | 4 +-
21 files changed, 264 insertions(+), 260 deletions(-)
diff --git a/scripts/builtin/applyAndEvaluate.dml b/scripts/builtin/applyAndEvaluate.dml
index 96e199d..e82fa79 100644
--- a/scripts/builtin/applyAndEvaluate.dml
+++ b/scripts/builtin/applyAndEvaluate.dml
@@ -54,13 +54,16 @@ s_applyAndEvaluate = function(Frame[Unknown] trainData, Frame[Unknown] testData,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] result)
{
+ print("logical: "+toString(lp))
no_of_flag_vars = 5
schema = metaData[1, 1:ncol(metaData) - 1]
mask = as.matrix(metaData[2, 1:ncol(metaData) - 1])
fdMask = as.matrix(metaData[3, 1:ncol(metaData) - 1])
maskY = as.integer(as.scalar(metaData[2, ncol(metaData)]))
- metaList = list(mask=mask, schema=schema, fd=fdMask)
-
+ idx = as.scalar(pip[, 1]) + 1
+ metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=pip[, (idx+1):ncol(pip)])
+ pip = pip[, 2:idx]
+ ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtrain, Ytrain] = getLabel(trainData, isLastLabel)
[Xtest, Ytest] = getLabel(testData, isLastLabel)
@@ -77,7 +80,7 @@ return (Matrix[Double] result)
}
# # # when the evaluation function is called first we also compute and keep hyperparams of target application
dirtyScore = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp)
- [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos)
+ [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)
# # # if mask has 1s then there are categorical features
[eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, FALSE, "recode")
@@ -93,7 +96,7 @@ return (Matrix[Double] result)
pipList = list(lp = lp, ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
# argList = list(X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, Xorig=clone_X, pipList=pipList, metaList=metaList, evalFunHp=evalFunHp, trainML=0)
# # # now test accuracy
- [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList,
+ [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList,
hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
@@ -113,17 +116,15 @@ return (Matrix[Double] result)
}
runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[String] schema,
- Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE)
+ Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE, List[Unknown] ctx)
return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
{
if(cv)
- Xtrain = utils::stringProcessing(data=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos)
+ [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
else
{
# # # binding train and test to use same dictionary for both
- XAll = utils::stringProcessing(data=rbind(Xtrain, Xtest), mask=mask, schema=schema, CorrectTypos=correctTypos)
- Xtrain = XAll[1:nrow(Xtrain),]
- Xtest = XAll[nrow(Xtrain)+1:nrow(XAll),]
+ [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtest, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
}
}
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 15fa734..92b90c3 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -115,8 +115,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
}
configurations = configurations[1:n_i, ]
- [outPip,outHp, f] = run_with_hyperparam(lp, configurations, r_i, X_train, Y_train, X_test, Y_test, metaList,
- evaluationFunc, evalFunHp, param, feaFrameOuter, cv, cvk, verbose)
+ [outPip,outHp, f] = run_with_hyperparam(lp=lp, ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
+ evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, featureFrameOuter=feaFrameOuter, cv=cv, cvk=cvk)
# sort the pipelines by order of accuracy decreasing
a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -130,7 +130,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
# sort the configurations for successive halving
avergae_perf = getMaxPerConf(outPip, nrow(configurations))
- configurations = frameSort(cbind(avergae_perf, configurations), TRUE)
+ sortMask = matrix(1, rows=1, cols=ncol(configurations))
+ configurations = frameSort(cbind(avergae_perf, configurations), cbind(as.matrix(0), sortMask), TRUE)
configurations = configurations[, 2:ncol(configurations)]
}
bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows")
@@ -154,10 +155,16 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
[bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, baseLineScore, k)
bestAccuracy = as.matrix(bestPipeline[,1])
- bestPipeline = bestPipeline[,2:ncol(bestPipeline)]
bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)]
imp = as.double(as.scalar(bestAccuracy[1, 1])) - as.double(baseLineScore)
perf = imp > 0
+ applyFunc = bestPipeline[, 2:ncol(bestPipeline)]
+ for(k in 1:nrow(bestPipeline))
+ {
+ applyFunc[k, ] = getParamMeta(bestPipeline[k, 2:ncol(bestPipeline)], param)
+ bestPipeline[k, 1] = as.frame(ncol(bestPipeline) - 1)
+ }
+ bestPipeline = cbind(bestPipeline, applyFunc)
if(verbose) {
print("dirty accuracy "+toString(baseLineScore))
print("best logical pipelines \n"+toString(lp))
@@ -177,7 +184,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
}
# this method will extract the physical pipelines for a given logical pipelines
-get_physical_configurations = function(Frame[String] logical, Scalar[int] numConfigs = 0,
+get_physical_configurations = function(Frame[String] logical, Scalar[int] numConfigs = 10,
Frame[Unknown] primitives)
return(Frame[String] physical, Double min)
{
@@ -260,7 +267,7 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
# this method will call the execute pipelines with their hyper-parameters
run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
- Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv, Integer cvk = 2, Boolean verbose)
+ Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv, Integer cvk = 2, Boolean default = FALSE)
return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Frame[Unknown] featureFrameOuter)
{
output_hp = matrix(0, nrow(ph_pip)*r_i, ncol(lp) * 5 * 3)
@@ -280,7 +287,7 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
- [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
+ [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i, default)
if(ncol(featureFrameOuter) > 1)
feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
pip_toString = pipToString(ph_pip[i])
@@ -355,35 +362,23 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
}
# extract the hyper-parameters for pipelines
-getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res)
+getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res, Boolean default)
return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer no_of_res, Integer NUM_META_FLAGS)
{
allParam = 0;
- START_INDEX = 8 # value from where the hyper-params starts after skipping meta flags
NUM_META_FLAGS = 5
+ NUM_DEFAULT_VALUES = 4
+
# load the hyper-parameters values
paramList = list()
# store the row indexes of the operator matches
- indexes = matrix(0, rows= ncol(pipeline), cols=1)
- paramCount = matrix(0, rows= ncol(pipeline), cols=1)
- applyList = hpList[, 1]
- hpList = hpList[, 2:ncol(hpList)]
- applyFunc = pipeline
- parfor(k in 1:ncol(pipeline))
- {
- op = as.scalar(pipeline[1,k])
- hasParam = map(hpList[,1], "x->x.split(\",\")[0].equals(\""+op+"\")")
- # convert the boolean vector to 0/1 matrix representation
- m_hasParam = hasParam == frame("true", rows=nrow(hasParam), cols=1)
- m_hasParam = as.matrix(m_hasParam)
- # compute the relevant index
- index = m_hasParam * seq(1, nrow(m_hasParam))
- index = as.scalar(removeEmpty(target = index, margin = "rows"))
- indexes[k] = index
- paramCount[k] = as.integer(as.scalar(hpList[index, 2]))
- applyFunc[1, k] = as.scalar(applyList[index, 1])
- }
+ [applyFunc, indexes, paramCount] = getParamMeta(pipeline, hpList)
+
+ hpList = hpList[, 3:ncol(hpList)]
+ DEFAULT_INDEX = 7
+ START_INDEX = 11 # value from where the hyper-params starts after skipping meta flags
+
# if there are no hyper-parameters than change the values of resources
# so that the pipeline is only executed once and no resource are wasted, saving looping
no_of_res = ifelse(sum(paramCount) > 0, no_of_res, 1)
@@ -398,49 +393,52 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege
index = as.scalar(indexes[i])
no_of_param = as.integer(as.scalar(paramCount[i]))
# extract hasY and verbose flags
- attachMask = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
- attachFD = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
- attachY = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
- isVerbose = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
- dataFlag = matrix(as.scalar(hpList[index, 7]), rows=no_of_res, cols=1)
-
+ attachMask = matrix(as.scalar(hpList[index, 2]), rows=no_of_res, cols=1)
+ attachFD = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
+ attachY = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
+ isVerbose = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
+ dataFlag = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
if(no_of_param > 0) {
paramIdx = START_INDEX
typeIdx = START_INDEX
OpParam = matrix(0, rows=no_of_res, cols=max(paramCount))
-
- for(j in 1:no_of_param) {
- type = as.scalar(hpList[index, typeIdx])
- paramValIndex = (no_of_param) + paramIdx
- minVal = as.scalar(hpList[index, paramValIndex])
- maxVal = as.scalar(hpList[index, paramValIndex + 1])
- if(type == "FP") {
- val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform");
- OpParam[, j] = val;
- }
- else if(type == "INT") {
- if(as.integer(maxVal) > no_of_res)
- val = sample(as.integer(maxVal), no_of_res, FALSE)
- else
- val = sample(as.integer(maxVal), no_of_res, TRUE)
- less_than_min = val < as.integer(minVal);
- val = (less_than_min * minVal) + val;
- OpParam[, j] = val;
- }
- else if(type == "BOOL") {
- if(maxVal == 1) {
- s = sample(2, no_of_res, TRUE);
- b = s - 1;
- OpParam[, j] = b;
- }
+ if(default) {
+ OpParam[1, 1:no_of_param] = as.matrix(hpList[index, DEFAULT_INDEX:DEFAULT_INDEX+(no_of_param - 1)])
+ }
+ else {
+ for(j in 1:no_of_param) {
+ type = as.scalar(hpList[index, typeIdx])
+ paramValIndex = (no_of_param) + paramIdx
+ minVal = as.scalar(hpList[index, paramValIndex])
+ maxVal = as.scalar(hpList[index, paramValIndex + 1])
+ if(type == "FP") {
+ val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform");
+ OpParam[, j] = val;
+ }
+ else if(type == "INT") {
+ if(as.integer(maxVal) > no_of_res)
+ val = sample(as.integer(maxVal), no_of_res, FALSE)
+ else
+ val = sample(as.integer(maxVal), no_of_res, TRUE)
+ less_than_min = val < as.integer(minVal);
+ val = (less_than_min * minVal) + val;
+ OpParam[, j] = val;
+ }
+ else if(type == "BOOL") {
+ if(maxVal == 1) {
+ s = sample(2, no_of_res, TRUE);
+ b = s - 1;
+ OpParam[, j] = b;
+ }
+ else
+ OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
+ }
else
- OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
- }
- else
- print("invalid data type") # TODO handle string set something like {,,}
+ print("invalid data type") # TODO handle string set something like {,,}
- paramIdx = paramIdx + 2
- typeIdx = typeIdx + 1
+ paramIdx = paramIdx + 2
+ typeIdx = typeIdx + 1
+ }
}
# hyper-parameter vector contains no. of hp, values of hp, and flag values
OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam, attachMask,
@@ -503,7 +501,7 @@ extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam,
# # add accuracy back
pipeline = cbind(as.frame(forDedup[, ncol(pipeline)+1]), pipeline)
hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
- pipeline = frameSort(pipeline, TRUE)
+ pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(pipeline) - 1)), TRUE)
# remove the row with accuracy less than test accuracy
@@ -686,7 +684,7 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang
if(as.scalar(pipList['flags']) != 0)
{
[trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp] = executePipeline(logical=as.frame(pipList['lp']), pipeline=as.frame(pipList['ph']),
- X=trainX, Y=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
+ Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
}
# print("test out: "+nrow(testy))
@@ -694,7 +692,9 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang
accuracyMatrix[i] = res[1, 1]
evalFunHp = res[, 2:ncol(res)]
}
+ print("----- cv mean accuracy ---")
accuracy = as.matrix(mean(accuracyMatrix))
+ print(toString(accuracy))
output = cbind(accuracy, evalFunHp)
}
@@ -720,3 +720,26 @@ return(Boolean execute)
}
execute = !(changeCount > 0)
}
+
+getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
+return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double] paramCount)
+{
+ indexes = matrix(0, rows= ncol(pipeline), cols=1)
+ paramCount = matrix(0, rows= ncol(pipeline), cols=1)
+ applyList = hpList[, 1]
+ applyFunc = pipeline
+ parfor(k in 1:ncol(pipeline))
+ {
+ op = as.scalar(pipeline[1,k])
+ hasParam = map(hpList[,2], "x->x.split(\",\")[0].equals(\""+op+"\")")
+ # convert the boolean vector to 0/1 matrix representation
+ m_hasParam = hasParam == frame("true", rows=nrow(hasParam), cols=1)
+ m_hasParam = as.matrix(m_hasParam)
+ # compute the relevant index
+ index = m_hasParam * seq(1, nrow(m_hasParam))
+ index = as.scalar(removeEmpty(target = index, margin = "rows"))
+ indexes[k] = index
+ paramCount[k] = as.integer(as.scalar(hpList[index, 3]))
+ applyFunc[1, k] = as.scalar(hpList[index, 1])
+ }
+}
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index ebb8f60..3e34be8 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -56,8 +56,10 @@
s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[String] pipeline, Matrix[Double] Xtrain, Matrix[Double] Ytrain,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
- return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest, Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+ return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
+ Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
{
+
mask=as.matrix(metaList['mask'])
FD = as.matrix(metaList['fd'])
applyFunc = as.frame(metaList['applyFunc'])
@@ -78,6 +80,7 @@ s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[Str
Xclone = Xtrain
XtestClone = Xtest
[hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op)
+ print("executing: ---------- "+toString(op))
if(executeFlag == 1) {
L = evalList(op, hp)
[L, O] = remove(L, 1);
diff --git a/scripts/builtin/fixInvalidLengthsApply.dml b/scripts/builtin/fixInvalidLengthsApply.dml
index 3a530ae..99ca0b6 100644
--- a/scripts/builtin/fixInvalidLengthsApply.dml
+++ b/scripts/builtin/fixInvalidLengthsApply.dml
@@ -25,7 +25,7 @@
# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------------------------------------------------
-# X Frame[Unknown] ---
+# X Frame[Unknown] ---
# mask Matrix[Double] ---
# ql Double 0.05
# qu Double 0.99
diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml
index aaf6cd3..2519198 100644
--- a/scripts/builtin/frameSort.dml
+++ b/scripts/builtin/frameSort.dml
@@ -26,7 +26,8 @@
# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------------------------------------------------
-# F Frame[String] --- Data frame of string values
+# F Frame[String] --- Data frame of string values
+# mask Matrix[Double] --- matrix for identifying string columns
# ----------------------------------------------------------------------------------------------------------------------
#
# OUTPUT:
@@ -36,12 +37,13 @@
# f_odered Frame[String] sorted dataset by column 1 in decreasing order
# ----------------------------------------------------------------------------------------------------------------------
-s_frameSort = function(Frame[String] F, Boolean orderDesc = TRUE )
+s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE )
return (Frame[String] f_odered)
{
- idx = matrix(1, 1, ncol(F))
- idx[1,1] = 0 # to save accuracy column from encoding
- index = vectorToCsv(idx)
+ # idx = matrix(1, 1, ncol(F))
+ # idx[1,1] = 0 # to save accuracy column from encoding
+ index = vectorToCsv(mask)
+ print("framesort index: "+toString(index))
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
[X, M] = transformencode(target=F, spec=jspecR);
diff --git a/scripts/builtin/frequencyEncodeApply.dml b/scripts/builtin/frequencyEncodeApply.dml
index 5146536..a7e6a67 100644
--- a/scripts/builtin/frequencyEncodeApply.dml
+++ b/scripts/builtin/frequencyEncodeApply.dml
@@ -27,9 +27,10 @@ return (Matrix[Double] X) {
if(sum(freqCount[i]) > 0)
{
Y = replace(target=X[, i], pattern=NaN, replacement=1)
- valueCount = freqCount[i, 1:max(Y)]
- resp = matrix(0, nrow(Y), max(Y))
- resp = (resp + t(seq(1, max(Y)))) == Y
+ idx = min(ncol(freqCount), max(Y))
+ valueCount = freqCount[i, 1:idx]
+ resp = matrix(0, nrow(Y), idx)
+ resp = (resp + t(seq(1, idx))) == Y
resp = resp * valueCount
X[, i] = rowSums(resp)
}
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index 946af13..45fd7be 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -57,7 +57,7 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
Frame[Unknown] parameters, Matrix[Double] cmr = matrix("4 0.7 1", rows=1, cols=3), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5,
- Integer resource_val = 20, Integer num_inst = 5, Double sample = 0.1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
+ Integer resource_val = 20, Integer num_inst = 5, Integer max_iter = 10, Double sample = 0.1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
return(Boolean perf)
# return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Frame[Unknown] bestLogical,
# Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp)
@@ -110,32 +110,25 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
[eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
- # # # create logical pipeline seeds
+ # # # create logical pipeline seeds
logicalSeedCI = frame([
- "4", "ED", "MVI", "OTLR", "EC", "0", "0", "0", "0",
- "5", "ED", "EC", "SCALE", "CI","DUMMY","0", "0", "0",
- "5", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0","0", "0",
- "8", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY",
- "5", "ED", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0",
- "4", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0", "0",
- "6", "ED", "MVI", "EC", "SCALE", "CI", "DUMMY", "0", "0",
- "6", "MVI", "OTLR","EC", "SCALE", "CI", "DUMMY", "0", "0",
- "7", "OTLR", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0",
- "7", "ED", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0"
- ], rows=10, cols=9)
+ "4", "ED", "MVI", "OTLR", "EC",
+ "2", "MVI", "DUMMY", "0","0",
+ "2", "OTLR", "DUMMY","0","0",
+ "2", "CI", "DUMMY","0","0",
+ "2", "SCALE", "DUMMY","0","0",
+ "2", "ED", "DUMMY","0","0",
+ "2", "EC", "DUMMY", "0","0"
+ ], rows=7, cols=5)
logicalSeedNoCI = frame([
- "3", "ED", "MVI", "OTLR", "EC", "0", "0", "0",
- "4", "ED", "EC", "SCALE", "DUMMY","0", "0", "0",
- "4", "OTLR", "EC", "SCALE", "DUMMY", "0","0", "0",
- "7", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "DUMMY",
- "4", "ED", "MVI", "SCALE", "DUMMY", "0", "0", "0",
- "3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0",
- "5", "ED", "MVI", "EC", "SCALE", "DUMMY", "0", "0",
- "5", "MVI", "OTLR","EC", "SCALE", "DUMMY", "0", "0",
- "6", "OTLR", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0",
- "6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0"
- ], rows=10, cols=8)
+ "4", "ED", "MVI", "OTLR", "EC",
+ "2", "MVI", "DUMMY", "0","0",
+ "2", "OTLR", "DUMMY","0","0",
+ "2", "SCALE", "DUMMY","0","0",
+ "2", "ED", "DUMMY","0","0",
+ "2", "EC", "DUMMY", "0","0"
+ ], rows=6, cols=5)
if(min(eYtrain) >= 1) {
tab = table(eYtrain, 1)
@@ -150,10 +143,10 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
category = logical[1, 2:idx]
print("-- Cleaning - Enum Logical Pipelines: ");
- [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, cmr=cmr,
- cat=category, population=logical[2:nrow(logical)], max_iter=ceil(resource_val/topK), metaList = metaList,
+ [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
+ cat=category, population=logical[2:nrow(logical),], max_iter=max_iter, metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
- num_inst=num_inst, num_exec=ceil(resource_val/topK), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
+ num_inst=num_inst, cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
# bestLogical = frame(["MVI", "OTLR", "DUMMY"], rows=1, cols=3)
topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
@@ -210,7 +203,7 @@ runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[
return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
{
if(cv)
- [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
+ Xtrain = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
else
{
# # # binding train and test to use same dictionary for both
diff --git a/scripts/pipelines/properties/param.csv b/scripts/pipelines/properties/param.csv
index 5998339..bee6a32 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -1,22 +1,22 @@
-applyName,name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,dt1,dt2,dt3,dt4,st1,en1,st2,en2,st3,en3,st4,en4
-outlierByIQRApply,outlierByIQR,3,0,0,0,1,0,FP,INT,INT,1,7,2,2,1,1,,,
-outlierBySdApply,outlierBySd,3,0,0,0,1,0,INT,INT,INT,1,7,1,2,2,1,,,
-winsorizeApply,winsorize,2,0,0,0,1,0,FP,FP,0.01,0.05,0.95,1,,,,,,
-normalizeApply,normalize,0,0,0,0,0,0,,,,,,,,,,,,
-imputeByMeanApply,imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,
-imputeByMedianApply,imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,
-miceApply,mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1,,,,,,
-,abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,,,,
-,flipLabels,2,0,0,1,1,2,FP,INT,0.6,0.9,1,20,,,,,,
-,SMOTE,1,1,0,1,1,2,INT,100,500,,,,,,,,,
-pca_predict,pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0,,,
-,ppca,4,0,0,0,1,2,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01
-fillDefaultApply,fillDefault,0,0,0,0,0,2,,,,,,,,,,,,
-dummycodingApply,dummycoding,0,1,0,0,0,2,,,,,,,,,,,,
-frequencyEncodeApply,frequencyEncode,0,1,0,0,0,2,,,,,,,,,,,,
-WoEApply,WoE,0,1,0,1,0,2,,,,,,,,,,,,
-scaleApply,scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,,,,
-forward_fill,forward_fill,1,0,0,0,1,2,BOOL,0,1,,,,,,,,,
-imputeByFdApply,imputeByFd,1,0,1,0,0,1,FP,0.6,0.9,,,,,,,,,
-,tomeklink,0,0,0,1,0,2,,,,,,,,,,,,
-,underSampling,1,0,0,1,0,2,FP,0.1,0.6,,,,,,,,,
+applyName,name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,default1,default2,default3,default4,dt1,dt2,dt3,dt4,st1,en1,st2,en2,st3,en3,st4,en4
+outlierByIQRApply,outlierByIQR,3,0,0,0,1,0,1.5,2,1,,FP,INT,INT,1,7,2,2,1,1,,,
+outlierBySdApply,outlierBySd,3,0,0,0,1,0,3,2,1,,INT,INT,INT,1,7,1,2,2,1,,,
+winsorizeApply,winsorize,2,0,0,0,1,0,0.05,0.95,,,FP,FP,0.01,0.05,0.95,1,,,,,,
+normalizeApply,normalize,0,0,0,0,0,0,,,,,,,,,,,,,,,,
+imputeByMeanApply,imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+imputeByMedianApply,imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+miceApply,mice,2,1,0,0,1,2,3,0.9,,,INT,FP,1,3,0.5,1,,,,,,
+,abstain,1,0,0,1,1,2,0.75,,,,FP,0.6,0.8,,,,,,,,,
+,flipLabels,2,0,0,1,1,2,0.75,5,,,FP,INT,0.6,0.9,1,20,,,,,,
+,SMOTE,1,1,0,1,1,2,200,,,,INT,100,500,,,,,,,,,
+pca_predict,pca,3,0,0,0,0,2,10,1,0,,INT,BOOL,BOOL,100,200,0,1,0,0,,,
+,ppca,4,0,0,0,1,2,5,10,0.000001,0.02,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01
+fillDefaultApply,fillDefault,0,0,0,0,0,2,,,,,,,,,,,,,,,,
+dummycodingApply,dummycoding,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+frequencyEncodeApply,frequencyEncode,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+WoEApply,WoE,0,1,0,1,0,2,,,,,,,,,,,,,,,,
+scaleApply,scale,2,0,0,0,0,0,1,0,,,BOOL,BOOL,0,1,0,1,,,,,,
+forward_fill,forward_fill,1,0,0,0,1,2,1,,,,BOOL,0,1,,,,,,,,,
+imputeByFdApply,imputeByFd,1,0,1,0,0,1,0.8,,,,FP,0.6,0.9,,,,,,,,,
+,tomeklink,0,0,0,1,0,2,,,,,,,,,,,,,,,,
+,underSampling,1,0,0,1,0,2,0.2,,,,FP,0.1,0.6,,,,,,,,,
diff --git a/scripts/pipelines/properties/properties.csv b/scripts/pipelines/properties/properties.csv
deleted file mode 100644
index e2d9f1a..0000000
--- a/scripts/pipelines/properties/properties.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-outlierByIQR,3,1.5,1,3.5,1,1,2,20,10,40
-outlierBySd,3,1,1,3,1,1,2,20,10,40
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 66b2310..7bd11d6 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -52,19 +52,20 @@
source("scripts/builtin/bandit.dml") as bandit;
enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest,
- Matrix[Double] cmr, Frame[Unknown] cat, Frame[Unknown] population, Integer max_iter=10, List[Unknown] metaList,
+ Frame[Unknown] cat, Frame[Unknown] population, Integer max_iter=10, List[Unknown] metaList,
String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] primitives, Frame[Unknown] param,
- Integer num_inst, Integer num_exec, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----"))
+ Integer num_inst, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----"))
return (Frame[Unknown] bestLg, Double pre_best)
{
+
+ num_exec = 1
prefix = as.scalar(ctx["prefix"]);
bestLg = as.frame("")
best_score = 0.0
pre_best = 0.0
iter = 1
- convergedOuter = FALSE
- while(iter <= max_iter & !convergedOuter)
+ while(as.scalar(population[1, 1]) > 0 & iter < max_iter)
{
print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" );
physicalPipList = list();
@@ -72,40 +73,49 @@ return (Frame[Unknown] bestLg, Double pre_best)
# get the physical instances from logical ones
# unrolled by physical pipelines
- ppos = list();
+ max_confR = 0
+ max_confC = 0
+ start = 1;
+ end = 0;
for(i in 1:nrow(population)) {
lv = as.integer(as.scalar(population[i, 1])) + 1
lp = population[i, 2:lv]
- pconf = bandit::get_physical_configurations(lp, num_inst, primitives)
- for(j in 1:nrow(pconf)) {
- physicalPipList = append(physicalPipList, pconf[j,]);
- ppos = append(ppos, i)
- }
+ pconf = bandit::get_physical_configurations(lp, 0, primitives)
+ max_confR = ifelse(max_confR < nrow(pconf), nrow(pconf), max_confR)
+ max_confC = ifelse(max_confC < ncol(pconf), ncol(pconf), max_confC)
+ physicalPipList = append(physicalPipList, pconf);
logicalPipList = append(logicalPipList, lp);
+
}
-
+ # print("pipeline Frame: "+toString(pipelineFrame))
# # # execute the physical pipelines
- scores = matrix(0, nrow(physicalPipList), 1)
- parfor(i in 1:length(physicalPipList)) {
- lp2 = as.frame(logicalPipList[as.scalar(ppos[i]),])
+ scores = matrix(0, rows=nrow(population) * max_confR, cols=2)
+ start = 1;
+ end = 0;
+ pipelineFrame = frame(0, rows=length(physicalPipList) * max_confR, cols=max_confC)
+ parfor(i in 1:length(physicalPipList), check=0) {
+ lp2 = as.frame(logicalPipList[i,])
pp2 = as.frame(physicalPipList[i,])
# # append configuration keys for extracting the pipeline later on
id = seq(1, nrow(pp2))
idpp = cbind(as.frame(id), pp2)
# # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times
[outPip, outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp2, idpp, num_exec, X, y, Xtest, ytest, metaList,
- evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, FALSE)
+ evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, TRUE)
# # sort the configurations groupwise
- max_perf = bandit::getMaxPerConf(outPip, nrow(pp2))
- scores[i,1] = as.matrix(max_perf[1,1])
+ end = end + nrow(outPip)
+ scores[start:end, 1] = outPip[, 1]
+ scores[start:end, 2] = matrix(i, rows=nrow(outPip), cols=1)
+ start = end + 1
}
-
+
# # select parents and best score
- selected = order(target = scores, by = 1, decreasing=TRUE, index.return=TRUE)
+ selected = order(target = scores[, 1], by = 1, decreasing=TRUE, index.return=TRUE)
idxR = as.scalar(selected[1,1])
- best_score = as.scalar(scores[idxR])
- converged = pre_best > best_score
- convergedOuter = converged
+ best_score = as.scalar(scores[idxR, 1])
+ converged = pre_best >= best_score
+ print("best score: "+best_score)
+ print("pre score: "+pre_best)
if(converged & (iter > 1)) {
print(prefix+"EnumLP: converged after "+iter+" iteration(s)")
print(prefix+"EnumLP: best score " + pre_best)
@@ -113,38 +123,30 @@ return (Frame[Unknown] bestLg, Double pre_best)
}
else {
pre_best = best_score
- idxR2 = as.scalar(ppos[idxR]) #logical pipeline ID
- idxC = as.integer(as.scalar(population[idxR2, 1])) + 1
- bestLg = population[idxR2, 2:idxC]
+ bestLg = as.frame(logicalPipList[as.scalar(scores[idxR, 2])])
+ print("best logical: "+toString(bestLg))
}
- pipLength = max(as.matrix(population[, 1])) + as.scalar(cmr[1, 1]) + 3
+ pipLength = 10
# # # if new best is not better than pre_best then no need od generating new population
- children = frame(0, rows=ceil(nrow(scores)/2), cols=pipLength)
+ children = frame(0, rows=ceil(nrow(population)/2), cols=pipLength)
i = 1
-
- while(i <= ceil(nrow(scores)/2) & !converged) {
- top = population[as.scalar(ppos[as.scalar(selected[i])]), ]
- length_top = as.integer(as.scalar(top[1, 1]))
- top = top[, 2:(length_top+1)]
-
+ while(i <= ceil(nrow(population)/2) & ncol(population) < pipLength - 1) {
+ idxR = as.scalar(selected[i,1])
+ top = as.frame(logicalPipList[as.scalar(scores[idxR, 2])])
+ length_top = ncol(top)
# generate children from crossover
- c1 = addition(top, cat, as.scalar(cmr[1,1]))
-
- # perform mutation
- c1 = mutation(c1, as.scalar(cmr[1, 2]))
-
- # perform removal if non-zero
- c1 = removal(c1, as.scalar(cmr[1, 3]))
+ c1 = addition(top, cat, 1) #i%%(pipLength-1)
# # # append length of pipeline and pipeline in frame
children[i, 1] = ncol(c1)
children[i, 2:(ncol(c1) + 1)] = c1
+
i = i + 1
}
population = children
iter = iter + 1
}
- if(pre_best == best_score) {
+ if(pre_best < best_score) {
print(prefix+" EnumLP did not converge after "+max_iter+" iterations")
}
}
@@ -156,7 +158,7 @@ return (Frame [Unknown] child)
for(i in 1:addCount)
{
c = as.scalar(sample(ncol(allOps), 1))
- place_to_add = as.scalar(sample(ncol(top)-2, 1))
+ place_to_add = as.scalar(sample(ncol(top)+2, 1))
if(place_to_add == 1)
child = cbind(allOps[1, c], top)
else if(place_to_add >= ncol(top))
@@ -167,41 +169,17 @@ return (Frame [Unknown] child)
end = top[, place_to_add+1:ncol(top)]
child = cbind(cbind(start, allOps[1, c]), end)
}
+ top = child
}
-}
-
-mutation = function(Frame[Unknown] child, Double mutationRate)
-return (Frame [Unknown] mChild)
-{
- random = as.scalar(rand(rows=1, cols=1))
- if(random > mutationRate & ncol(child) >= 3)
+ hasDummy = map(child, "x -> x.equals(\"DUMMY\")")
+ hasDummy = as.matrix(hasDummy == frame("true", rows=1, cols=ncol(hasDummy)))
+ if(sum(hasDummy) > 0 & as.scalar(hasDummy[1, ncol(hasDummy)]) != 1)
{
- r = sample(ncol(child) - 2, 2)
- r1 = as.scalar(r[1,1])
- r2 = as.scalar(r[2,1])
- temp = child[1, r1]
- child[1, r1] = child[1, r2]
- child[1, r2] = temp
+ # place the dummycode in last
+ idx = as.scalar(removeEmpty(target = hasDummy*t(seq(1, ncol(hasDummy))), margin = "cols"))
+ tmp = child[1, idx]
+ child[1, idx] = child[1, ncol(child)]
+ child[1, ncol(child)] = tmp
}
- mChild = child
}
-removal = function(Frame[Unknown] child, Integer removal)
-return (Frame[Unknown] output)
-{
- if(ncol(child) > 2 & (ncol(child)-2) > removal & removal > 0)
- {
- for(i in 1:removal)
- {
- idx = as.scalar(sample(ncol(child)-3, 1))
- if(idx == 1)
- ch = child[, 2:ncol(child)]
- else if (idx == ncol(child))
- ch = child[, 1:ncol(child)-1]
- else
- ch = cbind(child[, 1:(idx-1)], child[,(idx+1):ncol(child)])
- child = ch
- }
- }
- output = child
-}
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 0658abc..6e0a28d 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -160,26 +160,40 @@ return(Frame[Unknown] train, Frame[Unknown] test, Matrix[Double] M)
print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
[train, mask, qlow, qup] = fixInvalidLengths(train, mask, q0, q1)
- test = fixInvalidLengthsApply(test, mask, qlow, qup)
# step 2 fix swap values
print(prefix+" value swap fixing");
train = valueSwap(train, schema)
- test = valueSwap(test, schema)
+ if(length(test) > 0)
+
# step 3 drop invalid types
print(prefix+" drop values with type mismatch");
train = dropInvalidType(train, schema)
- test = dropInvalidType(test, schema)
+
# step 4 do the case transformations
print(prefix+" convert strings to lower case");
train = map(train, "x -> x.toLowerCase()")
- test = map(test, "x -> x.toLowerCase()")
- # step 5 typo correction
+
+ # step 5 porter stemming on all features
+ print(prefix+" porter-stemming on all features");
+ train = map(train, "x -> PorterStemmer.stem(x)", 0)
+
+
+ if(length(test) > 0)
+ {
+ test = fixInvalidLengthsApply(test, mask, qlow, qup)
+ test = valueSwap(test, schema)
+ test = dropInvalidType(test, schema)
+ test = map(test, "x -> x.toLowerCase()")
+ test = map(test, "x -> PorterStemmer.stem(x)", 0)
+ }
+
+ # step 6 typo correction
if(CorrectTypos)
{
print(prefix+" correct typos in strings");
@@ -187,13 +201,10 @@ return(Frame[Unknown] train, Frame[Unknown] test, Matrix[Double] M)
for(i in 1:ncol(schema))
if(as.scalar(schema[1,i]) == "STRING") {
[train[, i], ft, dt, dm, fr] = correctTypos(train[, i], 0.2, 0.9, FALSE);
- test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
+ if(length(test) > 0)
+ test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
}
}
- # step 6 porter stemming on all features
- print(prefix+" porter-stemming on all features");
- train = map(train, "x -> PorterStemmer.stem(x)", 0)
- test = map(test, "x -> PorterStemmer.stem(x)", 0)
# TODO add deduplication
print(prefix+" deduplication via entity resolution");
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index a26cb5e..3b7a684 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -38,7 +38,7 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
private static final String PARAM_DIR = "./scripts/pipelines/properties/";
private final static String PARAM = PARAM_DIR + "param.csv";
- private final static String PRIMITIVES = PARAM_DIR + "testPrimitives.csv";
+ private final static String PRIMITIVES = PARAM_DIR + "primitives.csv";
@Override
public void setUp() {
@@ -79,8 +79,8 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[] { "-stats", "-exec", "singlenode", "-nvargs", "dirtyData="+data,
- "metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "num_inst=5",
- "sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")};
+ "metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "num_inst=0",
+ "max_iter="+3, "sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
index f2e873c..71160b7 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
@@ -25,6 +25,7 @@ import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Assert;
import org.junit.Ignore;
+import org.junit.Test;
public class BuiltinTopkEvaluateTest extends AutomatedTestBase {
// private final static String TEST_NAME1 = "prioritized";
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
index 087feab..b47a6b4 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
@@ -65,7 +65,7 @@ public class BuiltinTopkLogicalTest extends AutomatedTestBase {
private void runTestLogical(int max_iter, int num_inst, int num_exec, Types.ExecMode et) {
- setOutputBuffering(true);
+// setOutputBuffering(true);
String HOME = SCRIPT_DIR+"functions/pipelines/" ;
Types.ExecMode modeOld = setExecMode(et);
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index d323902..30f196d 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-90.990990990991
-90.990990990991
-90.990990990991
+90.09009009009009
+89.1891891891892
+89.1891891891892
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 791a2c4..3102ff5 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-32.0,2.0,0.04942508842239585,0.9690338275332404,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.6041981259130369,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.025181804564039616,0.9961713994683723,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.5749065843221863,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.030109393540493433,0.9774428031375582,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.27268133865163424,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+18.0,3.0,1.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0
+16.0,2.0,0.011239685157868542,0.9882169781390451,0,0,0,1.0,0,0,0,0,1.0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0
+16.0,2.0,0.031106506106547423,0.9916418186198904,0,0,0,1.0,0,0,0,0,1.0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
index 549517c..1dd9f30 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
@@ -1 +1 @@
-OTLR,SCALE,CI,DUMMY
+OTLR,DUMMY
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index bceea73..416f64a 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-winsorize,scale,abstain,dummycoding
-winsorize,scale,underSampling,dummycoding
-winsorize,scale,underSampling,dummycoding
+2.0,outlierBySd,frequencyEncode,outlierBySdApply,frequencyEncodeApply
+2.0,winsorize,WoE,winsorizeApply,WoEApply
+2.0,winsorize,WoE,winsorizeApply,WoEApply
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index 3213cdd..a8ac1cb 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -61,7 +61,7 @@ else
eY = eX[, ncol(eX)]
eX = eX[, 1:ncol(eX) - 1]
-print("y classes \n"+toString(table(eY, 1)))
+
getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
@@ -69,29 +69,24 @@ getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"))
logical = frame([
- "6", "MVI", "OTLR", "ED", "EC", "CI", "DUMMY",
- "4", "ED", "MVI", "CI", "DUMMY", "0", "0"
- ], rows=2, cols=7)
-
-
+ "2", "MVI", "DUMMY",
+ "2", "ED", "DUMMY",
+ "2", "OTLR", "DUMMY",
+ "2", "EC", "DUMMY"
+ ], rows=4, cols=3)
categories = frame(["ED", "MVI", "OTLR", "EC"], rows=1, cols=4)
-cmr = matrix("4 0.7 1", rows=1, cols=3)
+
# doing holdout evaluation
[trainX, trainY, testX, testY] = splitBalanced(eX, eY, trainTestSplit, FALSE)
-# split = nrow(eX) * trainTestSplit
-# trainX = eX[1:split,]
-# trainY = eY[1:split,]
-# testX = eX[split+1:nrow(eX),]
-# testY = eY[split+1:nrow(eY),]
-[bestLogical, score] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY, cmr=cmr,
- cat=categories, population=logical, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
+[bestLogical, score] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY, cat=categories,
+ population=logical, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives, param=param,
- num_inst=num_inst, num_exec=num_exec, cv=FALSE, verbose=TRUE)
+ num_inst=num_inst, cv=FALSE, verbose=TRUE)
print("score of pipeline: "+toString(score))
print("bestLogical "+toString(bestLogical))
@@ -101,9 +96,6 @@ print("result satisfied ------------"+result)
write(result , $O)
-
-# UDF for evaluation
-# choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
# UDF for evaluation
# choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 356ae22..18a725a 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -33,6 +33,7 @@ topK = $topk
resources = $rv
num_inst=$num_inst
sample=$sample
+max_iter=$max_iter
output=$output
testCV = as.logical($testCV)
cvk = as.integer($cvk)
@@ -42,7 +43,7 @@ split = nrow(F) * trainTestSplit
if(testCV) {
trainData = F
- testData = as.frame("0")
+ testData = frame("", rows=0, cols=0)
}
else {
@@ -58,8 +59,8 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)]
# [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] =
result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param,
- cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),
- topK=topK, resource_val=resources, num_inst=num_inst, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
+ evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources,
+ num_inst=num_inst, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
write(result, $O)
diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
index 7682ae4..aef89e5 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
@@ -39,7 +39,7 @@ split = nrow(F) * trainTestSplit
evalFunc = "evalRegression"
if(testCV) {
trainData = F[1:split,]
- testData = as.frame("0")
+ testData = frame("", rows=0, cols=0)
}
else {
trainData = F[1:split,]
@@ -49,7 +49,7 @@ else {
# # # split in train/test 70/30
result = topk_cleaning(dataTrain=trainData, dataTest=testData,
- primitives=primitives, parameters=param, cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4),
+ primitives=primitives, parameters=param, evaluationFunc=evalFunc, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4),
topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)