Posted to commits@systemds.apache.org by ss...@apache.org on 2022/01/12 15:38:07 UTC

[systemds] branch main updated: [MINOR] Minor changes in logical enumeration - This commit introduces the default parameter values for the cleaning primitives - The logical enumeration now uses only the default parameter values to evaluate the primitives and iteratively adds the categories to the logical pipelines until convergence. - The output is a single best logical pipeline

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 08944a7  [MINOR] Minor changes in logical enumeration   - This commit introduces the default parameter values for the cleaning primitives   - The logical enumeration now uses only the default parameter values to evaluate the primitives and iteratively adds the categories to the logical pipelines until convergence.   - The output is a single best logical pipeline
08944a7 is described below

commit 08944a7305cbc4f4d9cbbd4565efa8bcc93b82e3
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Wed Jan 12 16:36:58 2022 +0100

    [MINOR] Minor changes in logical enumeration
      - This commit introduces the default parameter values for the cleaning primitives
      - The logical enumeration now uses only the default parameter values to evaluate the
         primitives and iteratively adds the categories to the logical pipelines until convergence.
      - The output is a single best logical pipeline
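
    For illustration, a minimal DML sketch of the converge-by-iteration idea described above.
    It is not the code added by this commit: scoreLogical is a hypothetical stand-in for
    evaluating a logical pipeline's physical instances with only the default primitive
    parameter values (the commit fixes num_exec = 1 in enumerateLogical.dml for this), and
    the seed pipeline is an assumed example; operator codes (ED, MVI, OTLR, EC, DUMMY) are
    the category names used by the scripts.

        # sketch: grow the logical pipeline by one category per iteration and stop
        # once the score no longer improves over the previous iteration
        scoreLogical = function(Frame[String] lp) return (Double score) {
          # placeholder for bandit::run_with_hyperparam with default=TRUE
          score = as.scalar(rand(rows=1, cols=1))
        }

        categories = frame(["ED", "MVI", "OTLR", "EC"], rows=1, cols=4)
        bestLg = frame(["MVI", "DUMMY"], rows=1, cols=2)  # assumed seed pipeline
        pre_best = 0.0
        converged = FALSE
        iter = 1
        max_iter = 10
        while(iter < max_iter & !converged) {
          best_score = scoreLogical(bestLg)
          converged = (pre_best >= best_score) & (iter > 1)
          if(!converged) {
            pre_best = best_score
            # add one more category operator, keeping DUMMY as the last step
            c = as.scalar(sample(ncol(categories), 1))
            bestLg = cbind(cbind(bestLg[, 1:ncol(bestLg)-1], categories[1, c]), bestLg[, ncol(bestLg)])
          }
          iter = iter + 1
        }
        print("single best logical pipeline: "+toString(bestLg))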
---
 scripts/builtin/applyAndEvaluate.dml               |  19 +--
 scripts/builtin/bandit.dml                         | 157 ++++++++++++---------
 scripts/builtin/executePipeline.dml                |   5 +-
 scripts/builtin/fixInvalidLengthsApply.dml         |   2 +-
 scripts/builtin/frameSort.dml                      |  12 +-
 scripts/builtin/frequencyEncodeApply.dml           |   7 +-
 scripts/builtin/topk_cleaning.dml                  |  49 +++----
 scripts/pipelines/properties/param.csv             |  44 +++---
 scripts/pipelines/properties/properties.csv        |   2 -
 scripts/pipelines/scripts/enumerateLogical.dml     | 128 +++++++----------
 scripts/pipelines/scripts/utils.dml                |  31 ++--
 .../BuiltinTopkCleaningClassificationTest.java     |   6 +-
 .../pipelines/BuiltinTopkEvaluateTest.java         |   1 +
 .../pipelines/BuiltinTopkLogicalTest.java          |   2 +-
 .../intermediates/classification/bestAcc.csv       |   6 +-
 .../pipelines/intermediates/classification/hp.csv  |   6 +-
 .../pipelines/intermediates/classification/lp.csv  |   2 +-
 .../pipelines/intermediates/classification/pip.csv |   6 +-
 .../functions/pipelines/topkLogicalTest.dml        |  28 ++--
 .../pipelines/topkcleaningClassificationTest.dml   |   7 +-
 .../pipelines/topkcleaningRegressionTest.dml       |   4 +-
 21 files changed, 264 insertions(+), 260 deletions(-)

diff --git a/scripts/builtin/applyAndEvaluate.dml b/scripts/builtin/applyAndEvaluate.dml
index 96e199d..e82fa79 100644
--- a/scripts/builtin/applyAndEvaluate.dml
+++ b/scripts/builtin/applyAndEvaluate.dml
@@ -54,13 +54,16 @@ s_applyAndEvaluate = function(Frame[Unknown] trainData, Frame[Unknown] testData,
   Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
 return (Matrix[Double] result)
 {
+  print("logical: "+toString(lp))
   no_of_flag_vars = 5
   schema = metaData[1, 1:ncol(metaData) - 1]
   mask = as.matrix(metaData[2, 1:ncol(metaData) - 1])
   fdMask = as.matrix(metaData[3, 1:ncol(metaData) - 1])
   maskY = as.integer(as.scalar(metaData[2, ncol(metaData)]))
-  metaList = list(mask=mask, schema=schema, fd=fdMask)
-
+  idx = as.scalar(pip[, 1]) + 1
+  metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=pip[, (idx+1):ncol(pip)])
+  pip = pip[, 2:idx]
+  ctx = list(prefix="----"); #TODO include seed
   # separate the label
   [Xtrain, Ytrain] = getLabel(trainData, isLastLabel)
   [Xtest, Ytest] = getLabel(testData, isLastLabel)
@@ -77,7 +80,7 @@ return (Matrix[Double] result)
   }
     # # # when the evaluation function is called first we also compute and keep hyperparams of target application
   dirtyScore = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp)
-  [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos)
+  [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)
   
   # # # if mask has 1s then there are categorical features
   [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, FALSE, "recode")
@@ -93,7 +96,7 @@ return (Matrix[Double] result)
   pipList = list(lp = lp, ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
   # argList = list(X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, Xorig=clone_X, pipList=pipList, metaList=metaList, evalFunHp=evalFunHp, trainML=0)
   # # # now test accuracy
-  [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList,
+  [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList,
     hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
   
   if(max(eYtrain) == min(eYtrain)) 
@@ -113,17 +116,15 @@ return (Matrix[Double] result)
 }
 
 runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[String] schema,
-  Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE)
+  Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE, List[Unknown] ctx)
 return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
 {
   if(cv)
-    Xtrain = utils::stringProcessing(data=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos)
+    [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
   else
   {
     # # # binding train and test to use same dictionary for both
-    XAll = utils::stringProcessing(data=rbind(Xtrain, Xtest), mask=mask, schema=schema, CorrectTypos=correctTypos)
-    Xtrain = XAll[1:nrow(Xtrain),]
-    Xtest = XAll[nrow(Xtrain)+1:nrow(XAll),]
+    [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtest, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
   }
 }
 
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 15fa734..92b90c3 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -115,8 +115,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
       }
       
       configurations = configurations[1:n_i, ]
-      [outPip,outHp, f] = run_with_hyperparam(lp, configurations, r_i, X_train, Y_train, X_test, Y_test, metaList,
-        evaluationFunc, evalFunHp, param, feaFrameOuter, cv, cvk, verbose)
+      [outPip,outHp, f] = run_with_hyperparam(lp=lp, ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
+        evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, featureFrameOuter=feaFrameOuter, cv=cv, cvk=cvk)
       # sort the pipelines by order of accuracy decreasing
       a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
       b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -130,7 +130,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
 
       # sort the configurations for successive halving
       avergae_perf =  getMaxPerConf(outPip, nrow(configurations)) 
-      configurations = frameSort(cbind(avergae_perf, configurations), TRUE)
+      sortMask = matrix(1, rows=1, cols=ncol(configurations))
+      configurations = frameSort(cbind(avergae_perf, configurations), cbind(as.matrix(0), sortMask), TRUE)
       configurations = configurations[, 2:ncol(configurations)]
     }
     bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows")
@@ -154,10 +155,16 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
   [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, baseLineScore, k)
 
   bestAccuracy = as.matrix(bestPipeline[,1])
-  bestPipeline = bestPipeline[,2:ncol(bestPipeline)]
   bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)]
   imp = as.double(as.scalar(bestAccuracy[1, 1])) - as.double(baseLineScore)
   perf = imp > 0
+  applyFunc = bestPipeline[, 2:ncol(bestPipeline)]
+  for(k in 1:nrow(bestPipeline))
+  {
+    applyFunc[k, ] = getParamMeta(bestPipeline[k, 2:ncol(bestPipeline)], param)
+    bestPipeline[k, 1] = as.frame(ncol(bestPipeline) - 1)
+  }
+  bestPipeline = cbind(bestPipeline, applyFunc)
   if(verbose) {
     print("dirty accuracy "+toString(baseLineScore))  
     print("best logical pipelines \n"+toString(lp))  
@@ -177,7 +184,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
 }
 
 # this method will extract the physical pipelines for a given logical pipelines
-get_physical_configurations = function(Frame[String] logical, Scalar[int] numConfigs = 0, 
+get_physical_configurations = function(Frame[String] logical, Scalar[int] numConfigs = 10, 
   Frame[Unknown] primitives)
   return(Frame[String] physical, Double min)
 {
@@ -260,7 +267,7 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
 # this method will call the execute pipelines with their hyper-parameters
 run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y,
   Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
-  Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv,  Integer cvk = 2, Boolean verbose)
+  Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv,  Integer cvk = 2, Boolean default = FALSE)
   return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Frame[Unknown] featureFrameOuter)
 {
   output_hp = matrix(0, nrow(ph_pip)*r_i, ncol(lp) * 5 * 3)
@@ -280,7 +287,7 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
   for(i in 1:nrow(ph_pip))
   {
     # execute configurations with r resources
-    [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
+    [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i, default)
     if(ncol(featureFrameOuter) > 1)
       feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
     pip_toString = pipToString(ph_pip[i])
@@ -355,35 +362,23 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
 }
 
 # extract the hyper-parameters for pipelines
-getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Integer no_of_res)
+getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Integer no_of_res, Boolean default)
   return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer no_of_res, Integer NUM_META_FLAGS)
 {
 
   allParam = 0;
-  START_INDEX = 8 # value from where the hyper-params starts after skipping meta flags
   NUM_META_FLAGS = 5
+  NUM_DEFAULT_VALUES = 4
+
   # load the hyper-parameters values
   paramList = list()
   # store the row indexes of the operator matches
-  indexes = matrix(0, rows= ncol(pipeline), cols=1)
-  paramCount = matrix(0, rows= ncol(pipeline), cols=1)
-  applyList = hpList[, 1]
-  hpList = hpList[, 2:ncol(hpList)]
-  applyFunc = pipeline
-  parfor(k in 1:ncol(pipeline))
-  {
-    op = as.scalar(pipeline[1,k])
-    hasParam = map(hpList[,1], "x->x.split(\",\")[0].equals(\""+op+"\")")
-    # convert the boolean vector to 0/1 matrix representation
-    m_hasParam = hasParam == frame("true", rows=nrow(hasParam), cols=1)
-    m_hasParam = as.matrix(m_hasParam)
-    # compute the relevant index
-    index = m_hasParam * seq(1, nrow(m_hasParam))
-    index = as.scalar(removeEmpty(target = index, margin = "rows"))
-    indexes[k] = index
-    paramCount[k] = as.integer(as.scalar(hpList[index, 2]))
-    applyFunc[1, k] = as.scalar(applyList[index, 1])
-  }
+  [applyFunc, indexes, paramCount] = getParamMeta(pipeline, hpList)
+
+  hpList = hpList[, 3:ncol(hpList)]
+  DEFAULT_INDEX = 7
+  START_INDEX = 11 # value from where the hyper-params starts after skipping meta flags
+
   # if there are no hyper-parameters than change the values of resources
   # so that the pipeline is only executed once and no resource are wasted, saving looping
   no_of_res = ifelse(sum(paramCount) > 0, no_of_res, 1)
@@ -398,49 +393,52 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Intege
     index = as.scalar(indexes[i])
     no_of_param = as.integer(as.scalar(paramCount[i]))
     # extract hasY and verbose flags
-    attachMask = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
-    attachFD = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
-    attachY = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
-    isVerbose = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
-    dataFlag = matrix(as.scalar(hpList[index, 7]), rows=no_of_res, cols=1)
-    
+    attachMask = matrix(as.scalar(hpList[index, 2]), rows=no_of_res, cols=1)
+    attachFD = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
+    attachY = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
+    isVerbose = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
+    dataFlag = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
     if(no_of_param > 0) {
       paramIdx = START_INDEX
       typeIdx = START_INDEX
       OpParam = matrix(0, rows=no_of_res, cols=max(paramCount))
-      
-      for(j in 1:no_of_param) {
-        type = as.scalar(hpList[index, typeIdx])
-        paramValIndex = (no_of_param) + paramIdx
-        minVal =  as.scalar(hpList[index, paramValIndex])
-        maxVal = as.scalar(hpList[index, paramValIndex + 1])
-        if(type == "FP") {
-          val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform");
-          OpParam[, j] = val;
-        }
-        else if(type == "INT") {
-          if(as.integer(maxVal) > no_of_res)
-            val = sample(as.integer(maxVal), no_of_res, FALSE)
-          else 
-            val = sample(as.integer(maxVal), no_of_res, TRUE)
-          less_than_min = val < as.integer(minVal);
-          val = (less_than_min * minVal) + val;
-          OpParam[, j] = val;
-        }
-        else if(type == "BOOL") {
-          if(maxVal == 1) {
-            s = sample(2, no_of_res, TRUE);
-            b = s - 1;
-            OpParam[, j] = b;
-          } 
+      if(default) {
+        OpParam[1, 1:no_of_param] = as.matrix(hpList[index, DEFAULT_INDEX:DEFAULT_INDEX+(no_of_param - 1)])
+      }
+      else {
+        for(j in 1:no_of_param) {
+          type = as.scalar(hpList[index, typeIdx])
+          paramValIndex = (no_of_param) + paramIdx
+          minVal =  as.scalar(hpList[index, paramValIndex])
+          maxVal = as.scalar(hpList[index, paramValIndex + 1])
+          if(type == "FP") {
+            val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform");
+            OpParam[, j] = val;
+          }
+          else if(type == "INT") {
+            if(as.integer(maxVal) > no_of_res)
+              val = sample(as.integer(maxVal), no_of_res, FALSE)
+            else 
+              val = sample(as.integer(maxVal), no_of_res, TRUE)
+            less_than_min = val < as.integer(minVal);
+            val = (less_than_min * minVal) + val;
+            OpParam[, j] = val;
+          }
+          else if(type == "BOOL") {
+            if(maxVal == 1) {
+              s = sample(2, no_of_res, TRUE);
+              b = s - 1;
+              OpParam[, j] = b;
+            } 
+            else
+              OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
+          }
           else
-            OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
-        }
-        else
-          print("invalid data type")  # TODO handle string set something like {,,}
+            print("invalid data type")  # TODO handle string set something like {,,}
           
-        paramIdx = paramIdx + 2
-        typeIdx = typeIdx + 1
+          paramIdx = paramIdx + 2
+          typeIdx = typeIdx + 1
+        }
       }
       # hyper-parameter vector contains no. of hp, values of hp, and flag values
       OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam, attachMask,
@@ -503,7 +501,7 @@ extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam,
   # # add accuracy back
   pipeline = cbind(as.frame(forDedup[, ncol(pipeline)+1]), pipeline)
   hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
-  pipeline = frameSort(pipeline, TRUE)
+  pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(pipeline) - 1)), TRUE)
 
 
   # remove the row with accuracy less than test accuracy 
@@ -686,7 +684,7 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang
     if(as.scalar(pipList['flags']) != 0)
     {
       [trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp] = executePipeline(logical=as.frame(pipList['lp']), pipeline=as.frame(pipList['ph']),
-        X=trainX, Y=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
+        Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
         changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
     }
     # print("test out: "+nrow(testy))
@@ -694,7 +692,9 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang
     accuracyMatrix[i] = res[1, 1]
     evalFunHp = res[, 2:ncol(res)]
   }
+  print("----- cv mean accuracy ---")
   accuracy = as.matrix(mean(accuracyMatrix))
+  print(toString(accuracy))
   output = cbind(accuracy, evalFunHp)
 }
 
@@ -720,3 +720,26 @@ return(Boolean execute)
   }
   execute = !(changeCount > 0)
 }
+
+getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
+return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double] paramCount)
+{
+  indexes = matrix(0, rows= ncol(pipeline), cols=1)
+  paramCount = matrix(0, rows= ncol(pipeline), cols=1)
+  applyList = hpList[, 1]
+  applyFunc = pipeline
+  parfor(k in 1:ncol(pipeline))
+  {
+    op = as.scalar(pipeline[1,k])
+    hasParam = map(hpList[,2], "x->x.split(\",\")[0].equals(\""+op+"\")")
+    # convert the boolean vector to 0/1 matrix representation
+    m_hasParam = hasParam == frame("true", rows=nrow(hasParam), cols=1)
+    m_hasParam = as.matrix(m_hasParam)
+    # compute the relevant index
+    index = m_hasParam * seq(1, nrow(m_hasParam))
+    index = as.scalar(removeEmpty(target = index, margin = "rows"))
+    indexes[k] = index
+    paramCount[k] = as.integer(as.scalar(hpList[index, 3]))
+    applyFunc[1, k] = as.scalar(hpList[index, 1])
+  }
+}
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index ebb8f60..3e34be8 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -56,8 +56,10 @@
 s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[String] pipeline, Matrix[Double] Xtrain,  Matrix[Double] Ytrain, 
   Matrix[Double] Xtest,  Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
   Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
-  return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest, Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+  return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
+    Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
 {
+
   mask=as.matrix(metaList['mask'])
   FD = as.matrix(metaList['fd'])
   applyFunc = as.frame(metaList['applyFunc'])
@@ -78,6 +80,7 @@ s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[Str
     Xclone = Xtrain
     XtestClone = Xtest
     [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op)
+    print("executing: ---------- "+toString(op))
     if(executeFlag == 1) {
       L = evalList(op, hp)
       [L, O] = remove(L, 1);
diff --git a/scripts/builtin/fixInvalidLengthsApply.dml b/scripts/builtin/fixInvalidLengthsApply.dml
index 3a530ae..99ca0b6 100644
--- a/scripts/builtin/fixInvalidLengthsApply.dml
+++ b/scripts/builtin/fixInvalidLengthsApply.dml
@@ -25,7 +25,7 @@
 # ----------------------------------------------------------------------------------------------------------------------
 # NAME     TYPE              DEFAULT   MEANING
 # ----------------------------------------------------------------------------------------------------------------------
-# X       Frame[Unknown]    ---
+# X       Frame[Unknown]     ---
 # mask     Matrix[Double]    ---
 # ql       Double            0.05
 # qu       Double            0.99
diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml
index aaf6cd3..2519198 100644
--- a/scripts/builtin/frameSort.dml
+++ b/scripts/builtin/frameSort.dml
@@ -26,7 +26,8 @@
 # ----------------------------------------------------------------------------------------------------------------------
 # NAME      TYPE             DEFAULT     MEANING
 # ----------------------------------------------------------------------------------------------------------------------
-# F         Frame[String]     ---        Data frame of string values
+# F         Frame[String]     ---       Data frame of string values
+# mask      Matrix[Double]   ---        matrix for identifying string columns
 # ----------------------------------------------------------------------------------------------------------------------
 #
 # OUTPUT:
@@ -36,12 +37,13 @@
 # f_odered  Frame[String]                sorted dataset by column 1 in decreasing order
 # ----------------------------------------------------------------------------------------------------------------------
 
-s_frameSort = function(Frame[String] F, Boolean orderDesc = TRUE )
+s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE )
 return (Frame[String] f_odered)
 {
-  idx = matrix(1, 1, ncol(F))
-  idx[1,1] = 0 # to save accuracy column from encoding 
-  index = vectorToCsv(idx)
+  # idx = matrix(1, 1, ncol(F))
+  # idx[1,1] = 0 # to save accuracy column from encoding 
+  index = vectorToCsv(mask)
+  print("framesort index: "+toString(index))
   # recode logical pipelines for easy handling
   jspecR = "{ids:true, recode:["+index+"]}";
   [X, M] = transformencode(target=F, spec=jspecR);
diff --git a/scripts/builtin/frequencyEncodeApply.dml b/scripts/builtin/frequencyEncodeApply.dml
index 5146536..a7e6a67 100644
--- a/scripts/builtin/frequencyEncodeApply.dml
+++ b/scripts/builtin/frequencyEncodeApply.dml
@@ -27,9 +27,10 @@ return (Matrix[Double] X) {
     if(sum(freqCount[i]) > 0)
     {
       Y = replace(target=X[, i], pattern=NaN, replacement=1)
-      valueCount = freqCount[i, 1:max(Y)]
-      resp = matrix(0, nrow(Y), max(Y))
-      resp = (resp + t(seq(1, max(Y)))) == Y
+      idx = min(ncol(freqCount), max(Y))
+      valueCount = freqCount[i, 1:idx]
+      resp = matrix(0, nrow(Y), idx)
+      resp = (resp + t(seq(1, idx))) == Y
       resp = resp * valueCount
       X[, i] = rowSums(resp)
     }
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index 946af13..45fd7be 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -57,7 +57,7 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
 
 s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
   Frame[Unknown] parameters, Matrix[Double] cmr = matrix("4 0.7 1", rows=1, cols=3), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, 
-  Integer resource_val = 20, Integer num_inst = 5, Double sample = 0.1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
+  Integer resource_val = 20, Integer num_inst = 5, Integer max_iter = 10, Double sample = 0.1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
   return(Boolean perf)
   # return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Frame[Unknown] bestLogical,
   # Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp)
@@ -110,32 +110,25 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, TRUE)
   t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
 
-  # # # create logical pipeline seeds
+  # # # create logical pipeline seeds 
   logicalSeedCI =  frame([
-                   "4", "ED", "MVI", "OTLR", "EC", "0", "0", "0", "0",
-                   "5", "ED", "EC", "SCALE", "CI","DUMMY","0", "0", "0", 
-                   "5", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0","0", "0",
-                   "8", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY",
-                   "5", "ED",  "MVI",  "SCALE", "CI", "DUMMY", "0", "0", "0",
-                   "4", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0", "0", 
-                   "6", "ED", "MVI", "EC", "SCALE", "CI", "DUMMY", "0", "0",
-                   "6", "MVI", "OTLR","EC", "SCALE", "CI", "DUMMY", "0", "0",
-                   "7", "OTLR", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0",
-                   "7", "ED", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0"
-                   ], rows=10, cols=9)  
+                   "4", "ED", "MVI", "OTLR", "EC",
+                   "2", "MVI", "DUMMY", "0","0",
+                   "2", "OTLR", "DUMMY","0","0", 
+                   "2", "CI", "DUMMY","0","0",
+                   "2", "SCALE", "DUMMY","0","0",
+                   "2", "ED", "DUMMY","0","0",
+                   "2", "EC", "DUMMY", "0","0"
+                   ], rows=7, cols=5)  
                    
   logicalSeedNoCI =  frame([
-                   "3", "ED", "MVI", "OTLR", "EC", "0", "0", "0",
-                   "4", "ED", "EC", "SCALE", "DUMMY","0", "0", "0", 
-                   "4", "OTLR", "EC", "SCALE", "DUMMY", "0","0", "0",
-                   "7", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "DUMMY",
-                   "4", "ED",  "MVI",  "SCALE", "DUMMY", "0", "0", "0",
-                   "3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0", 
-                   "5", "ED", "MVI", "EC", "SCALE", "DUMMY", "0", "0",
-                   "5", "MVI", "OTLR","EC", "SCALE", "DUMMY", "0", "0",
-                   "6", "OTLR", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0",
-                   "6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0"
-                   ], rows=10, cols=8)  
+                   "4", "ED", "MVI", "OTLR", "EC",
+                   "2", "MVI", "DUMMY", "0","0",
+                   "2", "OTLR", "DUMMY","0","0", 
+                   "2", "SCALE", "DUMMY","0","0",
+                   "2", "ED", "DUMMY","0","0",
+                   "2", "EC", "DUMMY", "0","0"
+                   ], rows=6, cols=5)  
                    
   if(min(eYtrain) >= 1) {
     tab = table(eYtrain, 1)
@@ -150,10 +143,10 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   category = logical[1, 2:idx]
   
   print("-- Cleaning - Enum Logical Pipelines: ");
-  [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, cmr=cmr, 
-    cat=category, population=logical[2:nrow(logical)], max_iter=ceil(resource_val/topK), metaList = metaList,
+  [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
+    cat=category, population=logical[2:nrow(logical),], max_iter=max_iter, metaList = metaList,
     evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
-    num_inst=num_inst, num_exec=ceil(resource_val/topK), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
+    num_inst=num_inst, cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
   t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
   # bestLogical = frame(["MVI", "OTLR", "DUMMY"], rows=1, cols=3)
   topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
@@ -210,7 +203,7 @@ runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[
 return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
 {
   if(cv)
-    [Xtrain, Xtest] = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
+    Xtrain = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
   else
   {
     # # # binding train and test to use same dictionary for both
diff --git a/scripts/pipelines/properties/param.csv b/scripts/pipelines/properties/param.csv
index 5998339..bee6a32 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -1,22 +1,22 @@
-applyName,name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,dt1,dt2,dt3,dt4,st1,en1,st2,en2,st3,en3,st4,en4
-outlierByIQRApply,outlierByIQR,3,0,0,0,1,0,FP,INT,INT,1,7,2,2,1,1,,,
-outlierBySdApply,outlierBySd,3,0,0,0,1,0,INT,INT,INT,1,7,1,2,2,1,,,
-winsorizeApply,winsorize,2,0,0,0,1,0,FP,FP,0.01,0.05,0.95,1,,,,,,
-normalizeApply,normalize,0,0,0,0,0,0,,,,,,,,,,,,
-imputeByMeanApply,imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,
-imputeByMedianApply,imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,
-miceApply,mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1,,,,,,
-,abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,,,,
-,flipLabels,2,0,0,1,1,2,FP,INT,0.6,0.9,1,20,,,,,,
-,SMOTE,1,1,0,1,1,2,INT,100,500,,,,,,,,,
-pca_predict,pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0,,,
-,ppca,4,0,0,0,1,2,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01
-fillDefaultApply,fillDefault,0,0,0,0,0,2,,,,,,,,,,,,
-dummycodingApply,dummycoding,0,1,0,0,0,2,,,,,,,,,,,,
-frequencyEncodeApply,frequencyEncode,0,1,0,0,0,2,,,,,,,,,,,,
-WoEApply,WoE,0,1,0,1,0,2,,,,,,,,,,,,
-scaleApply,scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,,,,
-forward_fill,forward_fill,1,0,0,0,1,2,BOOL,0,1,,,,,,,,,
-imputeByFdApply,imputeByFd,1,0,1,0,0,1,FP,0.6,0.9,,,,,,,,,
-,tomeklink,0,0,0,1,0,2,,,,,,,,,,,,
-,underSampling,1,0,0,1,0,2,FP,0.1,0.6,,,,,,,,,
+applyName,name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,default1,default2,default3,default4,dt1,dt2,dt3,dt4,st1,en1,st2,en2,st3,en3,st4,en4
+outlierByIQRApply,outlierByIQR,3,0,0,0,1,0,1.5,2,1,,FP,INT,INT,1,7,2,2,1,1,,,
+outlierBySdApply,outlierBySd,3,0,0,0,1,0,3,2,1,,INT,INT,INT,1,7,1,2,2,1,,,
+winsorizeApply,winsorize,2,0,0,0,1,0,0.05,0.95,,,FP,FP,0.01,0.05,0.95,1,,,,,,
+normalizeApply,normalize,0,0,0,0,0,0,,,,,,,,,,,,,,,,
+imputeByMeanApply,imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+imputeByMedianApply,imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+miceApply,mice,2,1,0,0,1,2,3,0.9,,,INT,FP,1,3,0.5,1,,,,,,
+,abstain,1,0,0,1,1,2,0.75,,,,FP,0.6,0.8,,,,,,,,,
+,flipLabels,2,0,0,1,1,2,0.75,5,,,FP,INT,0.6,0.9,1,20,,,,,,
+,SMOTE,1,1,0,1,1,2,200,,,,INT,100,500,,,,,,,,,
+pca_predict,pca,3,0,0,0,0,2,10,1,0,,INT,BOOL,BOOL,100,200,0,1,0,0,,,
+,ppca,4,0,0,0,1,2,5,10,0.000001,0.02,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01
+fillDefaultApply,fillDefault,0,0,0,0,0,2,,,,,,,,,,,,,,,,
+dummycodingApply,dummycoding,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+frequencyEncodeApply,frequencyEncode,0,1,0,0,0,2,,,,,,,,,,,,,,,,
+WoEApply,WoE,0,1,0,1,0,2,,,,,,,,,,,,,,,,
+scaleApply,scale,2,0,0,0,0,0,1,0,,,BOOL,BOOL,0,1,0,1,,,,,,
+forward_fill,forward_fill,1,0,0,0,1,2,1,,,,BOOL,0,1,,,,,,,,,
+imputeByFdApply,imputeByFd,1,0,1,0,0,1,0.8,,,,FP,0.6,0.9,,,,,,,,,
+,tomeklink,0,0,0,1,0,2,,,,,,,,,,,,,,,,
+,underSampling,1,0,0,1,0,2,0.2,,,,FP,0.1,0.6,,,,,,,,,
diff --git a/scripts/pipelines/properties/properties.csv b/scripts/pipelines/properties/properties.csv
deleted file mode 100644
index e2d9f1a..0000000
--- a/scripts/pipelines/properties/properties.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-outlierByIQR,3,1.5,1,3.5,1,1,2,20,10,40
-outlierBySd,3,1,1,3,1,1,2,20,10,40
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 66b2310..7bd11d6 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -52,19 +52,20 @@
 source("scripts/builtin/bandit.dml") as bandit;
 
 enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest,
-  Matrix[Double] cmr, Frame[Unknown] cat, Frame[Unknown] population, Integer max_iter=10, List[Unknown] metaList, 
+  Frame[Unknown] cat, Frame[Unknown] population, Integer max_iter=10, List[Unknown] metaList, 
   String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] primitives, Frame[Unknown] param,
-  Integer num_inst, Integer num_exec, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----"))
+  Integer num_inst, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----"))
 return (Frame[Unknown] bestLg, Double pre_best)
 {
+
+  num_exec = 1
   prefix = as.scalar(ctx["prefix"]);  
   bestLg = as.frame("")
   best_score = 0.0
   pre_best = 0.0
   iter = 1
-  convergedOuter = FALSE
 
-  while(iter <= max_iter & !convergedOuter)
+  while(as.scalar(population[1, 1]) > 0 & iter < max_iter)
   {
     print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" );
     physicalPipList = list();
@@ -72,40 +73,49 @@ return (Frame[Unknown] bestLg, Double pre_best)
     
     # get the physical instances from logical ones
     # unrolled by physical pipelines
-    ppos = list();
+    max_confR = 0
+    max_confC = 0
+    start = 1; 
+    end = 0;
     for(i in 1:nrow(population)) { 
       lv = as.integer(as.scalar(population[i, 1])) + 1
       lp = population[i, 2:lv]
-      pconf = bandit::get_physical_configurations(lp, num_inst, primitives)
-      for(j in 1:nrow(pconf)) {
-        physicalPipList = append(physicalPipList, pconf[j,]);
-        ppos = append(ppos, i)
-      }
+      pconf = bandit::get_physical_configurations(lp, 0, primitives)
+      max_confR = ifelse(max_confR < nrow(pconf), nrow(pconf), max_confR)
+      max_confC = ifelse(max_confC < ncol(pconf), ncol(pconf), max_confC)
+      physicalPipList = append(physicalPipList, pconf);
       logicalPipList = append(logicalPipList, lp);
+
     }
-    
+    # print("pipeline Frame: "+toString(pipelineFrame))
     # # # execute the physical pipelines
-    scores = matrix(0, nrow(physicalPipList), 1)
-    parfor(i in 1:length(physicalPipList)) {
-      lp2 = as.frame(logicalPipList[as.scalar(ppos[i]),])
+    scores = matrix(0, rows=nrow(population) * max_confR, cols=2)
+    start = 1; 
+    end = 0;
+    pipelineFrame = frame(0, rows=length(physicalPipList) * max_confR, cols=max_confC)
+    parfor(i in 1:length(physicalPipList), check=0) {
+      lp2 = as.frame(logicalPipList[i,])
       pp2 = as.frame(physicalPipList[i,])
       # # append configuration keys for extracting the pipeline later on
       id = seq(1, nrow(pp2))
       idpp = cbind(as.frame(id), pp2)
       # # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times
       [outPip, outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp2, idpp, num_exec, X, y, Xtest, ytest, metaList,
-        evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, FALSE)
+        evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, TRUE)
       # # sort the configurations groupwise
-      max_perf = bandit::getMaxPerConf(outPip, nrow(pp2)) 
-      scores[i,1] = as.matrix(max_perf[1,1])
+      end = end + nrow(outPip)
+      scores[start:end, 1] = outPip[, 1]
+      scores[start:end, 2] = matrix(i, rows=nrow(outPip), cols=1)
+      start = end + 1
     }
-    
+
     # # select parents and best score
-    selected = order(target = scores, by = 1, decreasing=TRUE, index.return=TRUE)
+    selected = order(target = scores[, 1], by = 1, decreasing=TRUE, index.return=TRUE)
     idxR = as.scalar(selected[1,1])
-    best_score = as.scalar(scores[idxR])
-    converged =  pre_best > best_score
-    convergedOuter = converged
+    best_score = as.scalar(scores[idxR, 1])
+    converged =  pre_best >= best_score
+    print("best score: "+best_score)
+    print("pre score: "+pre_best)
     if(converged & (iter > 1)) {
       print(prefix+"EnumLP: converged after "+iter+" iteration(s)")
       print(prefix+"EnumLP: best score " + pre_best)
@@ -113,38 +123,30 @@ return (Frame[Unknown] bestLg, Double pre_best)
     }
     else {
       pre_best = best_score
-      idxR2 = as.scalar(ppos[idxR]) #logical pipeline ID
-      idxC = as.integer(as.scalar(population[idxR2, 1])) + 1
-      bestLg = population[idxR2, 2:idxC]
+      bestLg = as.frame(logicalPipList[as.scalar(scores[idxR, 2])])
+      print("best logical: "+toString(bestLg))
     }
-    pipLength = max(as.matrix(population[, 1])) + as.scalar(cmr[1, 1]) + 3
+    pipLength = 10
     # # # if new best is not better than pre_best then no need od generating new population
-    children = frame(0, rows=ceil(nrow(scores)/2), cols=pipLength)
+    children = frame(0, rows=ceil(nrow(population)/2), cols=pipLength)
     i = 1
-
-    while(i <= ceil(nrow(scores)/2) & !converged) {
-      top = population[as.scalar(ppos[as.scalar(selected[i])]), ]
-      length_top = as.integer(as.scalar(top[1, 1]))
-      top = top[, 2:(length_top+1)]
-      
+    while(i <= ceil(nrow(population)/2) & ncol(population) < pipLength - 1) {
+      idxR = as.scalar(selected[i,1])
+      top = as.frame(logicalPipList[as.scalar(scores[idxR, 2])])
+      length_top = ncol(top)
       # generate children from crossover
-      c1 = addition(top, cat, as.scalar(cmr[1,1]))
-
-      # perform mutation
-      c1 = mutation(c1, as.scalar(cmr[1, 2]))
-
-      # perform removal if non-zero
-      c1 = removal(c1, as.scalar(cmr[1, 3]))
+      c1 = addition(top, cat, 1) #i%%(pipLength-1)
 
       # # # append length of pipeline and pipeline in frame
       children[i, 1] = ncol(c1)
       children[i, 2:(ncol(c1) + 1)] = c1
+      
       i = i + 1
     }
     population = children
     iter  = iter + 1
   }
-  if(pre_best == best_score) {
+  if(pre_best < best_score) {
     print(prefix+" EnumLP did not converge after "+max_iter+" iterations")  
   }
 }
@@ -156,7 +158,7 @@ return (Frame [Unknown] child)
   for(i in 1:addCount)
   {
     c = as.scalar(sample(ncol(allOps), 1))
-    place_to_add = as.scalar(sample(ncol(top)-2, 1))
+    place_to_add = as.scalar(sample(ncol(top)+2, 1))
     if(place_to_add == 1)
       child = cbind(allOps[1, c], top)
     else if(place_to_add >= ncol(top))
@@ -167,41 +169,17 @@ return (Frame [Unknown] child)
       end = top[, place_to_add+1:ncol(top)]
       child = cbind(cbind(start, allOps[1, c]), end)
     }
+    top = child
   }
-}
-
-mutation = function(Frame[Unknown] child, Double mutationRate)
-return (Frame [Unknown] mChild)
-{
-  random = as.scalar(rand(rows=1, cols=1))
-  if(random > mutationRate & ncol(child) >= 3)
+  hasDummy = map(child, "x -> x.equals(\"DUMMY\")")
+  hasDummy = as.matrix(hasDummy == frame("true", rows=1, cols=ncol(hasDummy)))
+  if(sum(hasDummy) > 0 & as.scalar(hasDummy[1, ncol(hasDummy)]) != 1)
   {
-    r = sample(ncol(child) - 2, 2)
-    r1 = as.scalar(r[1,1])
-    r2 = as.scalar(r[2,1])
-    temp = child[1, r1]
-    child[1, r1] = child[1, r2]
-    child[1, r2] = temp
+    # place the dummycode in last
+    idx = as.scalar(removeEmpty(target = hasDummy*t(seq(1, ncol(hasDummy))), margin = "cols"))
+    tmp = child[1, idx]
+    child[1, idx] = child[1, ncol(child)]
+    child[1, ncol(child)] = tmp
   }
-  mChild = child
 }
 
-removal = function(Frame[Unknown] child, Integer removal)
-return (Frame[Unknown] output)
-{
-  if(ncol(child) > 2 & (ncol(child)-2) > removal & removal > 0)
-  {
-    for(i in 1:removal)
-    {
-      idx = as.scalar(sample(ncol(child)-3, 1))
-      if(idx == 1)
-        ch = child[, 2:ncol(child)]
-      else if (idx == ncol(child))
-        ch = child[, 1:ncol(child)-1]
-      else 
-        ch = cbind(child[, 1:(idx-1)], child[,(idx+1):ncol(child)])
-      child = ch
-    }
-  }
-  output = child
-}
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 0658abc..6e0a28d 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -160,26 +160,40 @@ return(Frame[Unknown] train, Frame[Unknown] test, Matrix[Double] M)
   print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
 
   [train, mask, qlow, qup] = fixInvalidLengths(train, mask, q0, q1)
-  test = fixInvalidLengthsApply(test, mask, qlow, qup)
 
   
   # step 2 fix swap values
   print(prefix+" value swap fixing");
   train = valueSwap(train, schema)
-  test = valueSwap(test, schema)
+  if(length(test) > 0)
+
   
   # step 3 drop invalid types
   print(prefix+" drop values with type mismatch");
   train = dropInvalidType(train, schema)
-  test = dropInvalidType(test, schema)
+
   
   # step 4 do the case transformations
   print(prefix+" convert strings to lower case");
   train = map(train, "x -> x.toLowerCase()")
-  test = map(test, "x -> x.toLowerCase()")
 
 
-  # step 5 typo correction  
+
+  # step 5 porter stemming on all features
+  print(prefix+" porter-stemming on all features");
+  train = map(train, "x -> PorterStemmer.stem(x)", 0)
+
+  
+  if(length(test) > 0)
+  {
+    test = fixInvalidLengthsApply(test, mask, qlow, qup)
+    test = valueSwap(test, schema)
+    test = dropInvalidType(test, schema)
+    test = map(test, "x -> x.toLowerCase()")
+    test = map(test, "x -> PorterStemmer.stem(x)", 0)
+  }
+ 
+  # step 6 typo correction  
   if(CorrectTypos)
   {
     print(prefix+" correct typos in strings");
@@ -187,13 +201,10 @@ return(Frame[Unknown] train, Frame[Unknown] test, Matrix[Double] M)
     for(i in 1:ncol(schema))
       if(as.scalar(schema[1,i]) == "STRING") {
         [train[, i], ft, dt, dm, fr] = correctTypos(train[, i], 0.2, 0.9, FALSE);
-        test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
+        if(length(test) > 0)
+          test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
       }
   }
-  # step 6 porter stemming on all features
-  print(prefix+" porter-stemming on all features");
-  train = map(train, "x -> PorterStemmer.stem(x)", 0)
-  test = map(test, "x -> PorterStemmer.stem(x)", 0)
   
   # TODO add deduplication
   print(prefix+" deduplication via entity resolution");
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index a26cb5e..3b7a684 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -38,7 +38,7 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
 
 	private static final String PARAM_DIR = "./scripts/pipelines/properties/";
 	private final static String PARAM = PARAM_DIR + "param.csv";
-	private final static String PRIMITIVES = PARAM_DIR + "testPrimitives.csv";
+	private final static String PRIMITIVES = PARAM_DIR + "primitives.csv";
 
 	@Override
 	public void setUp() {
@@ -79,8 +79,8 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
 			loadTestConfiguration(getTestConfiguration(TEST_NAME));
 			fullDMLScriptName = HOME + TEST_NAME + ".dml";
 			programArgs = new String[] { "-stats", "-exec", "singlenode", "-nvargs", "dirtyData="+data,
-				"metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "num_inst=5",
-				"sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")};
+				"metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "num_inst=0",
+				"max_iter="+3, "sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")};
 
 			runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
 
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
index f2e873c..71160b7 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkEvaluateTest.java
@@ -25,6 +25,7 @@ import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
 import org.junit.Assert;
 import org.junit.Ignore;
+import org.junit.Test;
 
 public class BuiltinTopkEvaluateTest extends AutomatedTestBase {
 	//	private final static String TEST_NAME1 = "prioritized";
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
index 087feab..b47a6b4 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
@@ -65,7 +65,7 @@ public class BuiltinTopkLogicalTest extends AutomatedTestBase {
 
 	private void runTestLogical(int max_iter,  int num_inst, int num_exec,  Types.ExecMode et) {
 
-		setOutputBuffering(true);
+//		setOutputBuffering(true);
 
 		String HOME = SCRIPT_DIR+"functions/pipelines/" ;
 		Types.ExecMode modeOld = setExecMode(et);
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index d323902..30f196d 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-90.990990990991
-90.990990990991
-90.990990990991
+90.09009009009009
+89.1891891891892
+89.1891891891892
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 791a2c4..3102ff5 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-32.0,2.0,0.04942508842239585,0.9690338275332404,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.6041981259130369,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.025181804564039616,0.9961713994683723,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.5749065843221863,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.030109393540493433,0.9774428031375582,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.27268133865163424,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+18.0,3.0,1.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0
+16.0,2.0,0.011239685157868542,0.9882169781390451,0,0,0,1.0,0,0,0,0,1.0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0
+16.0,2.0,0.031106506106547423,0.9916418186198904,0,0,0,1.0,0,0,0,0,1.0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
index 549517c..1dd9f30 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv
@@ -1 +1 @@
-OTLR,SCALE,CI,DUMMY
+OTLR,DUMMY
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index bceea73..416f64a 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-winsorize,scale,abstain,dummycoding
-winsorize,scale,underSampling,dummycoding
-winsorize,scale,underSampling,dummycoding
+2.0,outlierBySd,frequencyEncode,outlierBySdApply,frequencyEncodeApply
+2.0,winsorize,WoE,winsorizeApply,WoEApply
+2.0,winsorize,WoE,winsorizeApply,WoEApply
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index 3213cdd..a8ac1cb 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -61,7 +61,7 @@ else
 eY = eX[, ncol(eX)]
 eX = eX[, 1:ncol(eX) - 1]
 
-print("y classes \n"+toString(table(eY, 1)))
+
 getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
 getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
@@ -69,29 +69,24 @@ getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
 metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"))
 
 logical =  frame([
-                 "6", "MVI", "OTLR", "ED", "EC", "CI", "DUMMY", 
-                 "4", "ED",  "MVI",  "CI", "DUMMY", "0", "0"
-                 ], rows=2, cols=7) 
-
-
+                 "2", "MVI", "DUMMY", 
+                 "2", "ED", "DUMMY",
+                 "2", "OTLR", "DUMMY", 
+                 "2", "EC", "DUMMY"
+                 ], rows=4, cols=3) 
 
 categories = frame(["ED", "MVI", "OTLR", "EC"], rows=1, cols=4)
-cmr = matrix("4 0.7 1", rows=1, cols=3)
+
 
 # doing holdout evaluation
 
 [trainX, trainY, testX, testY] = splitBalanced(eX, eY, trainTestSplit, FALSE)
-# split = nrow(eX) * trainTestSplit
-# trainX = eX[1:split,]
-# trainY = eY[1:split,]
-# testX = eX[split+1:nrow(eX),]
-# testY = eY[split+1:nrow(eY),]
 
 
-[bestLogical, score] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,  cmr=cmr, 
-  cat=categories, population=logical, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
+[bestLogical, score] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY, cat=categories,
+  population=logical, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
   evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives, param=param,
-	num_inst=num_inst, num_exec=num_exec, cv=FALSE, verbose=TRUE)
+	num_inst=num_inst, cv=FALSE, verbose=TRUE)
 
 print("score of pipeline: "+toString(score))
 print("bestLogical "+toString(bestLogical))
@@ -101,9 +96,6 @@ print("result satisfied ------------"+result)
 write(result , $O)
 
 
-
-# UDF for evaluation  
-# choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
 # UDF for evaluation  
 # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
 evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 356ae22..18a725a 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -33,6 +33,7 @@ topK = $topk
 resources = $rv
 num_inst=$num_inst
 sample=$sample
+max_iter=$max_iter
 output=$output
 testCV = as.logical($testCV)
 cvk = as.integer($cvk)
@@ -42,7 +43,7 @@ split = nrow(F) * trainTestSplit
 if(testCV) {
 
   trainData = F
-  testData = as.frame("0")
+  testData = frame("", rows=0, cols=0)
 }
 else {
 
@@ -58,8 +59,8 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)]
 
 # [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] = 
 result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param,
-  cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),
-  topK=topK, resource_val=resources, num_inst=num_inst, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) 
+  evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources,
+  num_inst=num_inst, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) 
 
 write(result, $O)
 
diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
index 7682ae4..aef89e5 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
@@ -39,7 +39,7 @@ split = nrow(F) * trainTestSplit
   evalFunc = "evalRegression"
 if(testCV) {
   trainData = F[1:split,]
-  testData = as.frame("0")
+  testData = frame("", rows=0, cols=0)
 }
 else {
   trainData = F[1:split,]
@@ -49,7 +49,7 @@ else {
 # # # split in train/test 70/30
 
 result = topk_cleaning(dataTrain=trainData, dataTest=testData, 
-  primitives=primitives, parameters=param, cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4),
+  primitives=primitives, parameters=param, evaluationFunc=evalFunc, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4),
   topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)