Posted to commits@systemds.apache.org by ss...@apache.org on 2021/05/30 17:31:11 UTC

[systemds] branch master updated: [SYSTEMDS-2965] Cleaning Pipelines: Refactor and string operations support

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new cb46197  [SYSTEMDS-2965] Cleaning Pipelines: Refactor and string operations support
cb46197 is described below

commit cb46197b74a6ca62cbd877bf4c087ef6755b2092
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Sun May 30 22:24:45 2021 +0200

    [SYSTEMDS-2965] Cleaning Pipelines: Refactor and string operations support
    This commit contains the following cleanups and additions:
    - Removal of the now-unnecessary scripts gridsearchMLR.dml and logicalFunc.dml:
      the gridSearch builtin has been generalized to cover both lm and multiLogReg (16f4191, sketched below),
      and enumerateLogical.dml now covers the logical enumeration, so logicalFunc.dml is no longer needed.
    - Minor formatting fixes in the dml files of the pipelines package.
    - Addition of a basic string processing function in utils.dml that applies string operations to a frame before the data is recoded.
    - Fixes in enumerateLogical.dml to introduce more randomness into the enumeration.
    
    Closes #1292.
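
    For illustration, a minimal sketch of the generalized gridSearch call that
    replaces gridsearchMLR.dml (data and parameter ranges here are hypothetical;
    the same call pattern appears in getOpByTarget in utils.dml below, and
    "accuracy" is a caller-defined scoring function as in GridSearchMLogreg.dml):

      X = rand(rows=100, cols=5)
      y = round(rand(rows=100, cols=1, min=1, max=3))
      params = list("icpt", "reg", "maxii")
      paramRanges = list(10^seq(0,-4), 10^seq(1,-6), 10^seq(1,3))
      trainArgs = list(X=X, Y=y, icpt=-1, reg=-1, tol=1e-9, maxi=100, maxii=-1, verbose=FALSE)
      [B, opt] = gridSearch(X=X, y=y, train="multiLogReg", predict="accuracy", numB=ncol(X)+1,
        params=params, paramValues=paramRanges, trainArgs=trainArgs, verbose=FALSE)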
---
 scripts/builtin/bandit.dml                         |  35 +--
 scripts/builtin/executePipeline.dml                |  54 +++-
 scripts/builtin/imputeByMean.dml                   |   2 +-
 scripts/builtin/imputeByMedian.dml                 |   1 +
 scripts/pipelines/properties/param.csv             |   4 +-
 scripts/pipelines/properties/primitives.csv        |   5 +-
 scripts/pipelines/properties/testPrimitives.csv    |   3 +
 scripts/pipelines/scripts/enumerateLogical.dml     | 180 ++++++++-----
 scripts/pipelines/scripts/enumerator.dml           | 293 ---------------------
 scripts/pipelines/scripts/gridsearchMLR.dml        |  82 ------
 scripts/pipelines/scripts/logicalFunc.dml          | 114 --------
 scripts/pipelines/scripts/utils.dml                |  69 ++++-
 .../pipelines/CleaningTestClassification.java      |  27 +-
 .../functions/pipelines/CleaningTestCompare.java   |   2 +-
 .../functions/pipelines/CleaningTestLogical.java   |  31 ++-
 .../functions/builtin/GridSearchMLogreg.dml        |   2 +-
 .../functions/pipelines/compareAccuracy.dml        |  18 +-
 .../pipelines/intermediates/hyperparams.csv        |  10 +-
 .../functions/pipelines/intermediates/logical.csv  |   1 +
 .../pipelines/intermediates/pipelines.csv          |  10 +-
 .../functions/pipelines/testClassification.dml     |  76 ++----
 .../scripts/functions/pipelines/testCompare.dml    |  41 +--
 .../scripts/functions/pipelines/testLogical.dml    |  52 ++--
 23 files changed, 377 insertions(+), 735 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index bcf7f94..5c24dab 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -20,7 +20,8 @@
 #-------------------------------------------------------------
 
 m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown] metaList, List[Unknown] targetList,
-  Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param,  Integer k = 3, Integer R=50, Boolean verbose = TRUE)
+  Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param,  Integer k = 3, Integer R=50,
+  Boolean verbose = TRUE)
   return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,  Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter) 
 {
   print("Starting optimizer")
@@ -42,7 +43,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
   startOut=0; endOut=0;
  feaFrameOuter = frame(data=["#MissingValues", "MinVal", "MaxVal", "AverageMin", "AverageMax", 
   "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers", "#OHEfeatures", "#Classes",
-  "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in ms", "CV time in ms"], rows = 1, cols = NUM_FEATURES + 4 )
+  "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in ms", "CV time in ms"],
+  rows = 1, cols = NUM_FEATURES + 4 )
 
   for(s in s_max:0) {
     
@@ -83,7 +85,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
       # sort the pipelines by order of accuracy decreasing
       a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
       b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
-      rowIndex = ifelse(nrow(a) >= k, k, nrow(a))
+      rowIndex = min(k, nrow(a))
 
       # maintain the brackets results
       end = end + rowIndex
@@ -123,9 +125,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
   }
 }
 
-
 # this method will extract the physical pipelines for a given logical pipeline
-
 get_physical_configurations = function(Frame[String] logical, Scalar[int] numConfigs, 
   Frame[Unknown] primitives)
   return(Frame[String] physical, Double min)
@@ -140,7 +140,7 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
   dummy = primitives[,6]
   scale = primitives[,7]
  
-  operator = frame(0, rows=nrow(primitives), cols=ncol(logical)) #as.frame(matrix(0,nrow(outliers),1)) #combine all logical primitives
+  operator = frame(0, rows=nrow(primitives), cols=ncol(logical))  # combine all logical primitives
   for(j in 1:ncol(logical))
   {
     # extract the physical primitives
@@ -198,8 +198,9 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
 }
 
 # this method will call executePipeline with the respective hyper-parameters
-run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y, List[Unknown] metaList, 
-   List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean verbose)                    
+run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y,
+  List[Unknown] metaList, List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown] featureFrameOuter,
+  Boolean verbose)
   return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Frame[Unknown] featureFrameOuter) {
 
   output_hp = matrix(0, nrow(ph_pip)*r_i, 100)
@@ -419,7 +420,7 @@ extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam,
 
 # extract the top k pipelines for each bracket, the intermediate results
 extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, 
-  Integer k, Frame[Unknown] conf)
+  Integer k, Frame[String] conf)
   return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
 {
   # bestPipeline = frameSort(bestPipeline)
@@ -429,13 +430,13 @@ extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperpa
 
   pipeline = pipeline[1:rowIndex,]
   bestHyperparams = hyperparam[1:rowIndex,]
-  bestPipeline = frame(data="|", rows=nrow(pipeline), cols=ncol(conf)-1)
+  bestPipeline = frame(data="|", rows=nrow(pipeline), cols=ncol(conf))
   for(i in 1: nrow(pipeline)) {
     index = as.scalar(pipeline[i, 3])
-    bestPipeline[i, 1:ncol(bestPipeline)] = conf[index, 2:ncol(conf)]
-  }
-  bestPipeline = cbind(as.frame(pipeline[, 1]),  bestPipeline)
-  
+    out = conf[index, 2:ncol(conf)]
+    bestPipeline[i, 1] = as.frame(pipeline[i, 1])
+    bestPipeline[i, 2:ncol(bestPipeline)] = out
+  }  
 }
 
 ###########################################################################
@@ -523,8 +524,8 @@ return (Matrix[Double] accuracyMatrix, Double T)
     trainy = trainset[, 1]
     testX = testset[, 2:ncol(testset)]
     testy = testset[, 1]
-    beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), tol= 1e-9, 
-    maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
+    beta = multiLogReg(X=trainX, Y=trainy, icpt=as.scalar(MLhp[1,1]), reg=as.scalar(MLhp[1,2]), tol= 1e-9, 
+    maxi=as.scalar(MLhp[1,3]), maxii= 50, verbose=FALSE);
     [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
     accuracy = getAccuracy(testy, yhat, isWeighted)
     accuracyMatrix[i] = accuracy
@@ -624,4 +625,4 @@ return (String s)
   for(i in 1:ncol(F))
     s = s + as.scalar(F[,i])+";"
 
-}
+}
\ No newline at end of file
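
Note on the crossV change above: the model hyper-parameter vector MLhp now carries
three columns (icpt, reg, maxi) instead of two. A minimal sketch of the new layout,
with example values matching the updated calls in compareAccuracy.dml below
(trainX/trainy as produced by the fold split inside crossV):

  # col 1 = icpt, col 2 = reg, col 3 = maxi
  MLhp = matrix("0 0.000001 100", rows=1, cols=3)
  beta = multiLogReg(X=trainX, Y=trainy, icpt=as.scalar(MLhp[1,1]), reg=as.scalar(MLhp[1,2]),
    tol=1e-9, maxi=as.scalar(MLhp[1,3]), maxii=50, verbose=FALSE)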
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index b646464..1e1a656 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -225,12 +225,14 @@ return (Matrix[Double] dX_train) {
 imputeByFd = function(Matrix[Double] X, Matrix[Double] FD,  Double threshold)
 return (Matrix[Double] X_filled)
 {
-  
-  for(i in 1: nrow(FD))
+  if(sum(FD) > 0)
   {
-    for(j in 1:ncol(FD)) {
-      if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) & (sum(FD[, j]) != nrow(FD)))
-        X = imputeByFD(X, i, j, threshold, FALSE)
+    for(i in 1: nrow(FD))
+    {
+      for(j in 1:ncol(FD)) {
+        if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) & (sum(FD[, j]) != nrow(FD)))
+          X = imputeByFD(X, i, j, threshold, FALSE)
+      }
     }
   }
   X_filled = X
@@ -285,8 +287,6 @@ return (Matrix[Double] XY)
 }
 
 
-
-
 ########################################################
 # The function will replace the null with default values
 ########################################################
@@ -297,6 +297,46 @@ return(Matrix[Double] X){
   X = replace(target=X, pattern=NaN, replacement=max(X))
   Mask = Mask * defaullt
   X = X + Mask
+  print("fillDefault: no of NaNs "+sum(is.na(X)))
+}
+
+########################################################
+# A slightly changed version of PCA
+########################################################
+m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
+  return (Matrix[Double] Xout) 
+{
+  if(K < ncol(X)) {
+    N = nrow(X);
+    D = ncol(X);
+
+    # perform z-scoring (centering and scaling)
+    [X, Centering, ScaleFactor] = scale(X, center, scale);
+
+    # co-variance matrix
+    mu = colSums(X)/N;
+    C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+    # compute eigen vectors and values
+    [evalues, evectors] = eigen(C);
+
+    decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+    diagmat = table(seq(1,D),decreasing_Idx);
+    # sorts eigenvalues by decreasing order
+    evalues = diagmat %*% evalues;
+    # sorts eigenvectors column-wise in the order of decreasing eigenvalues
+    evectors = evectors %*% diagmat;
+
+    eval_dominant = evalues[1:K, 1];
+    evec_dominant = evectors[,1:K];
+
+    # Construct new data set by treating computed dominant eigenvectors as the basis vectors
+    Xout = X %*% evec_dominant;
+    Mout = evec_dominant;
+  }
+  else Xout = X
+
 }
 
 
+
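
A small usage sketch for the m_pca variant added above (random data, hypothetical
sizes); note that for K >= ncol(X) the input is returned unchanged:

  X = rand(rows=200, cols=10)
  Xred = m_pca(X=X, K=2, center=TRUE, scale=TRUE)  # 200 x 2 projection
  Xsame = m_pca(X=X, K=10)                         # K not < ncol(X): X returned as-is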
diff --git a/scripts/builtin/imputeByMean.dml b/scripts/builtin/imputeByMean.dml
index a1b8834..040d814 100644
--- a/scripts/builtin/imputeByMean.dml
+++ b/scripts/builtin/imputeByMean.dml
@@ -58,5 +58,5 @@ return(Matrix[Double] X)
   q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
     select=t(mask)), ncol(cX), ncol(X))
   X = (X_n %*% p) + (X_c %*% q)
-
+  print("imputeByMean: no of NaNs "+sum(is.na(X)))
 }
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index 73931b6..c6716c6 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -62,4 +62,5 @@ return(Matrix[Double] X)
   q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
     select=t(mask)), ncol(cX), ncol(X))
   X = (X_n %*% p) + (X_c %*% q)
+  print("imputeByMedian: no of NaNs "+sum(is.na(X)))
 }
\ No newline at end of file
diff --git a/scripts/pipelines/properties/param.csv b/scripts/pipelines/properties/param.csv
index c533e07..ecd239c 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -6,9 +6,9 @@ imputeByMean,0,1,0,0,0,2,,,,,,,,,
 imputeByMedian,0,1,0,0,0,2,,,,,,,,,
 mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1.0,,,
 abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,
-SMOTE,1,1,0,1,1,2,INT,100,200,,,,,,
+SMOTE,1,1,0,1,1,2,INT,100,500,,,,,,
 downSample,0,0,0,1,0,2,,,,,,,,,
-pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0
+m_pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0
 fillDefault,0,0,0,0,0,2,,,,,,,,,
 dummycoding,0,1,0,0,0,2,,,,,,,,,
 scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,
diff --git a/scripts/pipelines/properties/primitives.csv b/scripts/pipelines/properties/primitives.csv
index 98f3874..3bf1bdb 100644
--- a/scripts/pipelines/properties/primitives.csv
+++ b/scripts/pipelines/properties/primitives.csv
@@ -1,5 +1,6 @@
 OTLR,MVI,NR,CI,DIM,DUMMY,SCALE
-winsorize,imputeByMean,abstain,SMOTE,pca,dummycoding,scale
+winsorize,imputeByMean,abstain,SMOTE,m_pca,dummycoding,scale
 outlierBySd,imputeByMedian,,,,,
 outlierByIQR,mice,,,,,
-,fillDefault,,,,,
\ No newline at end of file
+,fillDefault,,,,,
+,imputeByFd,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/properties/testPrimitives.csv b/scripts/pipelines/properties/testPrimitives.csv
new file mode 100644
index 0000000..048b5b1
--- /dev/null
+++ b/scripts/pipelines/properties/testPrimitives.csv
@@ -0,0 +1,3 @@
+OTLR,MVI,NR,CI,DIM,DUMMY,SCALE
+winsorize,imputeByMean,abstain,SMOTE,m_pca,dummycoding,scale
+outlierBySd,imputeByMedian,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 1cb0ca3..090d48a 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -34,25 +34,28 @@
 # 11. n_pop, children created in each generation
 # output: best logical pipeline and evaluation time in ms
 
-
# The idea is to get the initial set of logical pipelines as the population, then get the num_inst physical pipelines for each
-# logical pipeline in population. Then execute these physical pipelines num_exec time were in each execution a random set of
-# hyperparameters is used to execute operators. The compute a score vector by storing the best score foreach logical pipeline in 
-# population. Sort the pipelines by score and take n_pop pipelines as parents for generating new population.
-# from the selected pipelines take a pair in each iteration as parent and generate a pair of children by doing crossover and mutation.
-# In crossover make a child by taking some operation from p1 and some operations from p2 and in mutation randomly swap the 
-# operators in children. There new children will be the population in next iteration. Repeat the process max_iter time.
-# Converge in between if the best_score of previous generation is better then best_score of new generation.
+# logical pipeline in the population. Then execute these physical pipelines num_exec times, where in each execution a
+# random set of hyperparameters is used to execute the operators.
+# Then compute a score vector by storing the best score for each logical pipeline in the population. Sort the pipelines
+# by score and take the top-scoring pipelines as parents for generating the new population: in each iteration take one
+# parent and generate a child by applying the transformations below.
+# In total, 3 transformations are performed to create new chromosomes (children):
+#   1. addition (crossover): add n operators from the operator pool at random positions of the parent.
+#   2. mutation: randomly swap two operators in the child, depending on the mutation rate.
+#   3. removal: remove n operators from the child.
+# These new children become the population of the next iteration.
+# Repeat the process max_iter times. Converge early if the best_score of the previous generation is better than the
+# best_score of the new generation.
 
 source("scripts/builtin/bandit.dml") as bandit;
 source("scripts/pipelines/scripts/utils.dml") as utils;
 
-enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Frame[Unknown] population, Integer max_iter=10,
-  Integer pipLength, List[Unknown] metaList, List[Unknown] targetList, Frame[Unknown] primitives, Frame[Unknown] param,
-  Integer num_inst, Integer num_exec, Integer n_pop, Boolean verbose)
+enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cmr, Matrix[Double] cat, Frame[Unknown] population,
+  Integer max_iter=10, List[Unknown] metaList, List[Unknown] targetList, Frame[Unknown] primitives, Frame[Unknown] param,
+  Integer num_inst, Integer num_exec, Boolean isTailed = TRUE, Boolean verbose)
 return (Frame[Unknown] bestLg, Double pre_best, Double T)
 { 
-
   t1 = time()
   bestLg = as.frame("")
   best_score = 0
@@ -81,6 +84,10 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
       # # append configuration keys for extracting the pipeline later on
       id = seq(1, nrow(physicalConf))
       physicalConf = cbind(as.frame(id), physicalConf)
+      if(isTailed) {
+        tail = frame(["dummycoding", "m_pca"], rows=nrow(physicalConf), cols=2)
+        physicalConf = cbind(physicalConf, tail)
+      }
       # # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times
       [outPip,outHp, feaFrameOuter] = bandit::run_with_hyperparam(physicalConf, num_exec, X, y, metaList,
         targetList, param, as.frame(""), verbose)
@@ -95,16 +102,22 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
     best_score = as.scalar(scores[idxR])
     if(verbose)
     {
-      print("best score "+best_score)
-      print("previous score "+pre_best)
+      print("best score " + best_score)
+      print("previous score " + pre_best)
+      print("bestLg " + toString(bestLg))
     }
-    
-    converge = ifelse(pre_best > best_score, TRUE, FALSE)
-    if(converge) {
-      convergedOuter = TRUE
-      print("----------- converged after "+iter+" iteration-------------")
-      print("best score "+pre_best)
-      print("best pipeline "+toString(bestLg))
+    converged = pre_best > best_score
+    convergedOuter = converged
+    if(converged)
+    {
+      if(isTailed)
+      {
+        lg_tail = frame(["DUMMY", "DIM"], rows=1, cols=2)
+        bestLg = cbind(bestLg, lg_tail)
+      }
+      print("converged after "+iter+" iteration(s)")
+      print("best score " + pre_best)
+      print("best pipeline " + toString(bestLg))
     }
     else 
     {
@@ -112,61 +125,102 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
       idxC = as.integer(as.scalar(population[idxR, 1])) + 1
       bestLg = population[idxR, 2:idxC]
     }
-    
+    pipLength = max(as.matrix(population[, 1])) + as.scalar(cmr[1, 1]) + 1
     # # # if new best is not better than pre_best then there is no need of generating a new population
-    children = frame(0, rows=n_pop, cols=pipLength+1)
-    CROSS_OVER_RATE = 2
+    children = frame(0, rows=ceil(nrow(scores)/2), cols=pipLength)
     i = 1
-    while(i <= n_pop & !converge)
+    while(i <= ceil(nrow(scores)/2) & !converged)
     {
-      p1 = population[as.scalar(selected[i]), ]
-      p2 = population[as.scalar(selected[i+1]), ]
-      lengthp1 = as.integer(as.scalar(p1[1, 1]))
-      lengthp2 = as.integer(as.scalar(p2[1, 1]))
-      p1 = p1[, 2:(lengthp1+1)]
-      p2 = p2[, 2:(lengthp2+1)]
-      # # # cross over, this constrained crossover will only add first operator from each parent to child
+      top = population[as.scalar(selected[i]), ]
+
+      length_top = as.integer(as.scalar(top[1, 1]))
+      top = top[, 2:(length_top+1)]
+      print("top 1 "+toString(top))
       
-      if(lengthp1 >= 5 & (lengthp1 + CROSS_OVER_RATE) < pipLength) #check if pipeline is less than 5 operation only crossover one 
-        c1 = cbind(p1[1,1:CROSS_OVER_RATE], p2)                    # operator so the probability of swapping pca and dummycoding is 
-      else if ((lengthp1 + 1) < pipLength)          # low and the crossover all should not exceed pipeline total length 
-        c1 = cbind(p1[1,1], p2)
-        
-      if(lengthp2 >= 5 & (lengthp2 + CROSS_OVER_RATE) < pipLength)
-        c2 = cbind(p2[1,1:CROSS_OVER_RATE], p1)
-      else if ((lengthp2 + 1) < pipLength)
-        c2 = cbind(p2[1,1], p1)
+      # generate children from crossover
+      c1 = addition(top, cat, as.scalar(cmr[1,1]))
+
+      # perform mutation
+      c1 = mutation(c1, as.scalar(cmr[1, 2]))
+
+
+      # perform removal if non-zero
+      c1 = removal(c1, as.scalar(cmr[1, 3]))
 
-      # # # mutation swap the operators at random positions if the length is greater than 5
-      if(ncol(c1) >= 5)
-      {
-        r = sample(3, 2)
-        r1 = as.scalar(r[1,1])
-        r2 = as.scalar(r[2,1])
-        temp = c1[1, r1]
-        c1[1, r1] = c1[1, r2]
-        c1[1, r2] = temp
-      }
-      if(ncol(c2) >= 5)
-      {
-        r = sample(3, 2)
-        r1 = as.scalar(r[1,1])
-        r2 = as.scalar(r[2,1])
-        temp = c2[1, r1]
-        c2[1, r1] = c2[1, r2]
-        c2[1, r2] = temp
-      }
       # # # append length of pipeline and pipeline in frame
       children[i, 1] = ncol(c1)
       children[i, 2:(ncol(c1) + 1)] = c1
-      children[i+1, 1] = ncol(c2)
-      children[i+1, 2:(ncol(c2) + 1)] = c2
 
-      i = i + 2
+      i = i + 1
     }
     population = children
+    iter  = iter + 1
   }
+
   T = floor((time() - t1) / 1e+6)
   print("time "+T+" ms")
 }
 
+
+addition = function(Frame[Unknown] top, Frame[Unknown] allOps, Integer addCount)
+return (Frame [Unknown] child)
+{
+  print("Starting addition")
+  for(i in 1:addCount)
+  {
+    c = as.scalar(sample(ncol(allOps), 1))
+    place_to_add = as.scalar(sample(ncol(top)+1, 1))
+    if(place_to_add == 1)
+      child = cbind(allOps[1, c], top)
+    else if(place_to_add >= ncol(top))
+      child = cbind(top, allOps[1, c])
+    else
+    {
+      start = top[, 1:place_to_add-1]
+      end = top[, place_to_add+1:ncol(top)]
+      child = cbind(cbind(start, allOps[1, c]), end)
+    }
+  }
+}
+
+mutation = function(Frame[Unknown] child, Double mutationRate)
+return (Frame [Unknown] mChild)
+{
+  print("Starting mutation on "+toString(child))
+  random = as.scalar(rand(rows=1, cols=1))
+  if(random > mutationRate & ncol(child) >= 2)
+  {
+    print("before mutation "+toString(child))
+    r = sample(ncol(child), 2)
+    r1 = as.scalar(r[1,1])
+    r2 = as.scalar(r[2,1])
+    temp = child[1, r1]
+    child[1, r1] = child[1, r2]
+    child[1, r2] = temp
+    print("after mutation "+toString(child))
+  }
+  mChild = child
+}
+
+removal = function(Frame[Unknown] child, Integer removal)
+return (Frame[Unknown] output)
+{
+  if(ncol(child) > 1 & ncol(child) > removal & removal > 0)
+  {
+    print("Starting removal on "+toString(child))
+    for(i in 1:removal)
+    {
+      idx = as.scalar(sample(ncol(child), 1))
+      if(idx == 1)
+        ch = child[, 2:ncol(child)]
+      else if (idx == ncol(child))
+        ch = child[, 1:ncol(child)-1]
+      else 
+        ch = cbind(child[, 1:(idx-1)], child[,(idx+1):ncol(child)])
+      child = ch
+    }
+  }
+  output = child
+  print("Starting removal on "+toString(output))
+}
+
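
Taken together, the three helpers above produce one child per selected parent. A
condensed sketch of a single generation step, where cmr holds [additions, mutation
rate, removals] as in enumerateLogical (the operator names are examples taken from
primitives.csv):

  top = frame(["winsorize", "imputeByMean", "dummycoding"], rows=1, cols=3)
  allOps = frame(["imputeByMean", "imputeByMedian", "winsorize", "outlierBySd", "abstain", "SMOTE"], rows=1, cols=6)
  c1 = addition(top, allOps, 1)  # insert 1 random operator at a random position
  c1 = mutation(c1, 0.7)         # possibly swap two operators, depending on the rate
  c1 = removal(c1, 1)            # drop 1 random operator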
diff --git a/scripts/pipelines/scripts/enumerator.dml b/scripts/pipelines/scripts/enumerator.dml
deleted file mode 100644
index 7a2390f..0000000
--- a/scripts/pipelines/scripts/enumerator.dml
+++ /dev/null
@@ -1,293 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-
-enumerator = function(Matrix[Double] X, Matrix[Double] Y, Frame[String] logical, Frame[String] outlierPrimitives, 
-               Frame[String] mviPrimitives, Frame[String] param, Integer k, Boolean verbose = TRUE)
-return(Frame[String] Kpipeline)
-{
-  for(i in 1:2) {#nrow(logical) 
-    operator = as.frame(matrix(0,nrow(outlierPrimitives),1)) #combine all logical primitives
-    for(j in 1:ncol(logical))
-    {
-      if(as.scalar(logical[i,j]) == "outlier")
-        operator = cbind(operator, outlierPrimitives);
-      else if(as.scalar(logical[i,j]) == "MVI")
-        operator = cbind(operator, mviPrimitives);
-    }
-    operator = operator[,2:ncol(operator)]
-    intermediates = getPermutations(operator) # get the all possible combination of physical primitives
-                                                   # for ith logical pipeline 
-    if(verbose)
-      print(" pipelines \n"+toString(intermediates))
-    [p, h] = executeAll(X, Y,intermediates, param, verbose);
-
-    Kpipeline = getFinalTopK(p, h, k)
-    print("top k pipelines of "+i+"th logical pipeline "+toString(Kpipeline))
-    # str = "top k pipelines of iteration "+i
-    # str = append(str, toString(Kpipeline))
-  }
-  # if(verbose)
-  # print("final top k pipelines \n"+toString(Kpipeline))
-  # write(str, "D:/Workspace/Pipelines/output/kpipeline.txt")
-}  
-
-# The pipeline execution functions 
-###################################################
-executeAll = function(Matrix[Double] X, Matrix[Double] Y, Frame[String] intermediates,  Frame[String] param, Boolean verbose)
-return(Frame[String] opt, Matrix[Double] hyper_param)
-{
-
-
-  clone_X = X;
-  # initialize output variables
-  opt = as.frame("NA")
-  hyper_param = matrix(0,0,1)
-  if(verbose)
-    print("total pipelines to be executed "+nrow(intermediates))
-  for(i in 1:nrow(intermediates)) {
-    paraList = list()
-    op = intermediates[i,]
-
-    paraList = getInstanceParam(op, param)
-    sum = 1
-    print("executing "+toString(op))
-    while(sum > 0) #condition to terminate when all hyper parameters are executed
-    {
-      paramL = list()
-      hp_temp = matrix(0,1,0)
-      opt_temp = op
-      for(j in 1: length(paraList))
-      {
-        singleHp = as.matrix(paraList[j])
-        hp_temp = cbind(hp_temp, as.matrix(ncol(singleHp)))
-        hp_temp = cbind(hp_temp, singleHp[1,])
-        paramL = append(paramL, singleHp[1, ])
-        if(nrow(singleHp) > 1)
-        {
-          singleHp = singleHp[2:nrow(singleHp),]
-          paraList[j] = singleHp
-          sum = sum(singleHp)
-        }
-      }
-      X = executePipeline(op, X, paramL, FALSE)
-      data = cbind(Y, X)
-      acc = eval("fclassify", data)
-      hp_temp = cbind(hp_temp, acc)
-      X = clone_X
-      if(as.scalar(opt[1,1]) == "NA" & nrow(hyper_param) == 0)
-      {
-        opt = opt_temp
-        hyper_param = hp_temp
-      }
-      else {
-        opt = rbind(opt, opt_temp)
-        hyper_param = rbind(hyper_param, hp_temp)
-      }
-    }
-    X = clone_X
-  }
-}
- 
-
-# The below functions will generate the all possible 
-# combinations for different hyper parameter values
-###################################################
-getInstanceParam = function(Frame[String] instance, Frame[String] param )
-return(list[Unknown] L)
-{
-  L = list();
-
-  parameters = matrix(0,0,1)
-  hpNum = matrix(0,1,ncol(instance))
-  for(i in 1:ncol(instance)) {
-    pVector = matrix(0,1,ncol(param))
-    coeff = as.scalar(instance[1,i])
-    for(j in 1:nrow(param)) {
-      if(as.scalar(instance[1,i]) == as.scalar(param[j,1]))
-        pVector = as.matrix(param[j,2:ncol(param)])
-    }
-    hpNum[1,i] =  as.scalar(pVector[1,1])
-    if(as.scalar(pVector[1,1]) > 0)
-    {
-      p=1;
-      while(p <= as.integer(as.scalar(pVector[1,1]))) 
-      {
-        # print("check point 1")
-        count = 1;
-        kVector = matrix(0,as.scalar(pVector[1,4])/as.scalar(pVector[1,3]),1)
-        inner = as.scalar(pVector[1,2])
-        while(inner <= as.scalar(pVector[1,4]))
-        {
-          kVector[count,1] = inner;
-          inner = inner + as.scalar(pVector[1,3])
-          count = count+1
-        }
-        pVector[1,2] = 0; pVector[1,3]=0; pVector[1,4]=0;
-        pVector = removeEmpty(target = pVector, margin="cols")
-        p = p+1
-        if(sum(parameters) == 0){
-          parameters = rbind(parameters, matrix(0, nrow(kVector)-nrow(parameters), ncol(parameters)))
-          parameters = cbind(parameters, kVector)
-        }
-        else  parameters = getParaCombinations(parameters, kVector)
-      }
-    }
-  }
-  index = 1
-  parameters = removeEmpty(target = parameters, margin="cols")
-  parameters = rbind(parameters, matrix(0,1,ncol(parameters)))
-
-  for(i in 1:ncol(instance))
-  {
-    if(as.scalar(hpNum[1, i]) > 0)
-    {
-      L =  append(L, parameters[,index:(index+as.scalar(hpNum[1,i]))-1])
-      index = index+as.scalar(hpNum[1,i])
-    }
-    else 
-      L = append(L, matrix(-1, 1, 1))
-  } 
-}
-
-getParaCombinations = function(Matrix[Double] para, Matrix[Double] vec)
-return (Matrix[Double] para)
-{
-  v_temp = matrix(0,0,1)
-  p_temp = matrix(0,0,ncol(para))
-  for(i in 1:nrow(vec))
-  {  
-    v = matrix(as.scalar(vec[i,1]), nrow(para), 1)
-    v_temp = rbind(v_temp, v)
-    p_temp = rbind(p_temp,para)
-  }
-  para = cbind(p_temp, v_temp) 
-}
-
-isSpecial = function(String op)
-return(Boolean yes){
-  yes = (op == "mice")
-}
-
-getPipelineSum = function(List[Unknown] paraList, Boolean verbose)
-return (Double psum)
-{
-  for(i in 1:length(paraList))
-  { 
-    if(exists(as.matrix(paraList[i])))
-      psum = sum(as.matrix(paraList[i]))
-    else 
-      psum = 0.0
-  }  
-}
-
-# This function will compute the top k pipelines from the results of all executions
-##################################################################################
-getFinalTopK = function(Frame[String] pipeline, Matrix[Double] hparameter, Integer k)
-return (Frame[String] pipeline)
-{
-  if(nrow(pipeline) < k)
-    stop("the top k should be less than the total pipelines")
-  # combine all parameter i.e., operation and hyper-parameter values
-  allParam = cbind(pipeline, as.frame(hparameter))
-  # get the indexes of columns for recode transformation
-  idx = seq(1, ncol(pipeline))
-  index = vectorToCsv(idx)
-  # encoding categorical columns using recode transformation
-  jspecR = "{ids:true, recode:["+index+"]}";
-  [X, M] = transformencode(target=allParam, spec=jspecR);  
-  top = order(target=X, by=ncol(X), decreasing=TRUE, index.return=FALSE);
-  pipeline = transformdecode(target=top, spec=jspecR, meta=M);
-  # TODO if k+n pipelines have same accuracy then how to return k pipelines  
-  pipeline = pipeline[1:k,]
-}
-
-# These private function are used to impute values by mean and by median
-##################################################################################
-imputeByMean = function(Matrix[Double] X, Boolean verbose = FALSE)
-return(Matrix[Double] X)
-{
-  Mask = is.nan(X)
-  X = replace(target=X, pattern=NaN, replacement=0)
-  Mask = Mask * (colMeans(X))
-  X = X + Mask 
-}
-
-imputeByMedian = function(Matrix[Double] X,  Boolean verbose = FALSE)
-return(Matrix[Double] X)
-{
-  cols = ncol(X)
-  colMedian = matrix(0, 1, cols)
-  Mask = is.nan(X)
-  X = replace(target=X, pattern=NaN, replacement=0)
-  parfor(i in 1:cols)
-    colMedian[, i] = median(X[,i])
-  Mask = Mask * colMedian
-  X = X + Mask
-}
-
-
-# Function to evaluate the pipeline using classification accuracy
-##################################################################################
-fclassify = function(Matrix[Double] X)
-return (Double accuracy)
-{
-  if(min(X[,1]) < 1)
-    stop("Y should contain value greater than zero")
-  
-  n = nrow(X)
-  d = ncol(X)
-
-  temp = rand(rows=n, cols=1, min = 0, max = 1, sparsity=1) <= 0.3
-  tempI = temp == 0
-  sel = diag(temp)
-  selI = diag(tempI)
-  sel = removeEmpty(target = sel, margin = "rows")
-  selI = removeEmpty(target = selI, margin = "rows")
-  testSet = sel %*% X
-  trainSet = selI %*% X
-
-  nTrain = nrow(trainSet)
-  dTrain = ncol(trainSet)
-  nTest = nrow(testSet)
-  dTest = ncol(testSet)
-
-  train_X = trainSet[, 2:dTrain] 
-  train_Y = trainSet[, 1] 
- 
-  test_X = testSet[, 2:dTest]
-  test_Y = testSet[, 1]
- 
-  betas = multiLogReg(X=train_X, Y=train_Y, icpt=2, tol=1e-9, reg=1.2, maxi=100, maxii=0, verbose=FALSE)
-  [prob, yhat, accuracy] = multiLogRegPredict(test_X, betas, test_Y, FALSE)
-}
-
-# Enumeration call
-##################################################################################
-X = read($1, data_type="matrix", format="csv", header=TRUE);
-Y = X[,ncol(X)]+1
-X = X[,1:ncol(X)-1]
-L = read($2, data_type="frame", format="csv");
-OP = read($3, data_type="frame", format="csv");
-MVIP = read($4, data_type="frame", format="csv");
-param = read($5, data_type="frame", format="csv");
-R = enumerator(X, Y, L, OP, MVIP, param, 5, TRUE);
-write(R, $6, format="csv", sep=",")
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/gridsearchMLR.dml b/scripts/pipelines/scripts/gridsearchMLR.dml
deleted file mode 100644
index 23299cf..0000000
--- a/scripts/pipelines/scripts/gridsearchMLR.dml
+++ /dev/null
@@ -1,82 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the  License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-gridSearchMLR = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] Xtest,
-    Matrix[Double] ytest, String train, String predict,	List[String] params, List[Unknown] paramValues,
-    Boolean verbose = TRUE) 
-
-  return (Matrix[Double] opt, Matrix[Double] Rloss) 
-{
-  # replace null with zeros
-  Xtrain = replace(target=Xtrain, pattern=NaN, replacement=0)
-  Xtest = replace(target=Xtest, pattern=NaN, replacement=0)
-  # Step 0) preparation of parameters, lengths, and values in convenient form
-  numParams = length(params);
-  paramLens = matrix(0, numParams, 1);
-  for( j in 1:numParams ) {
-    vect = as.matrix(paramValues[j,1]);
-    paramLens[j,1] = nrow(vect);
-  }
-  paramVals = matrix(0, numParams, max(paramLens));
-  for( j in 1:numParams ) {
-    vect = as.matrix(paramValues[j,1]);
-    paramVals[j,1:nrow(vect)] = t(vect);
-  }
-  cumLens = rev(cumprod(rev(paramLens))/rev(paramLens));
-  numConfigs = prod(paramLens);
-  
-  # Step 1) materialize hyper-parameter combinations 
-  # (simplify debugging and compared to compute negligible)
-  HP = matrix(0, numConfigs, numParams);
-  parfor( i in 1:nrow(HP) ) {
-    for( j in 1:numParams )
-      HP[i,j] = paramVals[j,as.scalar(((i-1)/cumLens[j,1])%%paramLens[j,1]+1)];
-  }
-
-  if( verbose )
-    print("GridSeach: Hyper-parameter combinations: \n"+toString(HP));
-
-  # Step 2) training/scoring of parameter combinations
-  # TODO integrate cross validation
- 
-  Rloss = matrix(0, nrow(HP), 3);
-  arguments1 = list(X=Xtrain, Y=ytrain, icpt=1, reg=-1, tol=1e-9, maxi=-1, maxii=0, verbose=FALSE);
-
-  parfor( i in 1:nrow(HP)) {
-    # a) replace training arguments
-    largs1 = arguments1;
-
-    for( j in 1:numParams ) {
-      largs1[as.scalar(params[j])] = as.scalar(HP[i,j]);
-    }
-    # b) core training/scoring and write-back
-    # TODO investigate rmvar handling with explicit binding (lbeta)
-    Rbeta1 = eval(train, largs1);
-    Rloss[i,1] = eval(predict, list(Xtest, ytest, Rbeta1));
-  }
-
-  # Step 3) select best parameter combination
-  ix = as.scalar(rowIndexMin(t(Rloss[,2])));
-  opt = HP[ix,]; # optimal hyper-parameters
- 
-}
-
-
diff --git a/scripts/pipelines/scripts/logicalFunc.dml b/scripts/pipelines/scripts/logicalFunc.dml
deleted file mode 100644
index a984273..0000000
--- a/scripts/pipelines/scripts/logicalFunc.dml
+++ /dev/null
@@ -1,114 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-# Generate the logical pipelines for data cleaning
-
-source("scripts/pipelines/scripts/utils.dml") as utils;
-
-# incomplete implementation of automatic logical pipelines
-generateLogicalSeed = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
-return(Frame[String] logical){
-  
-  logical = frame(data=["NULL"], rows=1, cols=1, schema=["STRING"])
-  no_of_mv = sum(is.na(X))
-  X = replace(target= X, pattern = NaN, replacement = 0)
-  
-  # get the stats
-  minVal = min(X)
-  maxVal = max(X)
-  colMin = colMins(X)
-  colMax = colMaxs(X)
-  distinctCategories = colMaxs(X) * mask
-  colMean = colMeans(X)
-  colSd = colSds(X)
-  count3sdplus = sum(X > (colMean + 3*colSd )) 
-  count3sdminus = sum(X < (colMean - 3*colSd )) 
-  outliers = count3sdplus + count3sdminus
-  minCat = 0.0 # initialize variables
-  maxCat = 0.0
-  if(target != "compare") {
-    ctab = table(Y, 1)
-    minCat = min(ctab)
-    maxCat = max(ctab)
-  }
-  mv_to_data_ratio = no_of_mv/(nrow(X) * ncol(X))
-  out_to_data_ratio = outliers/ (nrow(X) * ncol(X))
-  
-  if(no_of_mv > 0)
-    logical = cbind(logical, as.frame("MVI"))
-  if(out_to_data_ratio > 0.1)
-    logical = cbind(logical, as.frame("OTLR"))
-  if(target != "compare") {
-    if(maxVal - minVal > 1000 )
-      logical = cbind(logical, as.frame("SCALE"))
-    if((maxCat - minCat) > (minCat/2))
-      logical = cbind(logical, as.frame("CI"))
-    if(sum(mask) > 0) {
-      logical = cbind(logical, as.frame("DUMMY"))
-      if(sum(distinctCategories) > 5*ncol(X))
-        logical = cbind(logical, as.frame("DIM"))
-    }
-  }
-   
-  if(ncol(logical) == 1)
-    logical = frame(["OTLR", "MVI"], rows=1, cols=2, schema=["STRING", "STRING"])
-  else
-    logical = logical[, 2:ncol(logical)]
-}
-
-
-
-transformLogical = function(Frame[String] seed)
-return(Frame[Unknown] transformLogical) {
-  transformLogical = frame(0, rows=3, cols= ncol(seed)+2)
- 
-  # case 1: MVI and OTLR
-  if(ncol(seed) > 1)
-  {
-    if(as.scalar(seed[1,1]) == "MVI" & as.scalar(seed[1,2]) == "OTLR") {
-      # t1: swap MV and OTLR 
-      transformLogical[2,1] = seed[1,2]
-      transformLogical[2,2] = seed[1,1]
-      transformLogical[2, 3:ncol(seed)] = seed[1,3:ncol(seed)]
-  
-    
-      # t2: if the sequence is MVI, OTLR then introduce an MVI after to avoid null
-      transformLogical[3,1:2] = seed[1,1:2]
-      transformLogical[3,3] = seed[1,1]
-      transformLogical[3, 4:ncol(seed)] = seed[1,3:ncol(seed)]
-    }
-    # case 2: OTLR
-    else if(as.scalar(seed[1, 1]) == "OTLR" & as.scalar(seed[1, 2]) != "MVI" )
-    {
-      # if first operation is OTLR then add a MVI to fill in MVs introduced by OTLR
-      transformLogical[2,1] = seed[1, 1]
-      transformLogical[2,2] = "MVI"
-      transformLogical[2, 3:ncol(seed)] = seed[1,2:ncol(seed)]
-    }
-  }
-  transformLogical[1, 1:ncol(seed)] = seed
-  transformLogical = map(transformLogical, "var -> var.replace(\"0\", \"\")")
-  transformLogical = utils::frameRemoveEmpty(target=transformLogical, marginParam="cols", select=as.matrix(0))
-  if(nrow(transformLogical) > 1)
-    transformLogical = utils::frameRemoveEmpty(target=transformLogical, marginParam="rows", select=as.matrix(0))
-
-}
-
-
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 17186ab..df84a1c 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -88,6 +88,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio)
     eY = out[, 1]
     eX = out[, 2:ncol(out)]
   }
+  print("AFTER SAMPLES "+nrow(eX))
 }
 
 # #######################################################################
@@ -120,9 +121,8 @@ classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Do
   Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
   return (Double accuracy)
 {
-  # # classify without cleaning fill with edfault values 1
-  Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
-  dX_train = dummycoding(Xtrain, mask)
+  if(sum(mask) > 0)
+    Xtrain = dummycoding(Xtrain, mask)
   [accuracy, T] = bandit::crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
   accuracy = mean(accuracy)
   print("cross validated dirty accuracy "+accuracy)
@@ -169,3 +169,66 @@ return(Boolean validForResources)
   validForResources = count > 0
 }
 
+#####################################
+# Create a pipeline for string processing that needs to be applied before data recoding.
+# The pipeline will drop invalid types and transform cases; deduplication and pattern outlier removal are still TODO.
+######################################
+stringProcessing = function(Frame[Unknown] data, Matrix[Double] mask, Frame[String] schema)
+return(Frame[Unknown] processedData)
+{
+  n = nrow(data)
+  
+  # step 1 drop invalid types
+  data = dropInvalidType(data, schema)
+  
+  # step 2 do the case transformations
+  for(i in 1:ncol(mask))
+  {
+    if(as.scalar(schema[1,i]) == "STRING")
+    {
+      lowerCase = map(data[, i], "x -> x.toLowerCase()")
+      data[, i] = lowerCase
+    }
+  }
+  # TODO add deduplication
+  processedData = data
+
+}
+
+getOpByTarget = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] mask, String target)
+return (Matrix[Double] bestOpt)
+{
+  opt = as.frame(0)
+  if(target == "multiLogReg")
+  {
+    params = list("icpt", "reg", "maxii");
+    paramRanges = list(10^seq(0,-4), 10^seq(1,-6), 10^seq(1,3));
+
+    if(sum(mask) > 0)
+      X = dummycoding(replace(target = X, pattern = NaN, replacement=0), mask)
+      
+    trainArgs = list(X=X, Y=y, icpt=-1, reg=-1, tol=1e-9, maxi=100, maxii=-1, verbose=FALSE);
+    [B1,opt] = gridSearch(X=X, y=y, train="multiLogReg", predict="accuracy", numB=ncol(X)+1,
+      params=params, paramValues=paramRanges, trainArgs=trainArgs, verbose=FALSE);
+
+  }
+  else if(target == "lm")
+  {
+    params = list("reg", "tol", "maxi");
+    paramRanges = list(10^seq(0,-4), 10^seq(1,-6), seq(10,100,10));
+
+    if(sum(mask) > 0)
+      X = dummycoding(replace(target = X, pattern = NaN, replacement=0), mask)
+      
+    trainArgs = list(X=X, y=y, icpt=0, reg=-1, tol=-1, maxi=-1, verbose=FALSE);
+    [B1, opt] = gridSearch(X=X, y=y, train="lm", predict="l2norm", 
+      numB=ncol(X), params=params, paramValues=paramRanges, trainArgs=trainArgs, verbose=FALSE);
+  }
+  else
+    print("getOptByTarget: target type not supported. Expected multiLogReg or lm found: "+target)
+  
+  bestOpt = as.matrix(opt)
+}
+
+
+
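
A short self-contained sketch of calling the new stringProcessing function before
recoding (the toy frame, schema, and mask are made up; the pipeline tests below
apply the same pattern to the census data):

  source("scripts/pipelines/scripts/utils.dml") as utils;
  F = frame(["Abc", "1", "DEF", "2"], rows=2, cols=2)
  schema = frame(["STRING", "INT"], rows=1, cols=2)
  mask = matrix("1 0", rows=1, cols=2)
  X = utils::stringProcessing(F, mask, schema)  # drops invalid types, lower-cases strings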
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
index 7c4871f..8144d89 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
@@ -28,6 +28,7 @@ import org.junit.Ignore;
 import org.junit.Test;
 
 public class CleaningTestClassification extends AutomatedTestBase {
+//	private final static String TEST_NAME1 = "prioritized";
 	private final static String TEST_NAME1 = "testClassification";
 	private final static String TEST_NAME2 = "compareAccuracy";
 	private final static String TEST_CLASS_DIR = SCRIPT_DIR + CleaningTestClassification.class.getSimpleName() + "/";
@@ -39,10 +40,11 @@ public class CleaningTestClassification extends AutomatedTestBase {
 	private final static String CLEAN = DATA_DIR+ "clean.csv";
 	private final static String META = RESOURCE+ "meta/meta_census.csv";
 	private final static String OUTPUT = RESOURCE+"intermediates/";
+	private final static String LOGICAL = RESOURCE+"intermediates/logical.csv";
 
 	private static final String PARAM_DIR = "./scripts/pipelines/properties/";
 	private final static String PARAM = PARAM_DIR + "param.csv";
-	private final static String PRIMITIVES = PARAM_DIR + "primitives.csv";
+	private final static String PRIMITIVES = PARAM_DIR + "testPrimitives.csv";
 
 	@Override
 	public void setUp() {
@@ -50,10 +52,16 @@ public class CleaningTestClassification extends AutomatedTestBase {
 		addTestConfiguration(TEST_NAME2,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2,new String[]{"R"}));
 	}
 
-	@Ignore
+	@Test
 	public void testFindBestPipeline() {
-		runFindPipelineTest(0.1, 5,10, 2,
-			true, "classification", Types.ExecMode.SINGLE_NODE);
+		runFindPipelineTest(0.1, 5,5, 2,
+			true, "multiLogReg", Types.ExecMode.SINGLE_NODE);
+	}
+
+	@Ignore
+	public void testFindBestPipelineHybrid() {
+		runFindPipelineTest(0.1, 5,5, 2,
+			true, "multiLogReg", Types.ExecMode.HYBRID);
 	}
 
 	@Test
@@ -64,16 +72,17 @@ public class CleaningTestClassification extends AutomatedTestBase {
 	private void runFindPipelineTest(Double sample, int topk, int resources, int crossfold,
 		boolean weightedAccuracy, String target, Types.ExecMode et) {
 
-		setOutputBuffering(true);
+//		setOutputBuffering(true);
 		String HOME = SCRIPT_DIR+"functions/pipelines/" ;
 		Types.ExecMode modeOld = setExecMode(et);
 		try {
 			loadTestConfiguration(getTestConfiguration(TEST_NAME1));
 			fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
 			programArgs = new String[] {"-stats", "-exec", "singlenode", "-nvargs", "dirtyData="+DIRTY,
-				"metaData="+META, "primitives="+PRIMITIVES, "parameters="+PARAM, "sampleSize="+ sample,
-				"topk="+ topk, "rv="+ resources, "cv="+ crossfold, "weighted="+ weightedAccuracy,
-				"output="+OUTPUT, "target="+target, "cleanData="+CLEAN, "O="+output("O")};
+				"metaData="+META, "primitives="+PRIMITIVES, "parameters="+PARAM, "logical="+LOGICAL,
+				"sampleSize="+ sample, "topk="+ topk, "rv="+ resources, "cv="+ crossfold,
+				"weighted="+ weightedAccuracy, "output="+OUTPUT, "target="+target, "cleanData="+CLEAN,
+				"O="+output("O")};
 
 			runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
 
@@ -105,4 +114,4 @@ public class CleaningTestClassification extends AutomatedTestBase {
 			resetExecMode(modeOld);
 		}
 	}
-}
+}
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
index d7160be..fbb253e 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
@@ -62,7 +62,7 @@ public class CleaningTestCompare extends AutomatedTestBase {
 			loadTestConfiguration(getTestConfiguration(TEST_NAME1));
 			fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
 			programArgs = new String[] {"-stats", "-exec", "singlenode", "-nvargs", "dirtyData="+DIRTY,
-				"metaData="+META, "primitives="+PRIMITIVES, "parameters="+PARAM,  "topk="+ topk, "rv="+ resources,
+				"metaData="+META, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources,
 				"output="+OUTPUT, "target="+target, "cleanData="+CLEAN, "O="+output("O")};
 
 			runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestLogical.java b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestLogical.java
index 69909b7..03fc9f9 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestLogical.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestLogical.java
@@ -24,7 +24,6 @@ import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 public class CleaningTestLogical extends AutomatedTestBase {
@@ -35,12 +34,12 @@ public class CleaningTestLogical extends AutomatedTestBase {
 	private static final String DATA_DIR = DATASET_DIR+ "pipelines/";
 
 	private final static String DIRTY = DATA_DIR+ "dirty.csv";
-	private final static String CLEAN = DATA_DIR+ "clean.csv";
 	private final static String META = RESOURCE+ "meta/meta_census.csv";
 
 	private static final String PARAM_DIR = "./scripts/pipelines/properties/";
 	private final static String PARAM = PARAM_DIR + "param.csv";
-	private final static String PRIMITIVES = PARAM_DIR + "primitives.csv";
+	private final static String PRIMITIVES = PARAM_DIR + "testPrimitives.csv";
+	private final static String OUTPUT = RESOURCE+"intermediates/logical.csv";
 
 	@Override
 	public void setUp() {
@@ -49,18 +48,24 @@ public class CleaningTestLogical extends AutomatedTestBase {
 
 	@Test
 	public void testLogical1() {
-		runTestLogical(2, 10, 2, 2, 2, 2,
-			"classification", Types.ExecMode.SINGLE_NODE);
+		runTestLogical(10,  4, 2, 2,
+			"multiLogReg", Types.ExecMode.SINGLE_NODE);
 	}
 
-	@Ignore
-	public void testLogicalSP() {
-		runTestLogical(3, 10, 3, 2, 2, 4,
-			"classification", Types.ExecMode.SPARK);
+	@Test
+	public void testLogical2() {
+		runTestLogical(2,  3, 3, 2,
+			"multiLogReg", Types.ExecMode.SINGLE_NODE);
+	}
+
+	@Test
+	public void testLogicalHybrid() {
+		runTestLogical(3,  3, 2, 2,
+			"multiLogReg", Types.ExecMode.HYBRID);
 	}
 
-	private void runTestLogical(int max_iter, int pipelineLength, int crossfold,
-		int num_inst, int num_exec, int n_pop, String target, Types.ExecMode et) {
+	private void runTestLogical(int max_iter, int crossfold,
+		int num_inst, int num_exec,  String target, Types.ExecMode et) {
 
 		//		setOutputBuffering(true);
 		String HOME = SCRIPT_DIR+"functions/pipelines/" ;
@@ -70,8 +75,8 @@ public class CleaningTestLogical extends AutomatedTestBase {
 			fullDMLScriptName = HOME + TEST_NAME + ".dml";
 			programArgs = new String[] {"-stats", "-exec", "singlenode", "-nvargs", "dirtyData="+DIRTY,
 				"metaData="+META, "primitives="+PRIMITIVES, "parameters="+PARAM, "max_iter="+ max_iter,
-				 "pipLength="+ pipelineLength, "cv="+ crossfold, "num_inst="+ num_inst, "num_exec="+ num_exec,
-				"n_pop="+ n_pop,"target="+target, "cleanData="+CLEAN, "O="+output("O")};
+				 "cv="+ crossfold, "num_inst="+ num_inst, "num_exec="+ num_exec,
+				"target="+target, "output="+OUTPUT, "O="+output("O")};
 
 			runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
 
diff --git a/src/test/scripts/functions/builtin/GridSearchMLogreg.dml b/src/test/scripts/functions/builtin/GridSearchMLogreg.dml
index 33bd9f5..ec2bf9d 100644
--- a/src/test/scripts/functions/builtin/GridSearchMLogreg.dml
+++ b/src/test/scripts/functions/builtin/GridSearchMLogreg.dml
@@ -21,7 +21,7 @@
 
 accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
   [M,yhat,acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=TRUE);
-  err = as.matrix(1-acc);
+  err = as.matrix(1-(acc/100));
 }
 
 X = read($1);
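
Background on the accuracy fix above: multiLogRegPredict reports accuracy as a
percentage in [0,100], so the error to be minimized is 1-(acc/100); the previous
1-acc turned negative for any accuracy above 1%.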
diff --git a/src/test/scripts/functions/pipelines/compareAccuracy.dml b/src/test/scripts/functions/pipelines/compareAccuracy.dml
index 22165c6..ba8df06 100644
--- a/src/test/scripts/functions/pipelines/compareAccuracy.dml
+++ b/src/test/scripts/functions/pipelines/compareAccuracy.dml
@@ -58,12 +58,11 @@ getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
 getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
 
-# # 1. dropInvalid function will remove the values which are not the part 
+# # dropInvalid function will remove the values that are not part 
 # # of the column data type  
-# print("check 1")
-X = dropInvalidType(F, getSchema)
+X = utils::stringProcessing(F, getMask, getSchema)
 
-# 2. encode the categorical data
+# encode the categorical data
 if(sum(getMask) > 0)
 {
   # recode the dirty data, always recode the label
@@ -75,7 +74,7 @@ if(sum(getMask) > 0)
 else
   eX = as.matrix(X)
 
-# # 3. extract the class label  
+# extract the class label  
 eY = eX[, ncol(eX)]
 eX = eX[, 1:ncol(eX) - 1]
 
@@ -85,9 +84,6 @@ getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
 getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
 
-# construct hyper-parameters
-ls = list();
-i = 1; k = 1
 
 FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
 FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
@@ -115,19 +111,19 @@ if(sum(getMask) > 0)
 else 
   oX = as.matrix(O)
 
-# # 3. extract the class label  
+# # extract the class label of the original data
 oY = oX[, ncol(oX)]
 oX = oX[, 1:ncol(oX) - 1]
 
 
 
 # do the k cross validations for original clean data
-accuracyMatrix = bandit::crossV(oX, oY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(oX, oY, 3, as.matrix(0), matrix("0 0.000001 100", rows=1, cols=3), TRUE)
 accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
 oAcc = mean(accuracyMatrix)
 
 # do the k cross validations for cleaned data
-accuracyMatrix = bandit::crossV(cX, cY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(cX, cY, 3, as.matrix(0), matrix("0 0.000001 100", rows=1, cols=3), TRUE)
 accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
 cAcc = mean(accuracyMatrix)
 tol = 1
diff --git a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
index 85972c6..2e19fb1 100644
--- a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
@@ -1,5 +1,5 @@
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,68.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,86.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,67.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,102.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,104.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,51.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/logical.csv b/src/test/scripts/functions/pipelines/intermediates/logical.csv
new file mode 100644
index 0000000..d3be4ee
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/intermediates/logical.csv
@@ -0,0 +1 @@
+MVI,OTLR,DUMMY,DIM
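Each token in this new intermediate names one logical cleaning category that the bandit optimizer expands into concrete primitives; judging from pipelines.csv below, MVI maps to the imputation builtins, DUMMY to dummycoding, and DIM to m_pca, while the OTLR reading (outlier handling) is an assumption. testClassification.dml consumes the file as a plain frame, as a later hunk shows:

    # one row, one category per cell: MVI,OTLR,DUMMY,DIM
    logical = read($logical, data_type="frame", format="csv", header=FALSE)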
diff --git a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
index e9a6697..f798164 100644
--- a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
@@ -1,5 +1,5 @@
-imputeByMedian,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMean,scale,dummycoding,pca
-imputeByMean,scale,dummycoding,pca
+imputeByMean,scale,dummycoding,m_pca
+imputeByMean,scale,dummycoding,m_pca
+imputeByMedian,scale,dummycoding,m_pca
+imputeByMedian,scale,dummycoding,m_pca
+imputeByMean,scale,dummycoding,m_pca
diff --git a/src/test/scripts/functions/pipelines/testClassification.dml b/src/test/scripts/functions/pipelines/testClassification.dml
index 45c6761..f83bea3 100644
--- a/src/test/scripts/functions/pipelines/testClassification.dml
+++ b/src/test/scripts/functions/pipelines/testClassification.dml
@@ -21,9 +21,6 @@
 # Generate the logical pipelines for data cleaning
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
-source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
-source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
-
 
 # read the inputs
 F = read($dirtyData, data_type="frame", format="csv", header=TRUE, 
@@ -32,14 +29,14 @@ F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
 metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
 primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
 param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+logical = read($logical, data_type = "frame", format="csv", header= FALSE)
 sample = $sampleSize
 topK = $topk
 resources = $rv
 crossValidations = $cv
 weightedAccuracy = $weighted # accuracy flag
 targetApplicaton = $target # target application
-
-
+output = $output
 
 if(nrow(metaInfo) < 2)
   stop("incomplete meta info")
@@ -49,9 +46,9 @@ getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
 getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
   
-# 1. dropInvalid function will remove the values which are not the part 
-# of the column data type  
+# 1. stringProcessing removes values that are not part
+# of the column data type and converts the data to lowercase
 
-X = dropInvalidType(F, getSchema)
+X = utils::stringProcessing(F, getMask, getSchema)
 
 # 2. encode the categorical data
 if(sum(getMask) > 0)
@@ -78,63 +75,39 @@ getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
 
 
-# get the logical seed
-lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
-allLgs = logical::transformLogical(lgSeed)
-
-d_accuracy = 0
 # 4. perform the sampling
-
 [eX, eY] = utils::doSample(eX, eY, sample)
 
-# 5. get train test and validation set with balanced class distribution
-[X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, splitRatio=0.7, verbose=FALSE)
-
-# 6. find the best hyper parameters for classification algorithm
+# 5. find the best hyper parameters for classification algorithm
 # for now only find the best values for intercept and maximum outer iterations
-params = list("reg", "maxi");
-paramRanges = list(10^seq(0,-10), seq(10,100, 10));
-if(sum(getMask) > 0)
+opt = utils::getOpByTarget(eX, eY, getMask, targetApplicaton)
+
+# 6. get the cross validated accuracy on dirty dataset (only on training set)
+d_accuracy = 0
+d_accuracy = utils::classifyDirty(eX, eY, opt, getMask, weightedAccuracy, crossValidations)
+
+if(sum(getFdMask) > 0)
 {
-  dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern = NaN, replacement=0), getMask)
-  dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),] 
-  dX_train = dX_train[1:nrow(y_train),] 
-  [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test, 
-  "multiLogReg", "lossFunc", params, paramRanges, FALSE);
- }
-else  
-  [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test, 
-    "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-
-# as I am testing on CV not on holdout train/test
-X_train = eX
-y_train = eY
-# 7. get the cross validated accuracy on dirty dataset (only on training set)
-d_accuracy = utils::classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, crossValidations)
-
-FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
-FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
-FD = FD > 0
+  FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
+  FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
+  FD = FD > 0
+}
+FD = as.matrix(0)
 
 metaList = list(mask=getMask, schema=getSchema, fd=FD)
 targetClassification = list(target=targetApplicaton, cv=crossValidations, wAccuracy=weightedAccuracy, 
   dirAcc = d_accuracy, mlHp = opt, cleanData = as.matrix(0))
 
 # # initialize output variables
+# 7. call the optimizer
 pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = as.frame("NULL")
-
-
-output = $output
-
-[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY,  metaList=metaList, targetList=targetClassification, lp=allLgs[1,],
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY,  metaList=metaList, targetList=targetClassification, lp=logical,
   primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
 
 
-
 if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
   stop("warning: no best pipeline found")
   
-
 print("best pipelines")
 print(toString(pip))
 
@@ -146,11 +119,9 @@ print(toString(acc[1, 1]))
 
 
 clean_accuracy = max(acc[1,1])
-
-
+# 8. compare results
 result = d_accuracy < clean_accuracy  
-print("result satisfied ------------"+result)
-
+print("result satisfied: "+result)
 
 write(pip, output+"/pipelines.csv", format="csv")
 write(hp, output+"/hyperparams.csv", format="csv")
@@ -160,10 +131,11 @@ write(accuracies , output+"/BestAccuracy.csv", format="csv")
 write(features, output+"/features.csv", format="csv")
 write(result , $O)
 
-lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
+accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
 return (Matrix[Double] loss) {
   [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y,  verbose=FALSE)
   loss = as.matrix(1 - (acc/100))
-  # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
 }
 
+
+
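Taken together, the refactored driver in testClassification.dml reduces to the following call sequence (a condensed sketch assembled from the hunks above; every helper name appears in this patch):

    X = utils::stringProcessing(F, getMask, getSchema)    # string cleanup before encoding
    [eX, eY] = utils::doSample(eX, eY, sample)            # sampling
    opt = utils::getOpByTarget(eX, eY, getMask, targetApplicaton)
    d_accuracy = utils::classifyDirty(eX, eY, opt, getMask, weightedAccuracy, crossValidations)
    [pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList,
      targetList=targetClassification, lp=logical, primitives=primitives,
      param=param, k=topK, R=resources, verbose=TRUE)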
diff --git a/src/test/scripts/functions/pipelines/testCompare.dml b/src/test/scripts/functions/pipelines/testCompare.dml
index 019df79..1f0e663 100644
--- a/src/test/scripts/functions/pipelines/testCompare.dml
+++ b/src/test/scripts/functions/pipelines/testCompare.dml
@@ -18,11 +18,7 @@
 # under the License.
 #
 #-------------------------------------------------------------
-# Generate the logical pipelines for data cleaning
-
-
-source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
-
+source("scripts/pipelines/scripts/utils.dml") as utils;
 # read the inputs
 F = read($dirtyData, data_type="frame", format="csv", header=TRUE, 
   naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
 targetApplicaton = $target # target application
 cleanData = read($cleanData, data_type="frame", format="csv", header=TRUE, 
   naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
 
-
+logical = as.frame("MVI")
   
 # take the sample of 500 rows to avoid java heap issue
 
@@ -47,20 +43,19 @@ metaInfo = metaInfo[, 1:21]
 if(nrow(metaInfo) < 2)
   stop("incomplete meta info")
 
- # Do the initial cleaning
- 
- 
+# construct meta vectors
 getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
 getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
-  
-# 1. dropInvalid function will remove the values which are not the part 
-# of the column data type  
 
-X = dropInvalidType(F, getSchema)
+# string processing is skipped here because it would conflict with the original
+# data: stringProcessing lower-cases values, so comparing the cleaned values
+# against the original data would no longer be possible
+# X = utils::stringProcessing(F, getMask, getSchema)
 
+X = F
 
-# 2. encode the categorical data
+# encode the categorical data
 if(sum(getMask) > 0)
 {
   # always recode the label
@@ -84,12 +79,6 @@ else {
 }
   
 
-# get the logical seed
-lgSeed = logical::generateLogicalSeed(eX, as.matrix(0), getMask, targetApplicaton)
-allLgs = logical::transformLogical(lgSeed)
-
-
-
 FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
 FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
 FD = FD > 0
@@ -97,15 +86,14 @@ FD = FD > 0
 expectedAccuracy = 0.5
 
 metaList = list(mask=getMask, schema=getSchema, fd=FD)
-targetClassification = list(target=targetApplicaton, cv=0, wAccuracy=FALSE, 
+targetCompare = list(target=targetApplicaton, cv=0, wAccuracy=FALSE, 
   dirAcc = expectedAccuracy,  mlHp = as.matrix(0), cleanData = cleanX)
 
 
 # # initialize output variables
 pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = as.frame("NULL")
-
-[pip, hp, acc, features] = bandit(X_train=eX, Y_train=as.matrix(0),  metaList=metaList, targetList=targetClassification, 
-  lp=allLgs, primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=as.matrix(0),  metaList=metaList, targetList=targetCompare, 
+  lp=logical, primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
 
 
 output = $output
@@ -133,9 +121,4 @@ print("result satisfied ------------"+result)
 
 accuracies = cbind(as.matrix(expectedAccuracy), as.matrix(clean_accuracy))
 
-
-# write(pip, output+"/pipelines.csv", format="csv")
-# write(hp, output+"/hyperparams.csv", format="csv")
-# write(acc, output+"/accuracies.csv", format="csv")
-# write(accuracies , output+"/BestAccuracy.csv", format="csv")
 write(result , $O)
\ No newline at end of file
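For the compare test the logical plan is not enumerated but pinned to a single category, and the raw frame is passed through unmodified; a condensed view of the setup (all names taken from this file's hunks):

    logical = as.frame("MVI")    # search only missing-value imputation primitives
    X = F                        # stringProcessing intentionally skipped, see comment above
    targetCompare = list(target=targetApplicaton, cv=0, wAccuracy=FALSE,
      dirAcc=expectedAccuracy, mlHp=as.matrix(0), cleanData=cleanX)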
diff --git a/src/test/scripts/functions/pipelines/testLogical.dml b/src/test/scripts/functions/pipelines/testLogical.dml
index d0c7bf5..b1efb9d 100644
--- a/src/test/scripts/functions/pipelines/testLogical.dml
+++ b/src/test/scripts/functions/pipelines/testLogical.dml
@@ -21,8 +21,6 @@
 # Generate the logical pipelines for data cleaning
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
-source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
-source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
 source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
 
 
 targetApplicaton = $target # target application
 max_iter = $max_iter
 num_inst = $num_inst
 num_exec = $num_exec
-n_pop=$n_pop
-pipLength = $pipLength
 crossValidations = $cv
-
+output = $output
 
 getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
 getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
   
-
 # encode the categorical data
 if(sum(getMask) > 0)
 {
@@ -73,8 +68,8 @@ getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
 getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
 # hyperparam for classifier
-opt = matrix("0 100", rows=1, cols=2)
-
+opt = utils::getOpByTarget(eX, eY, getMask, targetApplicaton)
+print("opt "+toString(opt))
 # get the cross validated accuracy on dirty dataset (only on training set)
 d_accuracy = 0
 d_accuracy = utils::classifyDirty(eX, eY, opt, getMask, weightedAccuracy, crossValidations)
@@ -92,31 +87,38 @@ targetClassification = list(target=targetApplicaton, cv=crossValidations, wAccur
 pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = as.frame("NULL")
 
 
-logical1 =  frame(["4", "MVI", "SCALE", "DUMMY", "DIM", "0", "0", "0"], rows=1, cols=8)
-# logical2 =  frame(["2", "MVI", "DUMMY", "0", "0", "0", "0", "0"], rows=1, cols=8)
-logical3 =  frame(["3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0"], rows=1, cols=8)
-logical4 =  frame(["6", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
-logical5 =  frame(["7", "MVI", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM"], rows=1, cols=8)
-logical6 =  frame(["6", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
+logical =  frame([
+                   "1", "MVI", "0", "0", "0", "0", 
+                   # "1", "OTLR", "0", "0", "0", "0", 
+                   # "1", "CI", "0", "0", "0", "0", 
+                   # "2", "MVI", "CI", "0", "0", "0", 
+                   "2", "MVI", "OTLR", "0", "0", "0",
+                   "2", "MVI", "SCALE", "0", "0", "0", 
+                   "3", "MVI", "SCALE", "OTLR", "0", "0"
+                   # "4", "MVI", "OTLR", "CI", "SCALE", "0", 
+                   # "4", "OTLR", "MVI", "CI", "SCALE", "0",
+                   # "5", "MVI", "OTLR", "MVI", "CI", "SCALE"
+                   ], rows=4, cols=6)
 
-# log = rbind(logical1, logical2)
-log = rbind(logical1, logical3)
-log = rbind(log, logical4)
-log = rbind(log, logical5)
-log = rbind(log, logical6)
 
-[logicalEnum, score, T] = lg::enumerateLogical(X=eX, y=eY, population=log, max_iter=max_iter, pipLength=pipLength, metaList=metaList,
-  targetList=targetClassification, primitives=primitives, param=param, num_inst=num_inst, num_exec=num_exec, n_pop=n_pop, verbose=FALSE)
-# [logicalEnum, score, T] = lg::enumerateLogical(X=eX, y=eY, population=log, max_iter=3, pipLength=10, metaList=metaList,
-  # targetList=targetClassification, primitives=primitives, param=param, num_inst=4, num_exec=2, n_pop=4, verbose=FALSE)
+categories = frame(["MVI", "OTLR", "SCALE"], rows=1, cols=3)
+cmr = matrix("4 0.7 2", rows=1, cols=3)
+[bestLogical, score, T] = lg::enumerateLogical(X=eX, y=eY, cmr=cmr, cat=categories, population=logical,
+  max_iter=max_iter, metaList=metaList, targetList=targetClassification, primitives=primitives, param=param,
+  num_inst=num_inst, num_exec=num_exec, isTailed=TRUE, verbose=TRUE)
 
 print("score of pipeline: "+toString(score)+" in "+(T/60000)+" mins")
-print("logicalENum "+toString(logicalEnum))
+print("bestLogical "+toString(bestLogical))
 
 result = d_accuracy < score  
 print("result satisfied ------------"+result)
 
 write(result , $O)
+write(bestLogical , output, format="csv")
 
-
+accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
+return (Matrix[Double] loss) {
+  [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y,  verbose=FALSE)
+  loss = as.matrix(1 - (acc/100))
+}
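The bestLogical frame written above is the same artifact that testClassification.dml reads via $logical earlier in this patch, which closes the loop between logical enumeration and physical pipeline search; a minimal round trip (the concrete file location is illustrative only):

    # testLogical.dml side: persist the winning logical pipeline
    write(bestLogical, output, format="csv")
    # testClassification.dml side: consume it as the starting plan
    logical = read($logical, data_type="frame", format="csv", header=FALSE)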