You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/09/02 22:16:14 UTC

[systemds] 02/02: [SYSTEMDS-3115, 3120] Implements cleaning pipeline enumeration scripts

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 3d1ba3c3b53f48d971c358a99f7c8b3e0b609c2b
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Fri Sep 3 00:15:05 2021 +0200

    [SYSTEMDS-3115,3120] Implements cleaning pipeline enumeration scripts
    
    * Parfor parallelization of logical pipeline enumeration
    * Various vectorizations in the correctTypos builtin
---
 scripts/builtin/correctTypos.dml               | 51 +++++---------------------
 scripts/pipelines/scripts/enumerateLogical.dml | 25 ++++++-------
 2 files changed, 21 insertions(+), 55 deletions(-)

diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 9f95a4d..45d3861 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -123,11 +123,7 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
           A = ascii_matrix[1:as.scalar(lengths[i,1]), i];
           B = ascii_matrix[1:as.scalar(lengths[j,1]), j];
           d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE);
-          if (d == -1) {
-            distance_matrix[i, j] = 42000;
-          } else {
-            distance_matrix[i, j] = d;
-          }
+          distance_matrix[i, j] = ifelse(d == -1, 42000, d);
         }
       }
     }
@@ -178,36 +174,26 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
   }
 }
 
-
 replaceStrings = function(String replacement, String to_replace, Frame[String] strings)
   return(Frame[String] strings) 
 {
-  for (i in 1:nrow(strings)) {
-    if (as.scalar(strings[i,]) == to_replace) {
-      strings[i,] = replacement;
-    }
-  }
+  strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
 }
 
-
 insertOrIncrement = function(String str, Frame[Unknown] dict)
   return(Frame[Unknown] dict)
 {
   i = 1;
-  ret = FALSE;
   break = FALSE;
   while (i <= nrow(dict) & !break) {
     if (as.scalar(dict[i, 1]) == str) {
-      value = as.integer(as.scalar(dict[i, 2])) + 1;
-      dict[i, 2] = value;
-      contains = TRUE;
+      dict[i, 2] = as.frame(as.integer(as.scalar(dict[i, 2])) + 1);
       break = TRUE;
     }
     i = i + 1;
   }
-  if (!break) {
+  if (!break)
     dict = rbind(dict, cbind(as.frame(str), as.frame(1)));
-  }
 }
 
 
@@ -216,19 +202,11 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
 
   dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1);
   dl_matrix[length(A) + 1, length(B) + 1] = -1;
-
-  for (j in 2:length(B) + 1) {
-    dl_matrix[1, j] = j - 1;
-  }
-
+  dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1);
   dl_matrix[2, 1] = 1;
 
   for (j in 2:length(B) + 1) {
-    if (as.scalar(A[1]) == as.scalar(B[j - 1])) {
-      cost = 0;
-    } else {
-      cost = 1;
-    }
+    cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1]))
     dl_matrix[2, j] = min(min(
       dl_matrix[2, j - 1] + 1, 
       dl_matrix[1, j] + 1),
@@ -241,23 +219,14 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
     i += 1;
 
     dl_matrix[i, 1] = i - 1;
-
-    if (as.scalar(A[i - 1]) == as.scalar(B[1])) {
-      cost = 0;
-    } else {
-      cost = 1;
-    }
+    cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1]))
     dl_matrix[i, 2] = min(min(
       dl_matrix[i - 1, 2] + 1, 
       dl_matrix[i, 1] + 1),
       dl_matrix[i - 1, 1] + cost);
 
     for (j in 3:length(B) + 1) {
-      if (as.scalar(A[i - 1]) == as.scalar(B[j - 1])) {
-        cost = 0;
-      } else {
-        cost = 1;
-      }
+      cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1]))
       if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) {
         dl_matrix[i, j] = min(min(
           dl_matrix[i, j - 1] + 1, 
@@ -272,9 +241,7 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
       }
     }
 
-    if( min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound) {
-      break_condition = TRUE;
-    }
+    break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound;
   }
 
   if (is_verbose){
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index f894c4e..1133eb7 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -67,9 +67,9 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
   {
     physicalPipList = list()
     logicalPipList = list()
+    
     # # # get the physical instances from logical ones
-    for(i in 1:nrow(population))
-    { 
+    for(i in 1:nrow(population)) { 
       lv = as.integer(as.scalar(population[i, 1])) + 1
       lp = population[i, 2:lv]
       physicalConf = bandit::get_physical_configurations(lp, num_inst, primitives)
@@ -77,23 +77,22 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
       logicalPipList = append(logicalPipList, lp)
     }
     
-    scores = matrix(0, rows=length(physicalPipList), cols=1)
-    
     # # # execute the physical pipelines
-    for(i in 1:length(physicalPipList))
-    {
-      physicalConf = as.frame(physicalPipList[i])
-      lp = as.frame(logicalPipList[i])
+    scores = matrix(0, length(physicalPipList), 1)
+    # TODO better parfor-dep handling of multi-assignments to avoid check=0 
+    parfor(i in 1:length(physicalPipList), check=0) {
+      lp2 = as.frame(logicalPipList[i,1])
+      pp2 = as.frame(physicalPipList[i,1])
       # # append configuration keys for extracting the pipeline later on
-      id = seq(1, nrow(physicalConf))
-      physicalConf = cbind(as.frame(id), physicalConf)
+      id = seq(1, nrow(pp2))
+      idpp = cbind(as.frame(id), pp2)
 
       # # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times
-      [outPip,outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp, physicalConf, num_exec, X, y, Xtest, ytest, metaList,
+      [outPip, outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp2, idpp, num_exec, X, y, Xtest, ytest, metaList,
         evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, verbose)
       # # sort the configurations groupwise
-      max_perf =  bandit::getMaxPerConf(outPip, nrow(physicalConf)) 
-      scores[i] = as.matrix(max_perf[1, 1])
+      max_perf = bandit::getMaxPerConf(outPip, nrow(pp2)) 
+      scores[i,1] = as.matrix(max_perf[1,1])
     }
     
     # # select parents and best score