You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/09/05 14:02:12 UTC

[systemds] branch master updated: [SYSTEMDS-3120] Performance correctTypos builtin in cleaning pipelines

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new e882735  [SYSTEMDS-3120] Performance correctTypos builtin in cleaning pipelines
e882735 is described below

commit e882735849f327282988c7de08b79366a731998e
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Sep 5 15:57:29 2021 +0200

    [SYSTEMDS-3120] Performance correctTypos builtin in cleaning pipelines
    
    This patch replaces a very slow script-level O(N^2) method for building
    string dictionaries (distinct items, frequencies) with a vectorized
    transformencode-based implementation. Within the top-k cleaning pipeline
    enumeration, this patch improved the entire data preparation step
    (string processing including correctTypos, encoding, sampling) on Adult
    from 188s to 4s (with even larger relative improvements for correctTypos
    alone).
---
 scripts/builtin/correctTypos.dml                   | 32 ++++++----------------
 .../apache/sysds/runtime/util/UtilFunctions.java   |  7 +++--
 2 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 45d3861..55b7246 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -72,18 +72,11 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
   Y = strings
 
   # build dictionary
-  current_string = as.scalar(strings[1]);
-  dict = cbind(as.frame(""), as.frame(1));
-
-  for (i in 1:num_strings) {
-    current_string = as.scalar(strings[i]);
-    if(as.scalar(nullMask[i]) == 0)
-      dict = insertOrIncrement(current_string, dict);
-  }
-  dict = dict[2:nrow(dict),]
+  dict = buildDictionary(strings);
   strings = dict[,1];
   frequencies = as.matrix(dict[,2]) / num_strings;
   lengths = as.matrix(map(strings, "s -> s.length()"));
+  
   num_different_strings = nrow(strings);
   if (is_verbose) {
     print("dict:" )
@@ -129,7 +122,7 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
     }
   }
   upper_triangle = upper.tri(target=distance_matrix, values=TRUE);
-  distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, rows=num_different_strings, cols=1));
+  distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, num_different_strings, 1));
 
   sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
   if (is_verbose) {
@@ -180,23 +173,16 @@ replaceStrings = function(String replacement, String to_replace, Frame[String] s
   strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
 }
 
-insertOrIncrement = function(String str, Frame[Unknown] dict)
+buildDictionary = function(Frame[String] S)
   return(Frame[Unknown] dict)
 {
-  i = 1;
-  break = FALSE;
-  while (i <= nrow(dict) & !break) {
-    if (as.scalar(dict[i, 1]) == str) {
-      dict[i, 2] = as.frame(as.integer(as.scalar(dict[i, 2])) + 1);
-      break = TRUE;
-    }
-    i = i + 1;
-  }
-  if (!break)
-    dict = rbind(dict, cbind(as.frame(str), as.frame(1)));
+  [ID,M] = transformencode(target=S, spec="{ids:true,recode:[1]}");
+  dstr = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
+  dcodes = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[1]");
+  frequencies = table(seq(1,nrow(dstr)),as.matrix(dcodes)) %% table(ID, 1);
+  dict = cbind(dstr, as.frame(frequencies));
 }
 
-
 damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, double bound, Boolean is_verbose) 
   return(double dl_distance) {
 
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index c0507ce..de6d7d6 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
@@ -30,6 +30,7 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
 import org.apache.sysds.runtime.matrix.data.Pair;
 import org.apache.sysds.runtime.meta.TensorCharacteristics;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderRecode;
 
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
@@ -984,10 +985,12 @@ public class UtilFunctions {
 		return vt;
 	}
 
-
 	public static int getEndIndex(int arrayLength, int startIndex, int blockSize){
 		return (blockSize <= 0)? arrayLength: Math.min(arrayLength, startIndex + blockSize);
 	}
 
-
+	public static String[] splitRecodeEntry(String s) {
+		//forward to column encoder, as UtilFunctions available in map context
+		return ColumnEncoderRecode.splitRecodeMapEntry(s);
+	}
 }