You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/09/05 14:02:12 UTC
[systemds] branch master updated: [SYSTEMDS-3120] Performance
correctTypos builtin in cleaning pipelines
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new e882735 [SYSTEMDS-3120] Performance correctTypos builtin in cleaning pipelines
e882735 is described below
commit e882735849f327282988c7de08b79366a731998e
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Sep 5 15:57:29 2021 +0200
[SYSTEMDS-3120] Performance correctTypos builtin in cleaning pipelines
This patch replaces a very slow script-level O(N^2) method for building
string dictionaries (distinct items, frequencies) with a vectorized
transformencode-based implementation. Within the top-k cleaning pipeline
enumeration, this patch improved the entire data preparation step
(string processing including correctTypos, encoding, sampling) on Adult
from 188s to 4s (with even larger relative improvements for correctTypos
alone).
---
scripts/builtin/correctTypos.dml | 32 ++++++----------------
.../apache/sysds/runtime/util/UtilFunctions.java | 7 +++--
2 files changed, 14 insertions(+), 25 deletions(-)
diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 45d3861..55b7246 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -72,18 +72,11 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
Y = strings
# build dictionary
- current_string = as.scalar(strings[1]);
- dict = cbind(as.frame(""), as.frame(1));
-
- for (i in 1:num_strings) {
- current_string = as.scalar(strings[i]);
- if(as.scalar(nullMask[i]) == 0)
- dict = insertOrIncrement(current_string, dict);
- }
- dict = dict[2:nrow(dict),]
+ dict = buildDictionary(strings);
strings = dict[,1];
frequencies = as.matrix(dict[,2]) / num_strings;
lengths = as.matrix(map(strings, "s -> s.length()"));
+
num_different_strings = nrow(strings);
if (is_verbose) {
print("dict:" )
@@ -129,7 +122,7 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
}
}
upper_triangle = upper.tri(target=distance_matrix, values=TRUE);
- distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, rows=num_different_strings, cols=1));
+ distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, num_different_strings, 1));
sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
if (is_verbose) {
@@ -180,23 +173,16 @@ replaceStrings = function(String replacement, String to_replace, Frame[String] s
strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}
-insertOrIncrement = function(String str, Frame[Unknown] dict)
+buildDictionary = function(Frame[String] S)
return(Frame[Unknown] dict)
{
- i = 1;
- break = FALSE;
- while (i <= nrow(dict) & !break) {
- if (as.scalar(dict[i, 1]) == str) {
- dict[i, 2] = as.frame(as.integer(as.scalar(dict[i, 2])) + 1);
- break = TRUE;
- }
- i = i + 1;
- }
- if (!break)
- dict = rbind(dict, cbind(as.frame(str), as.frame(1)));
+ [ID,M] = transformencode(target=S, spec="{ids:true,recode:[1]}");
+ dstr = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
+ dcodes = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[1]");
+ frequencies = table(seq(1,nrow(dstr)),as.matrix(dcodes)) %% table(ID, 1);
+ dict = cbind(dstr, as.frame(frequencies));
}
-
damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, double bound, Boolean is_verbose)
return(double dl_distance) {
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index c0507ce..de6d7d6 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
@@ -30,6 +30,7 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
import org.apache.sysds.runtime.matrix.data.Pair;
import org.apache.sysds.runtime.meta.TensorCharacteristics;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderRecode;
import java.text.ParseException;
import java.text.SimpleDateFormat;
@@ -984,10 +985,12 @@ public class UtilFunctions {
return vt;
}
-
public static int getEndIndex(int arrayLength, int startIndex, int blockSize){
return (blockSize <= 0)? arrayLength: Math.min(arrayLength, startIndex + blockSize);
}
-
+ public static String[] splitRecodeEntry(String s) {
+ //forward to column encoder, as UtilFunctions available in map context
+ return ColumnEncoderRecode.splitRecodeMapEntry(s);
+ }
}