You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/12 13:48:25 UTC
[systemds] branch master updated: [SYSTEMDS-2641] Improved slice finding dml algorithm (perf, correctness)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 2c5f703  [SYSTEMDS-2641] Improved slice finding dml algorithm (perf, correctness)
2c5f703 is described below

commit 2c5f70353b3863db68442fd57b5db7ed6ca4196d
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Sep 12 15:47:58 2020 +0200

    [SYSTEMDS-2641] Improved slice finding dml algorithm (perf, correctness)
    
    This patch fixes the correctness of the slice finding where the
    generation of paired candidates mistakenly used upper.tri instead of
    upper.tri-values which created invalid slices and violated the expected
    monotonicity properties.
    
    Furthermore this patch also includes two major performance improvements,
    specifically (1) the pruning before candidate generation (which does not
    affect overall pruning effectiveness due to the handling of missing
    parents), and (2) an ID transformation to avoid exceeding integer-max
    for many columns and to avoid huge sparse intermediates.
    
    These changes improved the runtime on a replicated (more rows and
    columns) salary dataset from 799s to 52s (only 1), and 2.6s (1 and 2)
    respectively.
---
 scripts/staging/slicing/slicing.dml | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/scripts/staging/slicing/slicing.dml b/scripts/staging/slicing/slicing.dml
index 2c9efb7..8faf416 100644
--- a/scripts/staging/slicing/slicing.dml
+++ b/scripts/staging/slicing/slicing.dml
@@ -179,9 +179,15 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
     Integer n2, Matrix[Double] foffb, Matrix[Double] foffe)
   return(Matrix[Double] P) 
 {
+  # prune invalid slices (possible without affecting overall
+  # pruning effectiveness due to handling of missing parents)
+  pI = (R[,3] >= minSup);
+  S = removeEmpty(target=S, margin="rows", select=pI)
+  R = removeEmpty(target=R, margin="rows", select=pI)
+
   # join compatible slices (without self)
   join = S %*% t(S) == (level-2)
-  I = upper.tri(target=join, diag=FALSE);
+  I = upper.tri(target=join, diag=FALSE, values=TRUE);
   
   # pair construction
   nr = nrow(I); nc = ncol(I);
@@ -194,8 +200,8 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
   if( sum(rix)!=0 ) {
     P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
     P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
-    P12 = (P1 + P2);
-    P = ((P1 %*% S) + (P2 %*% S)) != 0;
+    P12 = P1 + P2; # combined slice
+    P = (P12 %*% S) != 0;
     ss = min(P1 %*% R[,3], P2 %*% R[,3])
     se = min(P1 %*% R[,2], P2 %*% R[,2])
 
@@ -206,8 +212,8 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
       end = as.scalar(foffe[1,j]);
       I = I & (rowSums(P[,beg:end]) <= 1);
     }
-    P = removeEmpty(target=P, margin="rows", select=I);
     P12 = removeEmpty(target=P12, margin="rows", select=I)
+    P = removeEmpty(target=P, margin="rows", select=I);
     ss = removeEmpty(target=ss, margin="rows", select=I);
     se = removeEmpty(target=se, margin="rows", select=I);
 
@@ -224,16 +230,20 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
       ID = ID + I * prod;
     }
 
+    # ID transformation to avoid exceeding INT_MAX and
+    # and to void creating huge sparse intermediates
+    [ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}")
+
     # size pruning, with rowMin-rowMax transform 
     # to avoid densification (ignored zeros)
-    map = table(ID,seq(1,nrow(P)))
+    map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
     ubSizes = 1/rowMaxs(map * (1/t(ss)));
-    ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);    
+    ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
     fSizes = (ubSizes >= minSup)
 
     # error pruning
     ubError = 1/rowMaxs(map * (1/t(se)));
-    ubError = replace(target=ubError, pattern=Inf, replacement=0);    
+    ubError = replace(target=ubError, pattern=Inf, replacement=0);
     ubScores = scoreUB(ubSizes, ubError, eAvg, minSup, alpha, n2);
     [maxsc, minsc] = analyzeTopK(TKC);
     fScores = (ubScores > minsc & ubScores > 0) 
@@ -254,7 +264,7 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
 evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, Matrix[Double] tS, Integer l, Double alpha) 
   return(Matrix[Double] R)
 {
-  I = (X %*% tS) == l;  # slice indicator 
+  I = (X %*% tS) == l; # slice indicator
   ss = t(colSums(I));  # absolute slice size (nnz)
   se = t(t(e) %*% I);  # absolute slice error 
 
@@ -288,9 +298,9 @@ jspec= "{ ids:true, recode:[1,2,3,6], bin:[{id:4, method:equi-width, numbins:14}
 [X,M] = transformencode(target=F, spec=jspec);
 X = X[,2:ncol(X)]
 
-#X = rbind(X,X)
-#X = cbind(X,X)
-#y = rbind(y,y)
+X = rbind(X,X)
+X = cbind(X,X)
+y = rbind(y,y)
 
 # learn model
 B = lm(X=X, y=y, verbose=FALSE);
@@ -298,4 +308,4 @@ yhat = X %*% B;
 e = (y-yhat)^2;
 
 # call slice finding
-[S,C] = slicing(X=X, e=e, k=10, alpha=0.95, minSup=4, verbose=TRUE);
+[S,C] = slicing(X=X, e=e, k=10, alpha=0.95, minSup=32, verbose=TRUE);