You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/12 13:48:25 UTC
[systemds] branch master updated: [SYSTEMDS-2641] Improved slice
finding dml algorithm (perf, correctness)
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 2c5f703 [SYSTEMDS-2641] Improved slice finding dml algorithm (perf, correctness)
2c5f703 is described below
commit 2c5f70353b3863db68442fd57b5db7ed6ca4196d
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Sep 12 15:47:58 2020 +0200
[SYSTEMDS-2641] Improved slice finding dml algorithm (perf, correctness)
This patch fixes the correctness of the slice finding where the
generation of paired candidates mistakenly used upper.tri instead of
upper.tri-values which created invalid slices and violated the expected
monotonicity properties.
Furthermore this patch also includes two major performance improvements,
specifically (1) the pruning before candidate generation (which does not
affect overall pruning effectiveness due to the handling of missing
parents), and (2) an ID transformation to avoid exceeding integer-max
for many columns and to avoid huge sparse intermediates.
These changes improved the runtime on a replicated (more rows and
columns) salary dataset from 799s to 52s (only 1), and 2.6s (1 and 2)
respectively.
---
scripts/staging/slicing/slicing.dml | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/scripts/staging/slicing/slicing.dml b/scripts/staging/slicing/slicing.dml
index 2c9efb7..8faf416 100644
--- a/scripts/staging/slicing/slicing.dml
+++ b/scripts/staging/slicing/slicing.dml
@@ -179,9 +179,15 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
Integer n2, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] P)
{
+ # prune invalid slices (possible without affecting overall
+ # pruning effectiveness due to handling of missing parents)
+ pI = (R[,3] >= minSup);
+ S = removeEmpty(target=S, margin="rows", select=pI)
+ R = removeEmpty(target=R, margin="rows", select=pI)
+
# join compatible slices (without self)
join = S %*% t(S) == (level-2)
- I = upper.tri(target=join, diag=FALSE);
+ I = upper.tri(target=join, diag=FALSE, values=TRUE);
# pair construction
nr = nrow(I); nc = ncol(I);
@@ -194,8 +200,8 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
if( sum(rix)!=0 ) {
P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
- P12 = (P1 + P2);
- P = ((P1 %*% S) + (P2 %*% S)) != 0;
+ P12 = P1 + P2; # combined slice
+ P = (P12 %*% S) != 0;
ss = min(P1 %*% R[,3], P2 %*% R[,3])
se = min(P1 %*% R[,2], P2 %*% R[,2])
@@ -206,8 +212,8 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
end = as.scalar(foffe[1,j]);
I = I & (rowSums(P[,beg:end]) <= 1);
}
- P = removeEmpty(target=P, margin="rows", select=I);
P12 = removeEmpty(target=P12, margin="rows", select=I)
+ P = removeEmpty(target=P, margin="rows", select=I);
ss = removeEmpty(target=ss, margin="rows", select=I);
se = removeEmpty(target=se, margin="rows", select=I);
@@ -224,16 +230,20 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
ID = ID + I * prod;
}
+ # ID transformation to avoid exceeding INT_MAX and
+ # and to void creating huge sparse intermediates
+ [ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}")
+
# size pruning, with rowMin-rowMax transform
# to avoid densification (ignored zeros)
- map = table(ID,seq(1,nrow(P)))
+ map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
ubSizes = 1/rowMaxs(map * (1/t(ss)));
- ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
+ ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
fSizes = (ubSizes >= minSup)
# error pruning
ubError = 1/rowMaxs(map * (1/t(se)));
- ubError = replace(target=ubError, pattern=Inf, replacement=0);
+ ubError = replace(target=ubError, pattern=Inf, replacement=0);
ubScores = scoreUB(ubSizes, ubError, eAvg, minSup, alpha, n2);
[maxsc, minsc] = analyzeTopK(TKC);
fScores = (ubScores > minsc & ubScores > 0)
@@ -254,7 +264,7 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double
evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, Matrix[Double] tS, Integer l, Double alpha)
return(Matrix[Double] R)
{
- I = (X %*% tS) == l; # slice indicator
+ I = (X %*% tS) == l; # slice indicator
ss = t(colSums(I)); # absolute slice size (nnz)
se = t(t(e) %*% I); # absolute slice error
@@ -288,9 +298,9 @@ jspec= "{ ids:true, recode:[1,2,3,6], bin:[{id:4, method:equi-width, numbins:14}
[X,M] = transformencode(target=F, spec=jspec);
X = X[,2:ncol(X)]
-#X = rbind(X,X)
-#X = cbind(X,X)
-#y = rbind(y,y)
+X = rbind(X,X)
+X = cbind(X,X)
+y = rbind(y,y)
# learn model
B = lm(X=X, y=y, verbose=FALSE);
@@ -298,4 +308,4 @@ yhat = X %*% B;
e = (y-yhat)^2;
# call slice finding
-[S,C] = slicing(X=X, e=e, k=10, alpha=0.95, minSup=4, verbose=TRUE);
+[S,C] = slicing(X=X, e=e, k=10, alpha=0.95, minSup=32, verbose=TRUE);