You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/19 19:48:13 UTC
[systemds] branch master updated: [SYSTEMDS-2641] Additional R
baseline for slice finding tests
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 132f1e7 [SYSTEMDS-2641] Additional R baseline for slice finding tests
132f1e7 is described below
commit 132f1e72896cadb25660bf5321935915ca66dcd9
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Sep 19 21:46:51 2020 +0200
[SYSTEMDS-2641] Additional R baseline for slice finding tests
---
.../functions/builtin/BuiltinSliceFinderTest.java | 43 ++-
src/test/scripts/functions/builtin/slicefinder.R | 324 +++++++++++++++++++++
src/test/scripts/functions/builtin/slicefinder.dml | 22 +-
.../{slicefinder.dml => slicefinderPrep.dml} | 6 +-
4 files changed, 365 insertions(+), 30 deletions(-)
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
index f03caa0..168a142 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
@@ -22,13 +22,18 @@ package org.apache.sysds.test.functions.builtin;
import org.junit.Assert;
import org.junit.Test;
+
+import java.util.HashMap;
+
import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
-public class BuiltinSliceFinderTest extends AutomatedTestBase {
-
+public class BuiltinSliceFinderTest extends AutomatedTestBase
+{
+ private static final String PREP_NAME = "slicefinderPrep";
private static final String TEST_NAME = "slicefinder";
private static final String TEST_DIR = "functions/builtin/";
private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSliceFinderTest.class.getSimpleName() + "/";
@@ -94,21 +99,43 @@ public class BuiltinSliceFinderTest extends AutomatedTestBase {
private void runSliceFinderTest(int K, boolean dp, ExecMode mode) {
ExecMode platformOld = setExecMode(ExecMode.HYBRID);
- String dml_test_name = TEST_NAME;
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
String data = HOME + "/data/Salaries.csv";
try {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
- //
- fullDMLScriptName = HOME + dml_test_name + ".dml";
- programArgs = new String[]{"-args", data,
+
+ //run data preparation
+ fullDMLScriptName = HOME + PREP_NAME + ".dml";
+ programArgs = new String[]{"-args", data, output("X"), output("e")};
+ runTest(true, false, null, -1);
+
+ //read output and store for dml and R
+ double[][] X = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("X"));
+ double[][] e = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("e"));
+ writeInputMatrixWithMTD("X", X, true);
+ writeInputMatrixWithMTD("e", e, true);
+
+ //execute main test
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[]{"-args", input("X"), input("e"),
String.valueOf(K),String.valueOf(!dp).toUpperCase(),
String.valueOf(VERBOSE).toUpperCase(), output("R")};
+ fullRScriptName = HOME + TEST_NAME + ".R";
+ rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + String.valueOf(K)
+ + " " + String.valueOf(!dp).toUpperCase() + " " + expectedDir();
+
runTest(true, false, null, -1);
-
- double[][] ret = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("R"));
+ runRScript(true);
+
+ //compare dml and R
+ HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("R");
+ HashMap<CellIndex, Double> rfile = readRMatrixFromFS("R");
+ TestUtils.compareMatrices(dmlfile, rfile, 1e-2, "Stat-DML", "Stat-R");
+
+ //compare expected results
+ double[][] ret = TestUtils.convertHashMapToDoubleArray(dmlfile);
for(int i=0; i<K; i++)
TestUtils.compareMatrices(EXPECTED_TOPK[i], ret[i], 1e-2);
diff --git a/src/test/scripts/functions/builtin/slicefinder.R b/src/test/scripts/functions/builtin/slicefinder.R
new file mode 100644
index 0000000..a84cd4a
--- /dev/null
+++ b/src/test/scripts/functions/builtin/slicefinder.R
@@ -0,0 +1,324 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+library("doMC")
+
+registerDoMC(NULL) # physical cores
+
+################################################################################
+
+slicefinder = function(X, e,
+ k = 4, maxL = 0, minSup = 32, alpha = 0.5,
+ tpEval = TRUE, tpBlksz = 16, verbose = FALSE)
+{
+ # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
+ D = matrix(0, 0, 5);
+ m = nrow(X);
+ n = ncol(X);
+
+ # prepare offset vectors and one-hot encoded X
+ fdom = colMaxs(X);
+ foffb = t(cumsum(t(fdom))) - fdom;
+ foffe = t(cumsum(t(fdom)))
+ rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
+ cix = matrix(X + (matrix(1,nrow(X),1) %*% foffb), m*n, 1);
+ X2 = table(rix, cix); #one-hot encoded
+
+ # initialize statistics and basic slices
+ n2 = ncol(X2); # one-hot encoded features
+ eAvg = sum(e) / m; # average error
+ TMP1 = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose);
+ S = TMP1[["S"]];
+ R = TMP1[["R"]];
+
+ # initialize top-k
+ TMP2 = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
+ TK = TMP2[["TK"]]; TKC = TMP2[["TKC"]];
+
+ if( verbose ) {
+ TMP3 = analyzeTopK(TKC);
+ maxsc = TMP3[["maxsc"]]; minsc = TMP3[["minsc"]];
+ print(paste("SliceFinder: initial top-K: count=",nrow(TK),", max=",maxsc,", min=",minsc,sep=""))
+ D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
+ }
+
+ # lattice enumeration w/ size/error pruning, one iteration per level
+ # termination condition (max #feature levels)
+ maxL = ifelse(maxL<=0, n, maxL)
+ level = 1;
+ while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
+ level = level + 1;
+
+ # enumerate candidate join pairs, incl size/error pruning
+ nrS = nrow(S);
+ S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe);
+
+ if(verbose) {
+ print(paste("SliceFinder: level ",level,":",sep=""))
+ print(paste(" -- generated paired slice candidates: ",nrS," -> ",nrow(S),sep=""));
+ }
+
+ # extract and evaluate candidate slices
+ if( tpEval ) { # task-parallel
+ R = foreach( i=1:nrow(S), .combine=rbind) %dopar% {
+ return (evalSlice(X2, e, eAvg, as.matrix(S[i,]), level, alpha))
+ }
+ }
+ else { # data-parallel
+ R = evalSlice(X2, e, eAvg, t(S), level, alpha);
+ }
+
+ # maintain top-k after evaluation
+ TMP2 = maintainTopK(S, R, TK, TKC, k, minSup);
+ TK = TMP2[["TK"]]; TKC = TMP2[["TKC"]];
+
+ if(verbose) {
+ TMP3 = analyzeTopK(TKC);
+ maxsc = TMP3[["maxsc"]]; minsc = TMP3[["minsc"]];
+ valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
+ print(paste(" -- valid slices after eval: ",valid,"/",nrow(S),sep=""));
+ print(paste(" -- top-K: count=",nrow(TK),", max=",maxsc,", min=",minsc,sep=""));
+ D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
+ }
+ }
+
+ TK = decodeTopK(TK, foffb, foffe);
+
+ if( verbose ) {
+ print(paste("SliceFinder: terminated at level ",level,":"));
+ print(TK); print(TKC);
+ }
+
+ return (list("TK"=TK, "TKC"=TKC, "D"=D))
+}
+
+rexpand = function(v, n2=max(v)) {
+ R = matrix(0, nrow(v), n2)
+ for( i in 1:nrow(v) ) {
+ R[i,] = tabulate(v[i,], nbins=n2);
+ }
+ return (R)
+}
+
+createAndScoreBasicSlices = function(X2, e, eAvg, minSup, alpha, verbose) {
+ n2 = ncol(X2);
+ cCnts = colSums(X2); # column counts
+ err = t(t(e) %*% X2); # total error vector
+ merr = t(colMaxs(X2 * (e %*% matrix(1,1,ncol(X2))))); # maximum error vector
+
+ if( verbose ) {
+ drop = as.integer(sum(cCnts < minSup | err == 0));
+ print(paste("SliceFinder: dropping ",drop,"/",n2," features below minSup = ",minSup,".", sep=""));
+ }
+
+ # working set of active slices (#attr x #slices) and top k
+ selCols = (cCnts >= minSup & err > 0);
+ attr = as.matrix(seq(1,n2)[selCols])
+ ss = as.matrix(cCnts[selCols])
+ se = as.matrix(err[selCols])
+ sm = as.matrix(merr[selCols])
+ S = rexpand(attr, n2);
+
+ # score 1-slices and create initial top-k
+ sc = score(ss, se, eAvg, alpha, nrow(X2));
+ R = as.matrix(cbind(sc, se, sm, ss));
+
+ return (list("S"=S,"R"=R))
+}
+
+score = function(ss, se, eAvg, alpha, n) {
+ sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+ sc = replace(sc, NaN, -Inf);
+ return (sc)
+}
+
+scoreUB = function(ss, se, sm, eAvg, minSup, alpha, n) {
+ # Since sc is either monotonically increasing or decreasing, we
+ # probe interesting points of sc in the interval [minSup, ss],
+ # and compute the maximum to serve as the upper bound
+ s = cbind(matrix(minSup,nrow(ss),1), pmax(se/sm,minSup), ss)
+ ex = matrix(1,1,3)
+ sc = rowMaxs(alpha * ((pmin(s*(sm%*%ex),se%*%ex)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1));
+ sc = replace(sc, NaN, -Inf);
+ return (sc)
+}
+
+
+maintainTopK = function(S, R, TK, TKC, k, minSup) {
+ # prune invalid minSup and scores
+ I = as.matrix((R[,1] > 0) & (R[,4] >= minSup));
+
+ if( sum(I)!=0 ) {
+ S = as.matrix(S[I,])
+ R = as.matrix(R[I,])
+ if( ncol(S) != ncol(TK) & ncol(S)==1 ) {
+ S = t(S); R = t(R);
+ }
+
+ # evaluated candidated and previous top-k
+ slices = as.matrix(rbind(TK, S));
+ scores = as.matrix(rbind(TKC, R));
+
+ # extract top-k
+ IX = as.matrix(order(scores[,1], decreasing=TRUE));
+ IX = as.matrix(IX[1:min(k,nrow(IX)),]);
+ TK = as.matrix(slices[IX,]);
+ TKC = as.matrix(scores[IX,]);
+ }
+ return (list("TK"=TK, "TKC"=TKC))
+}
+
+analyzeTopK = function(TKC) {
+ maxsc = -Inf;
+ minsc = -Inf;
+ if( nrow(TKC)>0 ) {
+ maxsc = TKC[1,1];
+ minsc = TKC[nrow(TKC),1];
+ }
+ return (list("maxsc"=maxsc, "minsc"=minsc))
+}
+
+getPairedCandidates = function(S, R, TK, TKC, k,
+ level, eAvg, minSup, alpha, n2, foffb, foffe)
+{
+ # prune invalid slices (possible without affecting overall
+ # pruning effectiveness due to handling of missing parents)
+ pI = (R[,4] >= minSup & R[,2] > 0);
+ S = S[pI,]; R = R[pI,];
+
+ # join compatible slices (without self)
+ join = S %*% t(S) == (level-2)
+ I = upper.tri(join, diag=FALSE) * join;
+
+ # pair construction
+ nr = nrow(I); nc = ncol(I);
+ rix = matrix(I * (seq(1,nr)%*%matrix(1,1,ncol(I))), nr*nc, 1);
+ cix = matrix(I * (matrix(1,nrow(I),1) %*% t(seq(1,nc))), nr*nc, 1);
+ rix = as.matrix(rix[rix!=0,])
+ cix = as.matrix(cix[cix!=0,])
+
+ P = matrix(0, 0, ncol(S))
+ if( sum(rix)!=0 ) {
+ P1 = rexpand(rix, nrow(S));
+ P2 = rexpand(cix, nrow(S));
+ P12 = P1 + P2; # combined slice
+ P = ((P1 %*% S + P2 %*% S) != 0) * 1;
+ se = pmin(P1 %*% R[,2], P2 %*% R[,2])
+ sm = pmin(P1 %*% R[,3], P2 %*% R[,3])
+ ss = pmin(P1 %*% R[,4], P2 %*% R[,4])
+
+ # prune invalid self joins (>1 bit per feature)
+ I = matrix(1, nrow(P), 1);
+ for( j in 1:ncol(foffb) ) {
+ beg = foffb[1,j]+1;
+ end = foffe[1,j];
+ I = I & (rowSums(P[,beg:end]) <= 1);
+ }
+ P12 = P12[I,]
+ P = P[I,]
+ ss = as.matrix(ss[I])
+ se = as.matrix(se[I])
+ sm = as.matrix(sm[I])
+
+ # prepare IDs for deduplication and pruning
+ ID = matrix(0, nrow(P), 1);
+ dom = foffe-foffb+1;
+ for( j in 1:ncol(dom) ) {
+ beg = foffb[1,j]+1;
+ end = foffe[1,j];
+ I = max.col(P[,beg:end],ties.method="last") * rowSums(P[,beg:end]);
+ prod = 1;
+ if(j<ncol(dom))
+ prod = prod(dom[1,(j+1):ncol(dom)])
+ ID = ID + I * prod;
+ }
+
+ # size pruning, with rowMin-rowMax transform
+ # to avoid densification (ignored zeros)
+ map = table(ID, seq(1,nrow(P)))
+ ex = matrix(1,nrow(map),1)
+ ubSizes = 1/rowMaxs(map * (1/ex%*%t(ss)));
+ ubSizes = as.matrix(replace(ubSizes, Inf, 0));
+ fSizes = (ubSizes >= minSup)
+
+ # error pruning
+ ubError = 1/rowMaxs(map * (1/ex%*%t(se)));
+ ubError = as.matrix(replace(ubError, Inf, 0));
+ ubMError = 1/rowMaxs(map * (1/ex%*%t(sm)));
+ ubMError = as.matrix(replace(ubMError, Inf, 0));
+ ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
+ TMP3 = analyzeTopK(TKC);
+ minsc = TMP3[["minsc"]]
+ fScores = (ubScores > minsc & ubScores > 0)
+
+ # missing parents pruning
+ numParents = rowSums((map %*% P12) != 0)
+ fParents = (numParents == level);
+
+ # apply all pruning
+ map = map * ((fSizes & fScores & fParents) %*% matrix(1,1,ncol(map)));
+
+ # deduplication of join outputs
+ Dedup = as.matrix(map[rowMaxs(map)!=0,] != 0)
+ P = (Dedup %*% P) != 0
+ }
+ return (P)
+}
+
+evalSlice = function(X, e, eAvg, tS, l, alpha) {
+ I = (X %*% tS) == l; # slice indicator
+ ss = as.matrix(colSums(I)); # absolute slice size (nnz)
+ se = as.matrix(t(t(e) %*% I)); # absolute slice error
+ sm = as.matrix(colMaxs(I * e%*%matrix(1,1,ncol(I)))); # maximum tuple error in slice
+
+ # score of relative error and relative size
+ sc = score(ss, se, eAvg, alpha, nrow(X));
+ R = as.matrix(cbind(sc, se, sm, ss));
+ return (R)
+}
+
+decodeTopK = function(TK, foffb, foffe) {
+ R = matrix(1, nrow(TK), ncol(foffb));
+ if( nrow(TK) > 0 ) {
+ for( j in 1:ncol(foffb) ) {
+ beg = foffb[1,j]+1;
+ end = foffe[1,j];
+ I = rowSums(TK[,beg:end]) * max.col(TK[,beg:end],ties.method="last");
+ R[, j] = I;
+ }
+ }
+ return (R)
+}
+
+################################################################################
+X = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+e = as.matrix(readMM(paste(args[1], "e.mtx", sep="")))
+k = as.integer(args[2]);
+tpEval = as.logical(args[3])
+
+TMP = slicefinder(X=X, e=e, k=k, alpha=0.95, minSup=4, tpEval=tpEval, verbose=TRUE);
+R = TMP[["TKC"]]
+
+writeMM(as(R, "CsparseMatrix"), paste(args[4], "R", sep=""));
diff --git a/src/test/scripts/functions/builtin/slicefinder.dml b/src/test/scripts/functions/builtin/slicefinder.dml
index 96d5313..1027b17 100644
--- a/src/test/scripts/functions/builtin/slicefinder.dml
+++ b/src/test/scripts/functions/builtin/slicefinder.dml
@@ -19,24 +19,10 @@
#
#-------------------------------------------------------------
-FXY = read($1, data_type="frame", format="csv", header=TRUE);
-
-F = FXY[,1:ncol(FXY)-1];
-y = as.matrix(FXY[,ncol(FXY)]);
-
-# data preparation
-jspec= "{ ids:true, recode:[1,2,3,6],bin:["
- +"{id:4, method:equi-width, numbins:14},"
- +"{id:5, method:equi-width, numbins:12}]}"
-[X,M] = transformencode(target=F, spec=jspec);
-X = X[,2:ncol(X)]
-
-# learn model
-B = lm(X=X, y=y, verbose=FALSE);
-yhat = X %*% B;
-e = (y-yhat)^2;
+X = read($1);
+e = read($2);
# call slice finding
-[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, tpEval=$3, verbose=$4);
+[TS,TR] = slicefinder(X=X, e=e, k=$3, alpha=0.95, minSup=4, tpEval=$4, verbose=$5);
-write(TKC, $5)
+write(TR, $6)
diff --git a/src/test/scripts/functions/builtin/slicefinder.dml b/src/test/scripts/functions/builtin/slicefinderPrep.dml
similarity index 91%
copy from src/test/scripts/functions/builtin/slicefinder.dml
copy to src/test/scripts/functions/builtin/slicefinderPrep.dml
index 96d5313..036c0ff 100644
--- a/src/test/scripts/functions/builtin/slicefinder.dml
+++ b/src/test/scripts/functions/builtin/slicefinderPrep.dml
@@ -36,7 +36,5 @@ B = lm(X=X, y=y, verbose=FALSE);
yhat = X %*% B;
e = (y-yhat)^2;
-# call slice finding
-[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, tpEval=$3, verbose=$4);
-
-write(TKC, $5)
+write(X, $2)
+write(e, $3)