You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/19 19:48:13 UTC
[systemds] branch master updated: [SYSTEMDS-2641] Additional R baseline for slice finding tests

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 132f1e7  [SYSTEMDS-2641] Additional R baseline for slice finding tests
132f1e7 is described below

commit 132f1e72896cadb25660bf5321935915ca66dcd9
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Sep 19 21:46:51 2020 +0200

    [SYSTEMDS-2641] Additional R baseline for slice finding tests
---
 .../functions/builtin/BuiltinSliceFinderTest.java  |  43 ++-
 src/test/scripts/functions/builtin/slicefinder.R   | 324 +++++++++++++++++++++
 src/test/scripts/functions/builtin/slicefinder.dml |  22 +-
 .../{slicefinder.dml => slicefinderPrep.dml}       |   6 +-
 4 files changed, 365 insertions(+), 30 deletions(-)

diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
index f03caa0..168a142 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
@@ -22,13 +22,18 @@ package org.apache.sysds.test.functions.builtin;
 
 import org.junit.Assert;
 import org.junit.Test;
+
+import java.util.HashMap;
+
 import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
 
-public class BuiltinSliceFinderTest extends AutomatedTestBase {
-
+public class BuiltinSliceFinderTest extends AutomatedTestBase
+{
+	private static final String PREP_NAME = "slicefinderPrep";
 	private static final String TEST_NAME = "slicefinder";
 	private static final String TEST_DIR = "functions/builtin/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSliceFinderTest.class.getSimpleName() + "/";
@@ -94,21 +99,43 @@ public class BuiltinSliceFinderTest extends AutomatedTestBase {
 	
 	private void runSliceFinderTest(int K, boolean dp, ExecMode mode) {
 		ExecMode platformOld = setExecMode(ExecMode.HYBRID);
-		String dml_test_name = TEST_NAME;
 		loadTestConfiguration(getTestConfiguration(TEST_NAME));
 		String HOME = SCRIPT_DIR + TEST_DIR;
 		String data = HOME + "/data/Salaries.csv";
 		
 		try {
 			loadTestConfiguration(getTestConfiguration(TEST_NAME));
-			//
-			fullDMLScriptName = HOME + dml_test_name + ".dml";
-			programArgs = new String[]{"-args", data,
+			
+			//run data preparation
+			fullDMLScriptName = HOME + PREP_NAME + ".dml";
+			programArgs = new String[]{"-args", data, output("X"), output("e")};
+			runTest(true, false, null, -1);
+			
+			//read output and store for dml and R
+			double[][] X = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("X"));
+			double[][] e = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("e"));
+			writeInputMatrixWithMTD("X", X, true);
+			writeInputMatrixWithMTD("e", e, true);
+			
+			//execute main test
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{"-args", input("X"), input("e"),
 				String.valueOf(K),String.valueOf(!dp).toUpperCase(),
 				String.valueOf(VERBOSE).toUpperCase(), output("R")};
+			fullRScriptName = HOME + TEST_NAME + ".R";
+			rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + String.valueOf(K) 
+				+ " " + String.valueOf(!dp).toUpperCase() + " " + expectedDir();
+			
 			runTest(true, false, null, -1);
-
-			double[][] ret = TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromHDFS("R"));
+			runRScript(true); 
+			
+			//compare dml and R
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("R");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("R");
+			TestUtils.compareMatrices(dmlfile, rfile, 1e-2, "Stat-DML", "Stat-R");
+			
+			//compare expected results
+			double[][] ret = TestUtils.convertHashMapToDoubleArray(dmlfile);
 			for(int i=0; i<K; i++)
 				TestUtils.compareMatrices(EXPECTED_TOPK[i], ret[i], 1e-2);
 		
diff --git a/src/test/scripts/functions/builtin/slicefinder.R b/src/test/scripts/functions/builtin/slicefinder.R
new file mode 100644
index 0000000..a84cd4a
--- /dev/null
+++ b/src/test/scripts/functions/builtin/slicefinder.R
@@ -0,0 +1,324 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+library("doMC")
+
+registerDoMC(NULL) # physical cores
+
+################################################################################
+
+slicefinder = function(X, e,
+  k = 4, maxL = 0, minSup = 32, alpha = 0.5,
+  tpEval = TRUE, tpBlksz = 16, verbose = FALSE)
+{
+  # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
+  D = matrix(0, 0, 5);
+  m = nrow(X);
+  n = ncol(X);
+  
+  # prepare offset vectors and one-hot encoded X
+  fdom = colMaxs(X);
+  foffb = t(cumsum(t(fdom))) - fdom;
+  foffe = t(cumsum(t(fdom)))
+  rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
+  cix = matrix(X + (matrix(1,nrow(X),1) %*% foffb), m*n, 1);
+  X2 = table(rix, cix); #one-hot encoded
+
+  # initialize statistics and basic slices
+  n2 = ncol(X2);     # one-hot encoded features
+  eAvg = sum(e) / m; # average error
+  TMP1 = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose); 
+  S = TMP1[["S"]];
+  R = TMP1[["R"]];
+
+  # initialize top-k
+  TMP2 = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
+  TK = TMP2[["TK"]]; TKC = TMP2[["TKC"]];
+
+  if( verbose ) {
+    TMP3 = analyzeTopK(TKC);
+    maxsc = TMP3[["maxsc"]]; minsc = TMP3[["minsc"]];
+    print(paste("SliceFinder: initial top-K: count=",nrow(TK),", max=",maxsc,", min=",minsc,sep=""))
+    D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
+  }
+
+  # lattice enumeration w/ size/error pruning, one iteration per level
+  # termination condition (max #feature levels)
+  maxL = ifelse(maxL<=0, n, maxL)
+  level = 1;
+  while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
+    level = level + 1;
+
+    # enumerate candidate join pairs, incl size/error pruning 
+    nrS = nrow(S);
+    S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe); 
+
+    if(verbose) {
+      print(paste("SliceFinder: level ",level,":",sep=""))
+      print(paste(" -- generated paired slice candidates: ",nrS," -> ",nrow(S),sep=""));
+    }
+
+    # extract and evaluate candidate slices
+    if( tpEval ) { # task-parallel
+      R = foreach( i=1:nrow(S), .combine=rbind) %dopar% {
+        return (evalSlice(X2, e, eAvg, as.matrix(S[i,]), level, alpha))
+      }
+    }
+    else { # data-parallel
+      R = evalSlice(X2, e, eAvg, t(S), level, alpha);
+    }
+    
+    # maintain top-k after evaluation
+    TMP2 = maintainTopK(S, R, TK, TKC, k, minSup);
+    TK = TMP2[["TK"]]; TKC = TMP2[["TKC"]];
+
+    if(verbose) {
+      TMP3 = analyzeTopK(TKC);
+      maxsc = TMP3[["maxsc"]]; minsc = TMP3[["minsc"]];
+      valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
+      print(paste(" -- valid slices after eval: ",valid,"/",nrow(S),sep=""));
+      print(paste(" -- top-K: count=",nrow(TK),", max=",maxsc,", min=",minsc,sep=""));
+      D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
+    }
+  }
+
+  TK = decodeTopK(TK, foffb, foffe);
+
+  if( verbose ) {
+    print(paste("SliceFinder: terminated at level ",level,":"));
+    print(TK); print(TKC);
+  }
+  
+  return (list("TK"=TK, "TKC"=TKC, "D"=D))
+}
+
+rexpand = function(v, n2=max(v)) {
+  R = matrix(0, nrow(v), n2)
+  for( i in 1:nrow(v) ) {
+    R[i,] = tabulate(v[i,], nbins=n2);
+  }
+  return (R)
+} 
+
+createAndScoreBasicSlices = function(X2, e, eAvg, minSup, alpha, verbose) {
+  n2 = ncol(X2);
+  cCnts = colSums(X2);    # column counts
+  err = t(t(e) %*% X2);   # total error vector
+  merr = t(colMaxs(X2 * (e %*% matrix(1,1,ncol(X2))))); # maximum error vector
+
+  if( verbose ) {
+    drop = as.integer(sum(cCnts < minSup | err == 0));
+    print(paste("SliceFinder: dropping ",drop,"/",n2," features below minSup = ",minSup,".", sep=""));
+  }
+
+  # working set of active slices (#attr x #slices) and top k
+  selCols = (cCnts >= minSup & err > 0);
+  attr = as.matrix(seq(1,n2)[selCols])
+  ss = as.matrix(cCnts[selCols])
+  se = as.matrix(err[selCols])
+  sm = as.matrix(merr[selCols])
+  S = rexpand(attr, n2);
+  
+  # score 1-slices and create initial top-k 
+  sc = score(ss, se, eAvg, alpha, nrow(X2));
+  R = as.matrix(cbind(sc, se, sm, ss));
+
+  return (list("S"=S,"R"=R))
+}
+
+score = function(ss, se, eAvg, alpha, n) {
+  sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+  sc = replace(sc, NaN, -Inf);
+  return (sc)
+}
+
+scoreUB = function(ss, se, sm, eAvg, minSup, alpha, n) {
+  # Since sc is either monotonically increasing or decreasing, we
+  # probe interesting points of sc in the interval [minSup, ss],
+  # and compute the maximum to serve as the upper bound 
+  s = cbind(matrix(minSup,nrow(ss),1), pmax(se/sm,minSup), ss)
+  ex = matrix(1,1,3)
+  sc = rowMaxs(alpha * ((pmin(s*(sm%*%ex),se%*%ex)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1));
+  sc = replace(sc, NaN, -Inf);
+  return (sc)
+}
+
+
+maintainTopK = function(S, R, TK, TKC, k, minSup) {
+  # prune invalid minSup and scores
+  I = as.matrix((R[,1] > 0) & (R[,4] >= minSup));
+
+  if( sum(I)!=0 ) {
+    S = as.matrix(S[I,])
+    R = as.matrix(R[I,])
+    if( ncol(S) != ncol(TK) & ncol(S)==1 ) {
+      S = t(S); R = t(R);
+    }
+
+    # evaluated candidated and previous top-k
+    slices = as.matrix(rbind(TK, S));
+    scores = as.matrix(rbind(TKC, R));
+
+    # extract top-k
+    IX = as.matrix(order(scores[,1], decreasing=TRUE));
+    IX = as.matrix(IX[1:min(k,nrow(IX)),]);
+    TK = as.matrix(slices[IX,]);
+    TKC = as.matrix(scores[IX,]);
+  }
+  return (list("TK"=TK, "TKC"=TKC))
+}
+
+analyzeTopK = function(TKC) {
+  maxsc = -Inf;
+  minsc = -Inf;
+  if( nrow(TKC)>0 ) {
+    maxsc = TKC[1,1];
+    minsc = TKC[nrow(TKC),1];
+  }
+  return (list("maxsc"=maxsc, "minsc"=minsc))
+}
+
+getPairedCandidates = function(S, R, TK, TKC, k,
+  level, eAvg, minSup, alpha, n2, foffb, foffe)
+{
+  # prune invalid slices (possible without affecting overall
+  # pruning effectiveness due to handling of missing parents)
+  pI = (R[,4] >= minSup & R[,2] > 0);
+  S = S[pI,]; R = R[pI,];
+
+  # join compatible slices (without self)
+  join = S %*% t(S) == (level-2)
+  I = upper.tri(join, diag=FALSE) * join;
+
+  # pair construction
+  nr = nrow(I); nc = ncol(I);
+  rix = matrix(I * (seq(1,nr)%*%matrix(1,1,ncol(I))), nr*nc, 1);
+  cix = matrix(I * (matrix(1,nrow(I),1) %*% t(seq(1,nc))), nr*nc, 1);
+  rix = as.matrix(rix[rix!=0,])
+  cix = as.matrix(cix[cix!=0,])
+  
+  P = matrix(0, 0, ncol(S))
+  if( sum(rix)!=0 ) {
+    P1 = rexpand(rix, nrow(S));
+    P2 = rexpand(cix, nrow(S));
+    P12 = P1 + P2; # combined slice
+    P = ((P1 %*% S + P2 %*% S) != 0) * 1;
+    se = pmin(P1 %*% R[,2], P2 %*% R[,2])
+    sm = pmin(P1 %*% R[,3], P2 %*% R[,3])
+    ss = pmin(P1 %*% R[,4], P2 %*% R[,4])
+
+    # prune invalid self joins (>1 bit per feature)
+    I = matrix(1, nrow(P), 1);
+    for( j in 1:ncol(foffb) ) {
+      beg = foffb[1,j]+1;
+      end = foffe[1,j];
+      I = I & (rowSums(P[,beg:end]) <= 1);
+    }
+    P12 = P12[I,]
+    P = P[I,]
+    ss = as.matrix(ss[I])
+    se = as.matrix(se[I])
+    sm = as.matrix(sm[I])
+    
+    # prepare IDs for deduplication and pruning
+    ID = matrix(0, nrow(P), 1);
+    dom = foffe-foffb+1;
+    for( j in 1:ncol(dom) ) {
+      beg = foffb[1,j]+1;
+      end = foffe[1,j];
+      I = max.col(P[,beg:end],ties.method="last") * rowSums(P[,beg:end]);
+      prod = 1;
+      if(j<ncol(dom))
+        prod = prod(dom[1,(j+1):ncol(dom)])
+      ID = ID + I * prod;
+    }
+
+    # size pruning, with rowMin-rowMax transform 
+    # to avoid densification (ignored zeros)
+    map = table(ID, seq(1,nrow(P)))
+    ex = matrix(1,nrow(map),1)
+    ubSizes = 1/rowMaxs(map * (1/ex%*%t(ss)));
+    ubSizes = as.matrix(replace(ubSizes, Inf, 0));
+    fSizes = (ubSizes >= minSup)
+
+    # error pruning
+    ubError = 1/rowMaxs(map * (1/ex%*%t(se)));
+    ubError = as.matrix(replace(ubError, Inf, 0));
+    ubMError = 1/rowMaxs(map * (1/ex%*%t(sm)));
+    ubMError = as.matrix(replace(ubMError, Inf, 0));
+    ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
+    TMP3 = analyzeTopK(TKC);
+    minsc = TMP3[["minsc"]]
+    fScores = (ubScores > minsc & ubScores > 0)
+
+    # missing parents pruning
+    numParents = rowSums((map %*% P12) != 0)
+    fParents = (numParents == level);
+
+    # apply all pruning
+    map = map * ((fSizes & fScores & fParents) %*% matrix(1,1,ncol(map)));
+
+    # deduplication of join outputs
+    Dedup = as.matrix(map[rowMaxs(map)!=0,] != 0)
+    P = (Dedup %*% P) != 0
+  }
+  return (P)
+}
+
+evalSlice = function(X, e, eAvg, tS, l, alpha) {
+  I = (X %*% tS) == l;           # slice indicator
+  ss = as.matrix(colSums(I));    # absolute slice size (nnz)
+  se = as.matrix(t(t(e) %*% I)); # absolute slice error
+  sm = as.matrix(colMaxs(I * e%*%matrix(1,1,ncol(I)))); # maximum tuple error in slice
+
+  # score of relative error and relative size
+  sc = score(ss, se, eAvg, alpha, nrow(X));
+  R = as.matrix(cbind(sc, se, sm, ss));
+  return (R)
+}
+
+decodeTopK = function(TK, foffb, foffe) {
+  R = matrix(1, nrow(TK), ncol(foffb));
+  if( nrow(TK) > 0 ) {
+    for( j in 1:ncol(foffb) ) {
+      beg = foffb[1,j]+1;
+      end = foffe[1,j];
+      I = rowSums(TK[,beg:end]) * max.col(TK[,beg:end],ties.method="last");
+      R[, j] = I;
+    }
+  }
+  return (R)
+}
+
+################################################################################
+X = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+e = as.matrix(readMM(paste(args[1], "e.mtx", sep="")))
+k = as.integer(args[2]);
+tpEval = as.logical(args[3])
+
+TMP = slicefinder(X=X, e=e, k=k, alpha=0.95, minSup=4, tpEval=tpEval, verbose=TRUE);
+R = TMP[["TKC"]]
+
+writeMM(as(R, "CsparseMatrix"), paste(args[4], "R", sep="")); 
diff --git a/src/test/scripts/functions/builtin/slicefinder.dml b/src/test/scripts/functions/builtin/slicefinder.dml
index 96d5313..1027b17 100644
--- a/src/test/scripts/functions/builtin/slicefinder.dml
+++ b/src/test/scripts/functions/builtin/slicefinder.dml
@@ -19,24 +19,10 @@
 #
 #-------------------------------------------------------------
 
-FXY = read($1, data_type="frame", format="csv", header=TRUE);
-
-F = FXY[,1:ncol(FXY)-1];
-y = as.matrix(FXY[,ncol(FXY)]);
-
-# data preparation
-jspec= "{ ids:true, recode:[1,2,3,6],bin:[" 
-      +"{id:4, method:equi-width, numbins:14},"
-      +"{id:5, method:equi-width, numbins:12}]}"
-[X,M] = transformencode(target=F, spec=jspec);
-X = X[,2:ncol(X)]
-
-# learn model
-B = lm(X=X, y=y, verbose=FALSE);
-yhat = X %*% B;
-e = (y-yhat)^2;
+X = read($1);
+e = read($2);
 
 # call slice finding
-[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, tpEval=$3, verbose=$4);
+[TS,TR] = slicefinder(X=X, e=e, k=$3, alpha=0.95, minSup=4, tpEval=$4, verbose=$5);
 
-write(TKC, $5)
+write(TR, $6)
diff --git a/src/test/scripts/functions/builtin/slicefinder.dml b/src/test/scripts/functions/builtin/slicefinderPrep.dml
similarity index 91%
copy from src/test/scripts/functions/builtin/slicefinder.dml
copy to src/test/scripts/functions/builtin/slicefinderPrep.dml
index 96d5313..036c0ff 100644
--- a/src/test/scripts/functions/builtin/slicefinder.dml
+++ b/src/test/scripts/functions/builtin/slicefinderPrep.dml
@@ -36,7 +36,5 @@ B = lm(X=X, y=y, verbose=FALSE);
 yhat = X %*% B;
 e = (y-yhat)^2;
 
-# call slice finding
-[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, tpEval=$3, verbose=$4);
-
-write(TKC, $5)
+write(X, $2)
+write(e, $3)