You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/03 20:13:40 UTC

[systemds] branch master updated: [SYSTEMDS-2641] New slice finding algorithm (dml implementation)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 92884d6  [SYSTEMDS-2641] New slice finding algorithm (dml implementation)
92884d6 is described below

commit 92884d6b8f11451d4ffc018720ee0a24e8296fb3
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Thu Sep 3 22:12:30 2020 +0200

    [SYSTEMDS-2641] New slice finding algorithm (dml implementation)
    
    This patch adds the new slice finding implementation via linear algebra
    (SliceLine). Even without full pruning, this implementation performs
    exact slice finding for the Salaries dataset in 2.6s (incl compilation
    and startup).
---
 scripts/staging/slicing/slicing.dml | 242 ++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

diff --git a/scripts/staging/slicing/slicing.dml b/scripts/staging/slicing/slicing.dml
new file mode 100644
index 0000000..b458d7a
--- /dev/null
+++ b/scripts/staging/slicing/slicing.dml
@@ -0,0 +1,242 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#-------------------------------------------------------------
+# X         Input matrix (integer encoded [1..v])
+# e         error vector (classification accuracy, l2 norm, etc)
+# k         top-K subsets / slices
+# minSup    minimum support (min number of rows per slice)
+# w         weight [0,1]: 0 only size, 1 only error
+# ------------------------------------------------------------
+# TK        top-k slices (k x ncol(X) if successful) 
+# TKC       score, size, error of slices (k x 3)
+# ------------------------------------------------------------
+
+slicing = function(Matrix[Double] X, Matrix[Double] e, Integer k = 4, Integer minSup = 4, Double w = 0.5, Boolean verbose = FALSE) 
+  return(Matrix[Double] TK, Matrix[Double] TKC)
+{
+  m = nrow(X);
+  n = ncol(X);
+
+  # prepare offset vectors and one-hot encoded X
+  fdom = colMaxs(X);
+  foffb = t(cumsum(t(fdom))) - fdom;
+  foffe = t(cumsum(t(fdom)))
+  rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
+  cix = matrix(X + foffb, m*n, 1);
+  X2 = table(rix, cix); #one-hot encoded
+
+  # initialize interesting statistics
+  n2 = ncol(X2);          # one-hot encoded features
+  eAvg = sum(e) / m;      # average error
+  CID = seq(1,n2);        # column IDs
+  cCnts = t(colSums(X2)); # column counts
+  err = t(t(e) %*% X2)    # total error vector
+
+  if( verbose ) {
+    drop = as.integer(sum(cCnts < minSup));
+    print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+".");
+  }
+
+  # working set of active slices (#attr x #slices) and top k
+  selCols = (cCnts >= minSup);
+  attr = removeEmpty(target=CID, margin="rows", select=selCols);
+  cCnts = removeEmpty(target=cCnts, margin="rows", select=selCols);
+  err = removeEmpty(target=err, margin="rows", select=selCols);
+  S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
+  continue = ncol(S) > 0 & sum(S) > 0;
+  level = 1;
+
+  # score 1-slices and create initial top-k 
+  R = scoreSlices(cCnts, err, m, eAvg, w);
+
+  [TK, TKC] = maintainTopK(S, R, matrix(0, 0, n2), matrix(0, 0, 3), k, minSup);
+
+  if( verbose ) {
+    [maxsc, minsc] = analyzeTopK(TKC);
+    print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc)
+  }
+
+  # lattice enumeration w/ size/error pruning, one iteration per level
+  while( continue ) {
+    level = level + 1;
+
+    # enumerate candidate join pairs, incl size/error pruning 
+    enumC = getPairedCandidates(S, R, TK, TKC, k, level, minSup, foffb, foffe); 
+
+    if(verbose) {
+      print("SliceFinder: level "+level+":")
+      print(" -- generated paired slice candidates: "+nrow(S)+" -> "+nrow(enumC));      
+    }
+
+    # extract and evaluate candidate slices
+    #  note: this could be done as a single matrix multiply, but to avoid
+    #  large intermediates we use repeated matrix-vector multiplication
+    R = matrix(0, nrow(enumC), 3)
+    parfor( i in 1:nrow(enumC) )
+      R[i,] = evalSlice(X2, e, t(enumC[i,]), level, w);
+
+    # maintain top-k after evaluation
+    [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
+
+    # prune slices after evaluation and new top-K
+    # TODO evaluate if useful -> more pruning if at least 1 below threhsold?
+    [S, R] = pruneSlices(enumC, R, TK, TKC, k, minSup);
+    #S = enumC;
+
+    if(verbose) {
+      [maxsc, minsc] = analyzeTopK(TKC);
+      print(" -- after eval and pruning: "+nrow(S));
+      print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
+    }
+
+    # termination condition (max #feature levels)
+    continue = ncol(S) > 0 & sum(S) > 0 & level < n;
+  }
+
+  if( verbose ) {
+    print(sum(TK));
+    print("SliceFinder: terminated at level "+level+":\n"
+      + toString(TK) + "\n" + toString(TKC));
+  }
+}
+
+maintainTopK = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) 
+  return(Matrix[Double] TK, Matrix[Double] TKC)
+{
+  # prune invalid minSup and scores
+  I = (R[,1] > 1) & (R[,3] >= minSup);
+
+  if( sum(I)!=0 ) {
+    S = removeEmpty(target=S, margin="rows", select=I);
+    R = removeEmpty(target=R, margin="rows", select=I);
+
+    # evaluated candidated and previous top-k
+    slices = rbind(TK, S);
+    scores = rbind(TKC, R);
+
+    # extract top-k
+    IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
+    IX = IX[1:min(k,nrow(IX)),];
+    P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
+    TK = P %*% slices;
+    TKC = P %*% scores;
+  }
+}
+
+analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
+  maxsc = ifelse(nrow(TKC)>0, as.scalar(TKC[1,1]), NaN);
+  minsc = ifelse(nrow(TKC)>0, as.scalar(TKC[nrow(TKC),1]), NaN);
+}
+
+
+scoreSlices = function(Matrix[Double] ss, Matrix[Double] se, Integer m, Double eAvg, Double w) 
+  return(Matrix[Double] C)
+{
+  sc = w * (se/ss / eAvg) + (1-w) * ss/m;
+  C = cbind(sc, se, ss);
+}
+
+evalSlice = function(Matrix[Double] X, Matrix[Double] e, Matrix[Double] s, Integer l, Double w = 0.5) 
+  return(Matrix[Double] C)
+{
+  I = (X %*% s) == l;          # slice indicator 
+  ss = sum(I);                 # absolute slice size (nnz)
+  se = as.scalar(t(I) %*% e);  # absolute slice error 
+
+  # score of relative error and relative size
+  sc = w * (se/ss / sum(e)/nrow(X)) + (1-w) * ss/nrow(X);
+  C = t(as.matrix(list(sc, se, ss)));
+}
+
+getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, Integer minSup, Matrix[Double] foffb, Matrix[Double] foffe)
+  return(Matrix[Double] P) 
+{
+while(FALSE){}
+  # join compatible slices (without self)
+  join = S %*% t(S) == (level-2)
+  
+  # pruning by size (at least one below threshold)
+  vsize = outer(R[,3], t(R[,3]), "min") >= minSup;
+  I = join * vsize;
+  I = upper.tri(target=I, diag=FALSE);
+
+  # pair construction
+  nr = nrow(I); nc = ncol(I);
+  rix = matrix(I * seq(1,nr), nr*nc, 1);
+  cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
+  rix = removeEmpty(target=rix, margin="rows");
+  cix = removeEmpty(target=cix, margin="rows");
+  P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
+  P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
+  P = ((P1 %*% S) + (P2 %*% S)) != 0;
+
+  # prune invalid self joins (>1 bit per feature)
+  I = matrix(1, nrow(P), 1);
+  for( j in 1:ncol(foffb) ) {
+    beg = as.scalar(foffb[1,j])+1;
+    end = as.scalar(foffe[1,j]);
+    I = I & (rowSums(P[,beg:end]) <= 1);
+  }
+  P = removeEmpty(target=P, margin="rows", select=I);
+
+  # deduplication
+  # TODO additional size pruning given dedup mapping
+  ID = matrix(1, nrow(P), 1);
+  dom = foffe-foffb;
+  for( j in 1:ncol(dom) ) {
+    beg = as.scalar(foffb[1,j])+1;
+    end = as.scalar(foffe[1,j]);
+    I = rowIndexMax(P[,beg:end]);
+    prod = 1;
+    if(j<ncol(dom))
+      prod = prod(dom[1,(j+1):ncol(dom)])
+    ID = ID + I * prod;
+  }
+  Dedup = removeEmpty(target=table(ID,seq(1,nrow(P))), margin="rows") != 0
+  P = Dedup %*% P
+}
+
+pruneSlices = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup)
+  return(Matrix[Double] S, Matrix[Double] R) 
+{
+  I = R[,3] >= minSup;
+  S = removeEmpty(target=S, margin="rows", select=I);
+  R = removeEmpty(target=R, margin="rows", select=I);
+}
+
+Forig = read("./Salaries.csv", data_type="frame", format="csv", header=TRUE);
+
+F = Forig[,1:ncol(Forig)-1];
+y = as.matrix(Forig[,ncol(Forig)]);
+
+# data preparation
+jspec= "{ ids:true, recode:[1,2,3,6], bin:[{id:4, method:equi-width, numbins:14},{id:5, method:equi-width, numbins:12}]}"
+[X,M] = transformencode(target=F, spec=jspec);
+X = X[,2:ncol(X)]
+
+# learn model
+B = lm(X=X, y=y, verbose=FALSE);
+yhat = X %*% B;
+e = (y-yhat)^2;
+
+# call slice finding
+[S,C] = slicing(X=X, e=e, k=4, w=0.5, minSup=4, verbose=TRUE);