You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/15 19:14:02 UTC

[systemds] branch master updated (d181ea5 -> 9512810)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git.


    from d181ea5  [MINOR] Fix parsing gpu binary aggregate instruction
     new 366521f  [MINOR] Performance sparse-sparse permutation matrix multiply
     new 9512810  [SYSTEMDS-2641] Improved slicefinder builtin (hybrid tp, pruning)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 scripts/builtin/slicefinder.dml                    | 30 +++++++++++++---------
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 25 ++++++++++--------
 .../sysds/runtime/matrix/data/MatrixBlock.java     |  6 ++---
 .../functions/builtin/BuiltinSliceFinderTest.java  |  2 +-
 src/test/scripts/functions/builtin/slicefinder.dml |  2 +-
 5 files changed, 37 insertions(+), 28 deletions(-)

[systemds] 01/02: [MINOR] Performance sparse-sparse permutation matrix multiply

Posted by mb...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 366521f9298a465a4b260287c29a453b4eb47572
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Tue Sep 15 21:09:24 2020 +0200

    [MINOR] Performance sparse-sparse permutation matrix multiply
    
    This patch leverages the existing kernels for ultra-sparse permutation
    matrix multiply (where the ultra-sparse left-hand-side copies rows of
    the sparse or dense right-hand-side) for scenarios where the lhs is a
    sparse permutation matrix but does not yet qualify as ultra-sparse.
    
    As this approach avoids dense output allocation it can change the
    asymptotic behavior. For example, in slicefinder over the KDD98 dataset,
    the expression (P1 %*% S + P2 %*% S) != 0 took ~420s because the
    3,061,575 x 7,909 was allocated in dense (190GB), and after the
    operation converted to sparse. With the new fine-tuning, we now use the
    existing permutation matrix multiply kernel and allocate the output
    directly in sparse, which improved performance to 431ms (1000x).
---
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 25 ++++++++++++----------
 .../sysds/runtime/matrix/data/MatrixBlock.java     |  6 +++---
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 671eea7..db61927 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -122,8 +122,9 @@ public class LibMatrixMult
 		//Timing time = new Timing(true);
 		
 		//pre-processing: output allocation
+		boolean m1Perm = m1.isSparsePermutationMatrix();
 		boolean ultraSparse = (fixedRet && ret.sparse)
-			|| (!fixedRet && isUltraSparseMatrixMult(m1, m2));
+			|| (!fixedRet && isUltraSparseMatrixMult(m1, m2, m1Perm));
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
 		ret.sparse = ultraSparse;
@@ -137,7 +138,7 @@ public class LibMatrixMult
 		
 		//core matrix mult computation
 		if( ultraSparse )
-			matrixMultUltraSparse(m1, m2, ret, 0, ru2);
+			matrixMultUltraSparse(m1, m2, ret, m1Perm, 0, ru2);
 		else if(!m1.sparse && !m2.sparse)
 			matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, cu);
 		else if(m1.sparse && m2.sparse)
@@ -184,7 +185,8 @@ public class LibMatrixMult
 		
 		//pre-processing: output allocation (in contrast to single-threaded,
 		//we need to allocate sparse as well in order to prevent synchronization)
-		boolean ultraSparse = isUltraSparseMatrixMult(m1, m2);
+		boolean m1Perm = m1.isSparsePermutationMatrix();
+		boolean ultraSparse = isUltraSparseMatrixMult(m1, m2, m1Perm);
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
 		ret.sparse = ultraSparse;
@@ -207,7 +209,7 @@ public class LibMatrixMult
 			ArrayList<MatrixMultTask> tasks = new ArrayList<>();
 			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r||pm2c));
 			for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
-				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, lb, lb+blklens.get(i)));
+				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, lb, lb+blklens.get(i)));
 			//execute tasks
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -1509,14 +1511,14 @@ public class LibMatrixMult
 	 * @param rl row lower bound
 	 * @param ru row upper bound
 	 */
-	private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
+	private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean m1Perm, int rl, int ru) {
 		final boolean leftUS = m1.isUltraSparse()
 			|| (m1.isUltraSparse(false) && !m2.isUltraSparse())
 			|| (m1.sparse && !m2.sparse);
 		
 		if( m1 == m2 ) //self-product
 			matrixMultUltraSparseSelf(m1, ret, rl, ru);
-		else if( leftUS )
+		else if( leftUS || m1Perm )
 			matrixMultUltraSparseLeft(m1, m2, ret, rl, ru);
 		else
 			matrixMultUltraSparseRight(m1, m2, ret, rl, ru);
@@ -3833,7 +3835,7 @@ public class LibMatrixMult
 			||(!leftTranspose && FPfactor * m1.clen * m1.rlen * m1.rlen > threshold));
 	}
 	
-	public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2) {
+	public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2, boolean m1Perm) {
 		if( m2.clen == 1 ) //mv always dense
 			return false;
 		//note: ultra-sparse matrix mult implies also sparse outputs, hence we need
@@ -3842,8 +3844,7 @@ public class LibMatrixMult
 			m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, true);
 		return (m1.isUltraSparse() || m2.isUltraSparse()) //base case
 			|| (m1.isUltraSparse(false) && m1 == m2) //ultra-sparse self product
-			|| (m1.isUltraSparsePermutationMatrix() 
-				&& OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0)
+			|| (m1Perm && OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0)
 			|| ((m1.isUltraSparse(false) || m2.isUltraSparse(false)) 
 				&& outSp < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2)
 			|| (m1.getSparsity() < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2
@@ -3949,17 +3950,19 @@ public class LibMatrixMult
 		private final boolean _tm2; //transposed m2
 		private final boolean _pm2r; //par over m2 rows
 		private final boolean _pm2c; //par over m2 rows
+		private final boolean _m1Perm; //sparse permutation
 		private final int _rl;
 		private final int _ru;
 
 		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
-				boolean tm2, boolean pm2r, boolean pm2c, int rl, int ru )
+			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, int rl, int ru )
 		{
 			_m1 = m1;
 			_m2 = m2;
 			_tm2 = tm2;
 			_pm2r = pm2r;
 			_pm2c = pm2c;
+			_m1Perm = m1Perm;
 			_rl = rl;
 			_ru = ru;
 			
@@ -3986,7 +3989,7 @@ public class LibMatrixMult
 			
 			//compute block matrix multiplication
 			if( _ret.sparse ) //ultra-sparse
-				matrixMultUltraSparse(_m1, _m2, _ret, rl, ru);
+				matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru);
 			else if(!_m1.sparse && !_m2.sparse)
 				matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu);
 			else if(_m1.sparse && _m2.sparse)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index 0eb484a..032b79e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -958,11 +958,11 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			&& (!checkNnz || nonZeros<ULTRA_SPARSE_BLOCK_NNZ);
 	}
 	
-	public boolean isUltraSparsePermutationMatrix() {
-		if( !isUltraSparse(false) )
+	public boolean isSparsePermutationMatrix() {
+		if( !isInSparseFormat() )
 			return false;
-		boolean isPM = true;
 		SparseBlock sblock = getSparseBlock();
+		boolean isPM = (sblock != null);
 		for( int i=0; i<rlen & isPM; i++ )
 			isPM &= sblock.isEmpty(i) || sblock.size(i) == 1;
 		return isPM;

[systemds] 02/02: [SYSTEMDS-2641] Improved slicefinder builtin (hybrid tp, pruning)

Posted by mb...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 95128102e5269aaa5754283b9d2760b78a8e746c
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Tue Sep 15 21:13:13 2020 +0200

    [SYSTEMDS-2641] Improved slicefinder builtin (hybrid tp, pruning)
    
    - New hybrid task-parallel execution w/ exposed block size (2x
    performance improvement for many slices)
    - New pruning before pair enumeration for zero error
    - Slice extraction via separate permutation matrix multiplies
---
 scripts/builtin/slicefinder.dml                    | 30 +++++++++++++---------
 .../functions/builtin/BuiltinSliceFinderTest.java  |  2 +-
 src/test/scripts/functions/builtin/slicefinder.dml |  2 +-
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/slicefinder.dml
index 6b00b0c..731f3f9 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/slicefinder.dml
@@ -26,8 +26,9 @@
 # maxL      maximum level L (conjunctions of L predicates), 0 unlimited
 # minSup    minimum support (min number of rows per slice)
 # alpha     weight [0,1]: 0 only size, 1 only error
-# dpEval    flag for data-parallel slice evaluation, 
-#           otherwise task-parallel
+# tpEval    flag for task-parallel slice evaluation, 
+#           otherwise data-parallel
+# tpBlksz   block size for task-parallel execution (num slices) 
 # verbose   flag for verbose debug output 
 # ------------------------------------------------------------
 # TK        top-k slices (k x ncol(X) if successful) 
@@ -36,7 +37,7 @@
 
 m_slicefinder = function(Matrix[Double] X, Matrix[Double] e,
     Integer k = 4, Integer maxL = 0, Integer minSup = 32, Double alpha = 0.5,
-    Boolean dpEval = FALSE, Boolean verbose = FALSE)
+    Boolean tpEval = TRUE, Integer tpBlksz = 16, Boolean verbose = FALSE)
   return(Matrix[Double] TK, Matrix[Double] TKC)
 {
   m = nrow(X);
@@ -80,15 +81,19 @@ m_slicefinder = function(Matrix[Double] X, Matrix[Double] e,
     }
 
     # extract and evaluate candidate slices
-    if( dpEval ) { #data-parallel
-      R = evalSlice(X2, e, eAvg, t(S), level, alpha);
-    }
-    else { # task-parallel
+    if( tpEval ) { # task-parallel
+      # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector 
       R = matrix(0, nrow(S), 4)
-      parfor( i in 1:nrow(S) )
-        R[i,] = evalSlice(X2, e, eAvg, t(S[i,]), level, alpha);
+      parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
+        beg = (i-1)*tpBlksz + 1; 
+        end = min(i*tpBlksz, nrow(R));
+        R[beg:end,] = evalSlice(X2, e, eAvg, t(S[beg:end,]), level, alpha);
+      }
     }
-
+    else { # data-parallel
+      R = evalSlice(X2, e, eAvg, t(S), level, alpha);
+    }
+    
     # maintain top-k after evaluation
     [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
 
@@ -199,7 +204,7 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
 {
   # prune invalid slices (possible without affecting overall
   # pruning effectiveness due to handling of missing parents)
-  pI = (R[,4] >= minSup);
+  pI = (R[,4] >= minSup & R[,2] > 0);
   S = removeEmpty(target=S, margin="rows", select=pI)
   R = removeEmpty(target=R, margin="rows", select=pI)
 
@@ -219,7 +224,8 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
     P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
     P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
     P12 = P1 + P2; # combined slice
-    P = (P12 %*% S) != 0;
+    P = (P1 %*% S + P2 %*% S) != 0;
+
     se = min(P1 %*% R[,2], P2 %*% R[,2])
     sm = min(P1 %*% R[,3], P2 %*% R[,3])
     ss = min(P1 %*% R[,4], P2 %*% R[,4])
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
index e918f5d..53a3499 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSliceFinderTest.java
@@ -104,7 +104,7 @@ public class BuiltinSliceFinderTest extends AutomatedTestBase {
 			//setOutputBuffering(false);
 			fullDMLScriptName = HOME + dml_test_name + ".dml";
 			programArgs = new String[]{"-args", data,
-				String.valueOf(K),String.valueOf(dp).toUpperCase(),
+				String.valueOf(K),String.valueOf(!dp).toUpperCase(),
 				String.valueOf(VERBOSE).toUpperCase(), output("R")};
 			runTest(true, false, null, -1);
 
diff --git a/src/test/scripts/functions/builtin/slicefinder.dml b/src/test/scripts/functions/builtin/slicefinder.dml
index 442ef79..96d5313 100644
--- a/src/test/scripts/functions/builtin/slicefinder.dml
+++ b/src/test/scripts/functions/builtin/slicefinder.dml
@@ -37,6 +37,6 @@ yhat = X %*% B;
 e = (y-yhat)^2;
 
 # call slice finding
-[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, dpEval=$3, verbose=$4);
+[TK,TKC] = slicefinder(X=X, e=e, k=$2, alpha=0.95, minSup=4, tpEval=$3, verbose=$4);
 
 write(TKC, $5)