Posted to commits@systemml.apache.org by mb...@apache.org on 2016/07/25 20:18:22 UTC

incubator-systemml git commit: [SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 3841ca88e -> fceb6620e


[SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)

So far, we used best-effort cache blocking to ensure row reuse. However,
with high sparsity and large factors, this led to severe cache thrashing.
We now use multi-level cache blocking: a best-effort block size for reuse,
followed by sub-blocking to the L2 cache size. This also generalizes the
custom blocking previously used for wdivmm_left.
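
For intuition, the following minimal sketch shows how "best effort for reuse"
and "blocking for L2 cache size" translate into block sizes; rows, cols, nnz,
and rank are hypothetical stand-ins for mW.rlen, mW.clen, mW.nonZeros, and
mU.clen, and the constants mirror the patch below (illustrative only, not the
actual LibMatrixMult code).

public class BlockSizeSketch {
	//common L2 cache size assumed by the patch (256KB)
	private static final int L2_CACHESIZE = 256 * 1024;

	public static void main(String[] args) {
		//hypothetical inputs: 200K x 200K W with sparsity=0.001, rank-100 factors
		long rows = 200_000, cols = 200_000, nnz = 40_000_000L;
		int rank = 100;

		//level 1: block size chosen so each Ui/Vj row vector is reused ~8 times on average
		int blocksizeIJ = (int) (8L * rows * cols / nnz);

		//level 2: sub-block size chosen so two factor row vectors of doubles (8 bytes each)
		//fit into the L2 cache, which also covers the output block of wdivmm_left
		int blocksizeIJ2 = L2_CACHESIZE / (2 * rank * 8);

		System.out.println("blocksizeIJ=" + blocksizeIJ + ", blocksizeIJ2=" + blocksizeIJ2);
	}
}

For the 200K x 200K, sparsity=0.001, rank=100 setting reported below, this
works out to blocksizeIJ=8000 and blocksizeIJ2=163.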

The performance improvements on a 200K x 200K input with sparsity=0.001
and rank=100 were substantial for both wdivmm_left (2400ms -> 540ms) and
wdivmm_right (1500ms -> 600ms).

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/fceb6620
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/fceb6620
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/fceb6620

Branch: refs/heads/master
Commit: fceb6620e7bc29a097dd7ac6057f963b3a2ff235
Parents: 3841ca8
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Sun Jul 24 23:30:33 2016 -0700
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Sun Jul 24 23:30:33 2016 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 99 +++++++++++---------
 1 file changed, 54 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/fceb6620/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 57ce557..b5fe663 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -68,9 +68,10 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public class LibMatrixMult 
 {
 	//internal configuration
-	public static final boolean LOW_LEVEL_OPTIMIZATION = true;
-	public static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
+	private static final boolean LOW_LEVEL_OPTIMIZATION = true;
+	private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
 	private static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
+	private static final int L2_CACHESIZE = 256 *1024; //256KB (common size)
 	
 	private LibMatrixMult() {
 		//prevent instantiation via private constructor
@@ -2737,15 +2738,18 @@ public class LibMatrixMult
 		//approach: iterate over non-zeros of w, selective mm computation
 		//blocked over ij, while maintaining front of column indexes, where the
 		//blocksize is chosen such that we reuse each  Ui/Vj vector on average 8 times,
-		//with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
-		final int blocksizeI = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
-		final int blocksizeJ = left ? Math.max(8,Math.min(256*1024/(mU.clen*8), blocksizeI)) : blocksizeI; 
-		int[] curk = new int[blocksizeI];		
-		boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeI] : null;
+		//we use an additional ij blocking to ensure that Ui/Vj as well as the output
+		//in case of wdivmm_left fit into L2 cache (avoid L2/LLC misses).
+		final int blocksizeIJ = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
+		final int blocksizeIJ2 = L2_CACHESIZE / (2*mU.clen*8); //mU guaranteed <=1000
 		
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+		int[] curk = new int[blocksizeIJ];		
+		boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeIJ] : null;
+		
+		//blocked execution over row/column blocks
+		for( int bi=rl; bi<ru; bi+=blocksizeIJ ) 
 		{
-			int bimin = Math.min(ru, bi+blocksizeI);
+			int bimin = Math.min(ru, bi+blocksizeIJ);
 			//prepare starting indexes for block row
 			for( int i=bi; i<bimin; i++ ) {
 				int k = (cl==0||w.isEmpty(i)) ? 0 : w.posFIndexGTE(i,cl);
@@ -2755,47 +2759,52 @@ public class LibMatrixMult
 			if( four && !scalar )
 				for( int i=bi; i<bimin; i++ )
 					aligned[i-bi] = w.isAligned(i-bi, x);
-			//blocked execution over column blocks
-			for( int bj = cl; bj < cu; bj+=blocksizeJ ) 
+			
+			for( int bj=cl; bj<cu; bj+=blocksizeIJ )  
 			{
-				int bjmin = Math.min(cu, bj+blocksizeJ);
-				for( int i=bi, uix=bi*cd; i<bimin; i++, uix+=cd ) {
-					if( !w.isEmpty(i) ) {
-						int wpos = w.pos(i);
-						int wlen = w.size(i);
-						int[] wix = w.indexes(i);
-						double[] wval = w.values(i);				
-						
-						int k = wpos + curk[i-bi];
-						if( basic ) {
-							for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
-								ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
-						}
-						else if( four ) { //left/right
-							//checking alignment per row is ok because early abort if false, 
-							//row nnz likely fit in L1/L2 cache, and asymptotically better if aligned
-							if( !scalar && w.isAligned(i, x) ) {
-								//O(n) where n is nnz in w/x 
-								double[] xvals = x.values(i);
-								for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
-									wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+				//blocked execution over row/column blocks for L2
+				for( int bi2=bi, bjmin=Math.min(cu, bj+blocksizeIJ); bi2<bimin; bi2+=blocksizeIJ2)					
+					for( int bj2=bj, bimin2=Math.min(bimin, bi2+blocksizeIJ2); bj2<bjmin; bj2+=blocksizeIJ2 ) {
+							
+						//core wdivmm block matrix mult
+						for( int i=bi2, uix=bi2*cd, bjmin2=Math.min(bjmin, bj2+blocksizeIJ2); i<bimin2; i++, uix+=cd ) {
+							if( w.isEmpty(i) ) continue;
+							
+							int wpos = w.pos(i);
+							int wlen = w.size(i);
+							int[] wix = w.indexes(i);
+							double[] wval = w.values(i);				
+							
+							int k = wpos + curk[i-bi2];
+							if( basic ) {
+								for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+									ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
 							}
-							else {
-								//scalar or O(n log m) where n/m are nnz in w/x
-								for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
-									if (scalar)
-										wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
-									else
-										wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, scalar, cd);
+							else if( four ) { //left/right
+								//checking alignment per row is ok because early abort if false, 
+								//row nnz likely fit in L1/L2 cache, and asymptotically better if aligned
+								if( !scalar && w.isAligned(i, x) ) {
+									//O(n) where n is nnz in w/x 
+									double[] xvals = x.values(i);
+									for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+										wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+								}
+								else {
+									//scalar or O(n log m) where n/m are nnz in w/x
+									for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+										if (scalar)
+											wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
+										else
+											wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, scalar, cd);
+								}
 							}
+							else { //left/right minus default
+								for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+									wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
+							}
+							curk[i-bi2] = k - wpos;
 						}
-						else { //left/right minus default
-							for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
-								wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
-						}
-						curk[i-bi] = k - wpos;
 					}
-				}
 			}
 		}
 	}
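
For readers skimming the diff, the new control flow reduces to the following
two-level blocking skeleton (a simplified sketch: the sparse-row traversal of
W, the curk index fronts, and the dotProduct/wdivmm kernels are omitted).

	//Skeleton of the multi-level blocking above (sketch only, not the full method)
	static void wdivmmBlockedSketch(int rl, int ru, int cl, int cu,
		int blocksizeIJ, int blocksizeIJ2)
	{
		//level 1: best-effort blocking for Ui/Vj reuse
		for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
			int bimin = Math.min(ru, bi+blocksizeIJ);
			for( int bj=cl; bj<cu; bj+=blocksizeIJ ) {
				int bjmin = Math.min(cu, bj+blocksizeIJ);
				//level 2: sub-blocks sized so Ui/Vj (and the wdivmm_left output) stay in L2
				for( int bi2=bi; bi2<bimin; bi2+=blocksizeIJ2 ) {
					int bimin2 = Math.min(bimin, bi2+blocksizeIJ2);
					for( int bj2=bj; bj2<bjmin; bj2+=blocksizeIJ2 ) {
						int bjmin2 = Math.min(bjmin, bj2+blocksizeIJ2);
						//core wdivmm block: process nonzeros w(i,j) with
						//bi2 <= i < bimin2 and bj2 <= j < bjmin2
					}
				}
			}
		}
	}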