You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/07/25 20:18:22 UTC
incubator-systemml git commit: [SYSTEMML-809] Performance
sparse-dense wdivmm (multi-level blocking)
Repository: incubator-systemml
Updated Branches:
refs/heads/master 3841ca88e -> fceb6620e
[SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)
So far we used a best-effort cache blocking to ensure row reuse.
However, with high sparsity and large factors this led to severe cache
thrashing effects. We now use a multi-level cache blocking: best effort
for reuse and subsequent blocking for L2 cache size. This also
generalizes the custom blocking for wdivmm_left.
The performance improvements on 200k x 200k, sparsity=0.001, rank=100
were substantial for both wdivmm left (2400ms -> 540ms) and wdivmm_right
(1500ms -> 600ms).
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/fceb6620
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/fceb6620
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/fceb6620
Branch: refs/heads/master
Commit: fceb6620e7bc29a097dd7ac6057f963b3a2ff235
Parents: 3841ca8
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Sun Jul 24 23:30:33 2016 -0700
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Sun Jul 24 23:30:33 2016 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixMult.java | 99 +++++++++++---------
1 file changed, 54 insertions(+), 45 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/fceb6620/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 57ce557..b5fe663 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -68,9 +68,10 @@ import org.apache.sysml.runtime.util.UtilFunctions;
public class LibMatrixMult
{
//internal configuration
- public static final boolean LOW_LEVEL_OPTIMIZATION = true;
- public static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
+ private static final boolean LOW_LEVEL_OPTIMIZATION = true;
+ private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
private static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
+ private static final int L2_CACHESIZE = 256 *1024; //256KB (common size)
private LibMatrixMult() {
//prevent instantiation via private constructor
@@ -2737,15 +2738,18 @@ public class LibMatrixMult
//approach: iterate over non-zeros of w, selective mm computation
//blocked over ij, while maintaining front of column indexes, where the
//blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times,
- //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
- final int blocksizeI = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
- final int blocksizeJ = left ? Math.max(8,Math.min(256*1024/(mU.clen*8), blocksizeI)) : blocksizeI;
- int[] curk = new int[blocksizeI];
- boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeI] : null;
+ //we use an additional ij blocking to ensure that Ui/Vj as well as the output
+ //in case of wdivmm_left fit into L2 cache (avoid L2/LLC misses).
+ final int blocksizeIJ = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
+ final int blocksizeIJ2 = L2_CACHESIZE / (2*mU.clen*8); //mU guaranteed <=1000
- for( int bi = rl; bi < ru; bi+=blocksizeI )
+ int[] curk = new int[blocksizeIJ];
+ boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeIJ] : null;
+
+ //blocked execution over row/column blocks
+ for( int bi=rl; bi<ru; bi+=blocksizeIJ )
{
- int bimin = Math.min(ru, bi+blocksizeI);
+ int bimin = Math.min(ru, bi+blocksizeIJ);
//prepare starting indexes for block row
for( int i=bi; i<bimin; i++ ) {
int k = (cl==0||w.isEmpty(i)) ? 0 : w.posFIndexGTE(i,cl);
@@ -2755,47 +2759,52 @@ public class LibMatrixMult
if( four && !scalar )
for( int i=bi; i<bimin; i++ )
aligned[i-bi] = w.isAligned(i-bi, x);
- //blocked execution over column blocks
- for( int bj = cl; bj < cu; bj+=blocksizeJ )
+
+ for( int bj=cl; bj<cu; bj+=blocksizeIJ )
{
- int bjmin = Math.min(cu, bj+blocksizeJ);
- for( int i=bi, uix=bi*cd; i<bimin; i++, uix+=cd ) {
- if( !w.isEmpty(i) ) {
- int wpos = w.pos(i);
- int wlen = w.size(i);
- int[] wix = w.indexes(i);
- double[] wval = w.values(i);
-
- int k = wpos + curk[i-bi];
- if( basic ) {
- for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
- ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
- }
- else if( four ) { //left/right
- //checking alignment per row is ok because early abort if false,
- //row nnz likely fit in L1/L2 cache, and asymptotically better if aligned
- if( !scalar && w.isAligned(i, x) ) {
- //O(n) where n is nnz in w/x
- double[] xvals = x.values(i);
- for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
- wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+ //blocked execution over row/column blocks for L2
+ for( int bi2=bi, bjmin=Math.min(cu, bj+blocksizeIJ); bi2<bimin; bi2+=blocksizeIJ2)
+ for( int bj2=bj, bimin2=Math.min(bimin, bi2+blocksizeIJ2); bj2<bjmin; bj2+=blocksizeIJ2 ) {
+
+ //core wdivmm block matrix mult
+ for( int i=bi2, uix=bi2*cd, bjmin2=Math.min(bjmin, bj2+blocksizeIJ2); i<bimin2; i++, uix+=cd ) {
+ if( w.isEmpty(i) ) continue;
+
+ int wpos = w.pos(i);
+ int wlen = w.size(i);
+ int[] wix = w.indexes(i);
+ double[] wval = w.values(i);
+
+ int k = wpos + curk[i-bi2];
+ if( basic ) {
+ for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+ ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
}
- else {
- //scalar or O(n log m) where n/m are nnz in w/x
- for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
- if (scalar)
- wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
- else
- wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, scalar, cd);
+ else if( four ) { //left/right
+ //checking alignment per row is ok because early abort if false,
+ //row nnz likely fit in L1/L2 cache, and asymptotically better if aligned
+ if( !scalar && w.isAligned(i, x) ) {
+ //O(n) where n is nnz in w/x
+ double[] xvals = x.values(i);
+ for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+ wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+ }
+ else {
+ //scalar or O(n log m) where n/m are nnz in w/x
+ for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+ if (scalar)
+ wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
+ else
+ wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, scalar, cd);
+ }
}
+ else { //left/right minus default
+ for( ; k<wpos+wlen && wix[k]<bjmin2; k++ )
+ wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
+ }
+ curk[i-bi2] = k - wpos;
}
- else { //left/right minus default
- for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
- wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
- }
- curk[i-bi] = k - wpos;
}
- }
}
}
}