You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/28 08:11:47 UTC
[2/3] systemml git commit: [SYSTEMML-1976] Performance codegen outer
ops w/ ultra-sparse inputs
[SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs
This patch improves the performance of codegen outer operations
(especially left mm), for ultra-sparse inputs. On ultra-sparse inputs,
the allocation and maintenance of column indexes can become a
bottleneck. Accordingly, this patch adds a special case for ultra-sparse
matrices.
For the core update rules of ALS-CG, this patch improved the performance
as follows (10 iterations over the amazon review dataset):
Left (t(t(U) %*% (W * (U %*% t(V))))): 125.7s -> 14.2s
Right ((W * (U %*% t(V))) %*% V): 25.2s -> 17.9s
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ede870de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ede870de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ede870de
Branch: refs/heads/master
Commit: ede870de8a7b01bd44ac3b6bcfe7f0e86b1c93c8
Parents: 83e01b0
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 27 22:01:36 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Oct 27 22:01:36 2017 -0700
----------------------------------------------------------------------
.../runtime/codegen/SpoofOuterProduct.java | 77 +++++++++++++-------
1 file changed, 52 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/ede870de/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index ac8dc57..26a661a 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
+import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
@@ -391,7 +392,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
c[0] = sum;
}
- private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars,
+ private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars,
double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu)
{
boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT);
@@ -401,37 +402,63 @@ public abstract class SpoofOuterProduct extends SpoofOperator
//blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times,
//with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
final int blocksizeI = (int) (8L*m*n/nnz);
- final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
- int[] curk = new int[Math.min(blocksizeI,ru-rl)];
- for( int bi = rl; bi < ru; bi+=blocksizeI )
+ if( OptimizerUtils.getSparsity(m, n, nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse
{
- int bimin = Math.min(ru, bi+blocksizeI);
- //prepare starting indexes for block row
- for( int i=bi; i<bimin; i++ ) {
+ //for ultra-sparse matrices, we do not allocate the index array because
+ //its allocation and maintenance can dominate the total runtime.
+
+ //core wdivmm block matrix mult
+ for( int i=rl, uix=rl*k; i<ru; i++, uix+=k ) {
+ if( sblock.isEmpty(i) ) continue;
+
+ int wpos = sblock.pos(i);
+ int wlen = sblock.size(i);
+ int[] wix = sblock.indexes(i);
+ double[] wval = sblock.values(i);
+
int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
- curk[i-bi] = (index>=0) ? index : n;
+ index = wpos + ((index>=0) ? index : n);
+ for( ; index<wpos+wlen && wix[index]<cu; index++ ) {
+ genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+ (left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+ }
}
+ }
+ else //sparse
+ {
+ final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
+ int[] curk = new int[Math.min(blocksizeI,ru-rl)];
- //blocked execution over column blocks
- for( int bj = cl; bj < cu; bj+=blocksizeJ )
+ for( int bi = rl; bi < ru; bi+=blocksizeI )
{
- int bjmin = Math.min(cu, bj+blocksizeJ);
- //core wdivmm block matrix mult
- for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
- if( sblock.isEmpty(i) ) continue;
-
- int wpos = sblock.pos(i);
- int wlen = sblock.size(i);
- int[] wix = sblock.indexes(i);
- double[] wval = sblock.values(i);
-
- int index = wpos + curk[i-bi];
- for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
- genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
- (left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+ int bimin = Math.min(ru, bi+blocksizeI);
+ //prepare starting indexes for block row
+ for( int i=bi; i<bimin; i++ ) {
+ int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
+ curk[i-bi] = (index>=0) ? index : n;
+ }
+
+ //blocked execution over column blocks
+ for( int bj = cl; bj < cu; bj+=blocksizeJ )
+ {
+ int bjmin = Math.min(cu, bj+blocksizeJ);
+ //core wdivmm block matrix mult
+ for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+ if( sblock.isEmpty(i) ) continue;
+
+ int wpos = sblock.pos(i);
+ int wlen = sblock.size(i);
+ int[] wix = sblock.indexes(i);
+ double[] wval = sblock.values(i);
+
+ int index = wpos + curk[i-bi];
+ for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+ genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+ (left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+ }
+ curk[i-bi] = index - wpos;
}
- curk[i-bi] = index - wpos;
}
}
}