You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/28 08:11:47 UTC
[2/3] systemml git commit: [SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

[SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

This patch improves the performance of codegen outer operations
(especially left mm), for ultra-sparse inputs. On ultra-sparse inputs,
the allocation and maintenance of column indexes can become a
bottleneck. Accordingly, this patch adds a special case for ultra-sparse
matrices.

For the core update rules of ALS-CG, this patch improved the performance
as follows (10 iterations over the amazon review dataset):
Left (t(t(U) %*% (W * (U %*% t(V))))): 125.7s -> 14.2s
Right ((W * (U %*% t(V))) %*% V): 25.2s -> 17.9s


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ede870de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ede870de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ede870de

Branch: refs/heads/master
Commit: ede870de8a7b01bd44ac3b6bcfe7f0e86b1c93c8
Parents: 83e01b0
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 27 22:01:36 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Oct 27 22:01:36 2017 -0700

----------------------------------------------------------------------
 .../runtime/codegen/SpoofOuterProduct.java      | 77 +++++++++++++-------
 1 file changed, 52 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ede870de/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index ac8dc57..26a661a 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
@@ -391,7 +392,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			c[0] = sum;
 	}
 	
-	private void executeSparse(SparseBlock sblock,  double[] u, double[] v, double[][] b, double[] scalars,
+	private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars,
 		double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) 
 	{
 		boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT);
@@ -401,37 +402,63 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		//blocksize is chosen such that we reuse each  Ui/Vj vector on average 8 times,
 		//with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
 		final int blocksizeI = (int) (8L*m*n/nnz);
-		final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
-		int[] curk = new int[Math.min(blocksizeI,ru-rl)];
 		
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+		if( OptimizerUtils.getSparsity(m, n, nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse
 		{
-			int bimin = Math.min(ru, bi+blocksizeI);
-			//prepare starting indexes for block row
-			for( int i=bi; i<bimin; i++ ) {
+			//for ultra-sparse matrices, we do not allocate the index array because
+			//its allocation and maintenance can dominate the total runtime.
+			
+			//core wdivmm block matrix mult
+			for( int i=rl, uix=rl*k; i<ru; i++, uix+=k ) {
+				if( sblock.isEmpty(i) ) continue;
+				
+				int wpos = sblock.pos(i);
+				int wlen = sblock.size(i);
+				int[] wix = sblock.indexes(i);
+				double[] wval = sblock.values(i);
+				
 				int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
-				curk[i-bi] = (index>=0) ? index : n;
+				index = wpos + ((index>=0) ? index : n);
+				for( ; index<wpos+wlen && wix[index]<cu; index++ ) {
+					genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+						(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+				}
 			}
+		}
+		else //sparse
+		{
+			final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
+			int[] curk = new int[Math.min(blocksizeI,ru-rl)];
 			
-			//blocked execution over column blocks
-			for( int bj = cl; bj < cu; bj+=blocksizeJ )
+			for( int bi = rl; bi < ru; bi+=blocksizeI ) 
 			{
-				int bjmin = Math.min(cu, bj+blocksizeJ);
-				//core wdivmm block matrix mult
-				for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
-					if( sblock.isEmpty(i) ) continue;
-					
-					int wpos = sblock.pos(i);
-					int wlen = sblock.size(i);
-					int[] wix = sblock.indexes(i);
-					double[] wval = sblock.values(i);
-					
-					int index = wpos + curk[i-bi];
-					for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
-						genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
-							(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+				int bimin = Math.min(ru, bi+blocksizeI);
+				//prepare starting indexes for block row
+				for( int i=bi; i<bimin; i++ ) {
+					int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
+					curk[i-bi] = (index>=0) ? index : n;
+				}
+				
+				//blocked execution over column blocks
+				for( int bj = cl; bj < cu; bj+=blocksizeJ )
+				{
+					int bjmin = Math.min(cu, bj+blocksizeJ);
+					//core wdivmm block matrix mult
+					for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+						if( sblock.isEmpty(i) ) continue;
+						
+						int wpos = sblock.pos(i);
+						int wlen = sblock.size(i);
+						int[] wix = sblock.indexes(i);
+						double[] wval = sblock.values(i);
+						
+						int index = wpos + curk[i-bi];
+						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+							genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+								(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+						}
+						curk[i-bi] = index - wpos;
 					}
-					curk[i-bi] = index - wpos;
 				}
 			}
 		}