You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/07/13 02:25:05 UTC

systemml git commit: [SYSTEMML-1767] Performance codegen rowwise template w/ column agg

Repository: systemml
Updated Branches:
  refs/heads/master 4e3ebcaeb -> 62a1b75ba


[SYSTEMML-1767] Performance codegen rowwise template w/ column agg

This patch makes the codegen row-wise template consistent with the
mmchain operation in terms of its condition to fall back to
single-threaded operations if the temporary memory for partial
aggregations exceeds the internal threshold. In a scenario with 2M sparse
features, this patch improved performance by 20x because it avoids
unnecessary L3 cache thrashing.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/62a1b75b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/62a1b75b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/62a1b75b

Branch: refs/heads/master
Commit: 62a1b75baf5d3ae3225ca4126e5a3ea93aa86a0f
Parents: 4e3ebca
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Jul 12 19:25:52 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Jul 12 19:26:12 2017 -0700

----------------------------------------------------------------------
 .../hops/codegen/template/TemplateRow.java      |  1 +
 .../sysml/runtime/codegen/SpoofRowwise.java     | 20 +++++++++++++-------
 .../runtime/matrix/data/LibMatrixMult.java      |  9 ++++++---
 3 files changed, 20 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 0bc0380..5cb016c 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -320,6 +320,7 @@ public class TemplateRow extends TemplateBase
 			//special case for cbind with zeros
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
 			out = new CNodeUnary(cdata1, UnaryType.CBIND0);
+			inHops.remove(hop.getInput().get(1)); //rm 0-matrix
 		}
 		else if(hop instanceof BinaryOp)
 		{

http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 13536d3..dc6baff 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -131,7 +131,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//setup thread-local memory if necessary
-		if( allocTmp )
+		if( allocTmp &&_reqVectMem > 0 )
 			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n, n2);
 		
 		//core sequential execute
@@ -144,7 +144,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 			executeSparse(a.getSparseBlock(), b, scalars, c, n, 0, m);
 		
 		//post-processing
-		if( allocTmp )
+		if( allocTmp &&_reqVectMem > 0 )
 			LibSpoofPrimitives.cleanupThreadLocalMemory();
 		out.recomputeNonZeros();
 		out.examSparsity();
@@ -155,7 +155,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 		throws DMLRuntimeException
 	{
 		//redirect to serial execution
-		if( k <= 1 || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
+		if( k <= 1 || (_type.isColumnAgg() && !LibMatrixMult.checkParColumnAgg(inputs.get(0), k, false))
+			|| (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
 			execute(inputs, scalarObjects, out);
 			return;
 		}
@@ -320,7 +321,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 		public double[] call() throws DMLRuntimeException {
 			
 			//allocate vector intermediates and partial output
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
+			if( _reqVectMem > 0 )
+				LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
 			double[] c = new double[(_clen2>0)?_clen*_clen2 : _clen];
 			
 			if( _a instanceof CompressedMatrixBlock )
@@ -330,7 +332,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 			else
 				executeSparse(_a.getSparseBlock(), _b, _scalars, c, _clen, _rl, _ru);
 			
-			LibSpoofPrimitives.cleanupThreadLocalMemory();
+			if( _reqVectMem > 0 )
+				LibSpoofPrimitives.cleanupThreadLocalMemory();
 			return c;
 		}
 	}
@@ -363,7 +366,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 		@Override
 		public Long call() throws DMLRuntimeException {
 			//allocate vector intermediates
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
+			if( _reqVectMem > 0 )
+				LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);
@@ -371,7 +375,9 @@ public abstract class SpoofRowwise extends SpoofOperator
 				executeDense(_a.getDenseBlock(), _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);
 			else
 				executeSparse(_a.getSparseBlock(), _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);
-			LibSpoofPrimitives.cleanupThreadLocalMemory();
+			
+			if( _reqVectMem > 0 )
+				LibSpoofPrimitives.cleanupThreadLocalMemory();
 			
 			//maintain nnz for row partition
 			return _c.recomputeNonZeros(_rl, _ru-1, 0, _c.getNumColumns()-1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index da3b12b..30e7d3d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -309,9 +309,7 @@ public class LibMatrixMult
 		
 		//check too high additional memory requirements (fallback to sequential)
 		//check too small workload in terms of flops (fallback to sequential too)
-		if( 8L * mV.rlen * k > MEM_OVERHEAD_THRESHOLD 
-			|| 4L * mX.rlen * mX.clen < PAR_MINFLOP_THRESHOLD) 
-		{ 
+		if( !checkParColumnAgg(mX, k, true) ) { 
 			matrixMultChain(mX, mV, mW, ret, ct);
 			return;
 		}
@@ -3531,6 +3529,11 @@ public class LibMatrixMult
 		return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
 			&& m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE;
 	}
+	
+	public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean inclFLOPs) {
+		return (8L * m1.clen * k <= MEM_OVERHEAD_THRESHOLD 
+			&& (!inclFLOPs || 4L * m1.rlen * m1.clen >= PAR_MINFLOP_THRESHOLD));
+	}
 
 	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
 		//parallelize over rows in rhs matrix if number of rows in lhs/output is very small