You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/08/23 19:41:08 UTC

[1/2] systemml git commit: [SYSTEMML-1857] Performance codegen outer operators (degree of par)

Repository: systemml
Updated Branches:
  refs/heads/master 8df0697e0 -> 65e2a46d2


[SYSTEMML-1857] Performance codegen outer operators (degree of par)

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06fa73ac
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06fa73ac
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06fa73ac

Branch: refs/heads/master
Commit: 06fa73acc4639dd446c2f36c62a956803c247753
Parents: 8df0697
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Aug 22 19:26:00 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Aug 23 12:41:45 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/SpoofCellwise.java    |  1 -
 .../runtime/codegen/SpoofMultiAggregate.java    |  1 -
 .../sysml/runtime/codegen/SpoofOperator.java    |  4 ++
 .../runtime/codegen/SpoofOuterProduct.java      | 47 ++++++++++++++------
 .../sysml/runtime/codegen/SpoofRowwise.java     |  1 -
 5 files changed, 38 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index 575043b..5e22406 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -51,7 +51,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofCellwise extends SpoofOperator implements Serializable
 {
 	private static final long serialVersionUID = 3442528770573293590L;
-	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
 	
 	public enum CellType {
 		NO_AGG,

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index ae3c353..4aa91bb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -48,7 +48,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofMultiAggregate extends SpoofOperator implements Serializable
 {
 	private static final long serialVersionUID = -6164871955591089349L;
-	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
 	
 	private final AggOp[] _aggOps;
 	private final boolean _sparseSafe;

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index 3ea9246..fea51e3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -37,6 +37,10 @@ public abstract class SpoofOperator implements Serializable
 	private static final long serialVersionUID = 3834006998853573319L;
 	private static final Log LOG = LogFactory.getLog(SpoofOperator.class.getName());
 	
+	protected static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
+	protected static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
+	
+	
 	public abstract MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out)
 		throws DMLRuntimeException;
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index bc99859..1ec873f 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -112,6 +112,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		if( inputs.get(0).isEmptyBlock(false) )
 			return new DoubleObject(0);
 		
+		if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD )
+			return execute(inputs, scalarObjects); //sequential
+		
 		//input preparation
 		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
 		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
@@ -121,15 +124,14 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
 		final int k = inputs.get(1).getNumColumns(); // rank
+		final long nnz = inputs.get(0).getNonZeros();
 		double sum = 0;
 		
 		try 
 		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<ParOuterProdAggTask> tasks = new ArrayList<ParOuterProdAggTask>();
-			//create tasks (for wdivmm-left, parallelization over columns;
-			//for wdivmm-right, parallelization over rows; both ensure disjoint results)
-			int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+			int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads);
 			int blklen = (int)(Math.ceil((double)m/numThreads2));
 			for( int i=0; i<numThreads2 & i*blklen<m; i++ )
 				tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, 
@@ -259,6 +261,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			out.allocateDenseBlock();
 		}	
 		
+		if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD )
+			return execute(inputs, scalarObjects, out); //sequential
+		
 		//input preparation
 		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
 		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
@@ -268,6 +273,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
 		final int k = inputs.get(1).getNumColumns(); // rank
+		final long nnz = inputs.get(0).getNonZeros();
 		
 		MatrixBlock a = inputs.get(0);
 		
@@ -284,21 +290,24 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 					int numCG = ((CompressedMatrixBlock)a).getNumColGroups();
 					int blklen = (int)(Math.ceil((double)numCG/numThreads));
 					for( int j=0; j<numThreads & j*blklen<numCG; j++ )
-						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, numCG)));
+						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+							_outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, numCG)));
 				}
 				else {
 					//parallelize over column partitions
 					int blklen = (int)(Math.ceil((double)n/numThreads));
 					for( int j=0; j<numThreads & j*blklen<n; j++ )
-						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, n)));
+						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+							_outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, n)));
 				}
 			}
 			else { //right or cell-wise
 				//parallelize over row partitions
-				int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+				int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads);
 				int blklen = (int)(Math.ceil((double)m/numThreads2));
 				for( int i=0; i<numThreads2 & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+					tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+						_outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
 			}
 			List<Future<Long>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -320,6 +329,13 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		return out;
 	}
 	
+	private static int getPreferredNumberOfTasks(int m, int n, long nnz, int rank, int k) {
+		//compute number of tasks nk in range [k, 8k]
+		int base = (int) Math.min(Math.min(8*k, m/32),
+			Math.ceil((double)2*nnz*rank/PAR_MINFLOP_THRESHOLD));
+		return UtilFunctions.roundToNext(base, k);
+	}
+	
 	private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars,
 		double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu )
 	{
@@ -427,6 +443,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		if( !out.isInSparseFormat() ) //DENSE
 		{
 			double[] c = out.getDenseBlock();
+			double tmp = 0;
 			for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
 				int bimin = Math.min(ru, bi+blocksizeIJ);
 				//prepare starting indexes for block row
@@ -441,16 +458,20 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 						int[] wix = sblock.indexes(i);
 						double[] wval = sblock.values(i);
 						int index = wpos + curk[i-bi];
-						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
-							if(type == OutProdType.CELLWISE_OUTER_PRODUCT)
-								c[wix[index]] = genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] );
-							else
-								c[0] += genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
-						}
+						if( type == OutProdType.CELLWISE_OUTER_PRODUCT )
+							for( ; index<wpos+wlen && wix[index]<bjmin; index++ )
+								c[wix[index]] = genexecCellwise( wval[index], 
+									u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] );
+						else
+							for( ; index<wpos+wlen && wix[index]<bjmin; index++ )
+								tmp += genexecCellwise( wval[index], 
+									u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
 						curk[i-bi] = index - wpos;
 					}
 				}
 			}
+			if( type != OutProdType.CELLWISE_OUTER_PRODUCT )
+				c[0] = tmp;
 		}
 		else //SPARSE
 		{

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index e2d9f41..f1cef34 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -44,7 +44,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofRowwise extends SpoofOperator
 {
 	private static final long serialVersionUID = 6242910797139642998L;
-	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
 	
 	public enum RowType {
 		NO_AGG,    //no aggregation


[2/2] systemml git commit: [SYSTEMML-1861] Performance sparse-sparse binary mult operations

Posted by mb...@apache.org.
[SYSTEMML-1861] Performance sparse-sparse binary mult operations

This patch improves the performance of sparse-sparse binary multiply
operations. Instead of using a merge join with outer join semantics, we
now use a dedicated case that realizes multiply via inner join semantics
and branchless position maintenance.

On a scenario of X * Y, with 1M x 1K, sparsity=0.1 inputs, this patch
improved performance from 330ms to 235ms.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/65e2a46d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/65e2a46d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/65e2a46d

Branch: refs/heads/master
Commit: 65e2a46d2bccfebe4ed5a566d02c34a1cb816da5
Parents: 06fa73a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Aug 22 22:13:05 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Aug 23 12:41:47 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 79 ++++++++++----------
 1 file changed, 39 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/65e2a46d/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index e188b4e..9489225 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -1184,52 +1184,51 @@ public class LibMatrixBincell
 		}
 	}
 	
-	/**
-	 * like a merge sort
-	 * 
-	 * @param op binary operator
-	 * @param values1 array of double values
-	 * @param cols1 ?
-	 * @param pos1 ?
-	 * @param size1 ?
-	 * @param values2 array of double values
-	 * @param cols2 ?
-	 * @param pos2 ?
-	 * @param size2 ?
-	 * @param resultRow ?
-	 * @param result matrix block
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
 	private static void mergeForSparseBinary(BinaryOperator op, double[] values1, int[] cols1, int pos1, int size1, 
-				double[] values2, int[] cols2, int pos2, int size2, int resultRow, MatrixBlock result) 
+			double[] values2, int[] cols2, int pos2, int size2, int resultRow, MatrixBlock result) 
 		throws DMLRuntimeException
 	{
-		int p1=0, p2=0, column;
-		while( p1<size1 && p2< size2 )
-		{
-			double value = 0;
-			if(cols1[pos1+p1]<cols2[pos2+p2]) {
-				value = op.fn.execute(values1[pos1+p1], 0);
-				column = cols1[pos1+p1];
-				p1++;
+		int p1 = 0, p2 = 0;
+		if( op.fn instanceof Multiply ) { //skip empty
+			//skip empty: merge-join (with inner join semantics)
+			//similar to sorted list intersection
+			SparseBlock sblock = result.getSparseBlock();
+			sblock.allocate(resultRow, Math.min(size1, size2), result.clen);
+			while( p1 < size1 && p2 < size2 ) {
+				int colPos1 = cols1[pos1+p1];
+				int colPos2 = cols2[pos2+p2];
+				if( colPos1 == colPos2 )
+					sblock.append(resultRow, colPos1,
+						op.fn.execute(values1[pos1+p1], values2[pos2+p2]));
+				p1 += (colPos1 <= colPos2) ? 1 : 0;
+				p2 += (colPos1 >= colPos2) ? 1 : 0;
 			}
-			else if(cols1[pos1+p1]==cols2[pos2+p2]) {
-				value = op.fn.execute(values1[pos1+p1], values2[pos2+p2]);
-				column = cols1[pos1+p1];
-				p1++;
-				p2++;
-			}
-			else {
-				value = op.fn.execute(0, values2[pos2+p2]);
-				column = cols2[pos2+p2];
-				p2++;
+			result.nonZeros += sblock.size(resultRow);
+		}
+		else {
+			//general case: merge-join (with outer join semantics) 
+			while( p1 < size1 && p2 < size2 ) {
+				if(cols1[pos1+p1]<cols2[pos2+p2]) {
+					result.appendValue(resultRow, cols1[pos1+p1], 
+						op.fn.execute(values1[pos1+p1], 0));
+					p1++;
+				}
+				else if(cols1[pos1+p1]==cols2[pos2+p2]) {
+					result.appendValue(resultRow, cols1[pos1+p1], 
+						op.fn.execute(values1[pos1+p1], values2[pos2+p2]));
+					p1++;
+					p2++;
+				}
+				else {
+					result.appendValue(resultRow, cols2[pos2+p2], 
+						op.fn.execute(0, values2[pos2+p2]));
+					p2++;
+				}
 			}
-			result.appendValue(resultRow, column, value);	
+			//add left over
+			appendLeftForSparseBinary(op, values1, cols1, pos1, size1, p1, resultRow, result);
+			appendRightForSparseBinary(op, values2, cols2, pos2, size2, p2, resultRow, result);
 		}
-		
-		//add left over
-		appendLeftForSparseBinary(op, values1, cols1, pos1, size1, p1, resultRow, result);
-		appendRightForSparseBinary(op, values2, cols2, pos2, size2, p2, resultRow, result);
 	}
 
 	private static void appendLeftForSparseBinary(BinaryOperator op, double[] values1, int[] cols1, int pos1, int size1,