You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/08/23 19:41:08 UTC
[1/2] systemml git commit: [SYSTEMML-1857] Performance codegen outer
operators (degree of par)
Repository: systemml
Updated Branches:
refs/heads/master 8df0697e0 -> 65e2a46d2
[SYSTEMML-1857] Performance codegen outer operators (degree of par)
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06fa73ac
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06fa73ac
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06fa73ac
Branch: refs/heads/master
Commit: 06fa73acc4639dd446c2f36c62a956803c247753
Parents: 8df0697
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Aug 22 19:26:00 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Aug 23 12:41:45 2017 -0700
----------------------------------------------------------------------
.../sysml/runtime/codegen/SpoofCellwise.java | 1 -
.../runtime/codegen/SpoofMultiAggregate.java | 1 -
.../sysml/runtime/codegen/SpoofOperator.java | 4 ++
.../runtime/codegen/SpoofOuterProduct.java | 47 ++++++++++++++------
.../sysml/runtime/codegen/SpoofRowwise.java | 1 -
5 files changed, 38 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index 575043b..5e22406 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -51,7 +51,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
public abstract class SpoofCellwise extends SpoofOperator implements Serializable
{
private static final long serialVersionUID = 3442528770573293590L;
- private static final long PAR_NUMCELL_THRESHOLD = 1024*1024; //Min 1M elements
public enum CellType {
NO_AGG,
http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index ae3c353..4aa91bb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -48,7 +48,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
public abstract class SpoofMultiAggregate extends SpoofOperator implements Serializable
{
private static final long serialVersionUID = -6164871955591089349L;
- private static final long PAR_NUMCELL_THRESHOLD = 1024*1024; //Min 1M elements
private final AggOp[] _aggOps;
private final boolean _sparseSafe;
http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index 3ea9246..fea51e3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -37,6 +37,10 @@ public abstract class SpoofOperator implements Serializable
private static final long serialVersionUID = 3834006998853573319L;
private static final Log LOG = LogFactory.getLog(SpoofOperator.class.getName());
+ protected static final long PAR_NUMCELL_THRESHOLD = 1024*1024; //Min 1M elements
+ protected static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
+
+
public abstract MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out)
throws DMLRuntimeException;
http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index bc99859..1ec873f 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -112,6 +112,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
if( inputs.get(0).isEmptyBlock(false) )
return new DoubleObject(0);
+ if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD )
+ return execute(inputs, scalarObjects); //sequential
+
//input preparation
double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
@@ -121,15 +124,14 @@ public abstract class SpoofOuterProduct extends SpoofOperator
final int m = inputs.get(0).getNumRows();
final int n = inputs.get(0).getNumColumns();
final int k = inputs.get(1).getNumColumns(); // rank
+ final long nnz = inputs.get(0).getNonZeros();
double sum = 0;
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<ParOuterProdAggTask> tasks = new ArrayList<ParOuterProdAggTask>();
- //create tasks (for wdivmm-left, parallelization over columns;
- //for wdivmm-right, parallelization over rows; both ensure disjoint results)
- int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+ int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads);
int blklen = (int)(Math.ceil((double)m/numThreads2));
for( int i=0; i<numThreads2 & i*blklen<m; i++ )
tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars,
@@ -259,6 +261,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
out.allocateDenseBlock();
}
+ if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD )
+ return execute(inputs, scalarObjects, out); //sequential
+
//input preparation
double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
@@ -268,6 +273,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
final int m = inputs.get(0).getNumRows();
final int n = inputs.get(0).getNumColumns();
final int k = inputs.get(1).getNumColumns(); // rank
+ final long nnz = inputs.get(0).getNonZeros();
MatrixBlock a = inputs.get(0);
@@ -284,21 +290,24 @@ public abstract class SpoofOuterProduct extends SpoofOperator
int numCG = ((CompressedMatrixBlock)a).getNumColGroups();
int blklen = (int)(Math.ceil((double)numCG/numThreads));
for( int j=0; j<numThreads & j*blklen<numCG; j++ )
- tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, j*blklen, Math.min((j+1)*blklen, numCG)));
+ tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+ _outerProductType, 0, m, j*blklen, Math.min((j+1)*blklen, numCG)));
}
else {
//parallelize over column partitions
int blklen = (int)(Math.ceil((double)n/numThreads));
for( int j=0; j<numThreads & j*blklen<n; j++ )
- tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, j*blklen, Math.min((j+1)*blklen, n)));
+ tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+ _outerProductType, 0, m, j*blklen, Math.min((j+1)*blklen, n)));
}
}
else { //right or cell-wise
//parallelize over row partitions
- int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+ int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads);
int blklen = (int)(Math.ceil((double)m/numThreads2));
for( int i=0; i<numThreads2 & i*blklen<m; i++ )
- tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+ tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k,
+ _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
}
List<Future<Long>> taskret = pool.invokeAll(tasks);
pool.shutdown();
@@ -320,6 +329,13 @@ public abstract class SpoofOuterProduct extends SpoofOperator
return out;
}
+ private static int getPreferredNumberOfTasks(int m, int n, long nnz, int rank, int k) {
+ //compute number of tasks nk in range [k, 8k]
+ int base = (int) Math.min(Math.min(8*k, m/32),
+ Math.ceil((double)2*nnz*rank/PAR_MINFLOP_THRESHOLD));
+ return UtilFunctions.roundToNext(base, k);
+ }
+
private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars,
double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu )
{
@@ -427,6 +443,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
if( !out.isInSparseFormat() ) //DENSE
{
double[] c = out.getDenseBlock();
+ double tmp = 0;
for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
int bimin = Math.min(ru, bi+blocksizeIJ);
//prepare starting indexes for block row
@@ -441,16 +458,20 @@ public abstract class SpoofOuterProduct extends SpoofOperator
int[] wix = sblock.indexes(i);
double[] wval = sblock.values(i);
int index = wpos + curk[i-bi];
- for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
- if(type == OutProdType.CELLWISE_OUTER_PRODUCT)
- c[wix[index]] = genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] );
- else
- c[0] += genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
- }
+ if( type == OutProdType.CELLWISE_OUTER_PRODUCT )
+ for( ; index<wpos+wlen && wix[index]<bjmin; index++ )
+ c[wix[index]] = genexecCellwise( wval[index],
+ u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] );
+ else
+ for( ; index<wpos+wlen && wix[index]<bjmin; index++ )
+ tmp += genexecCellwise( wval[index],
+ u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
curk[i-bi] = index - wpos;
}
}
}
+ if( type != OutProdType.CELLWISE_OUTER_PRODUCT )
+ c[0] = tmp;
}
else //SPARSE
{
http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index e2d9f41..f1cef34 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -44,7 +44,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
public abstract class SpoofRowwise extends SpoofOperator
{
private static final long serialVersionUID = 6242910797139642998L;
- private static final long PAR_NUMCELL_THRESHOLD = 1024*1024; //Min 1M elements
public enum RowType {
NO_AGG, //no aggregation
[2/2] systemml git commit: [SYSTEMML-1861] Performance sparse-sparse
binary mult operations
Posted by mb...@apache.org.
[SYSTEMML-1861] Performance sparse-sparse binary mult operations
This patch improves the performance of sparse-sparse binary multiply
operations. Instead of using a merge join with outer join semantics, we
now use a dedicated case that realizes multiply via inner join semantics
and branchless position maintenance.
On a scenario of X * Y, with 1M x 1K, sparsity=0.1 inputs, this patch
improved performance from 330ms to 235ms.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/65e2a46d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/65e2a46d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/65e2a46d
Branch: refs/heads/master
Commit: 65e2a46d2bccfebe4ed5a566d02c34a1cb816da5
Parents: 06fa73a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Aug 22 22:13:05 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Aug 23 12:41:47 2017 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixBincell.java | 79 ++++++++++----------
1 file changed, 39 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/65e2a46d/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index e188b4e..9489225 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -1184,52 +1184,51 @@ public class LibMatrixBincell
}
}
- /**
- * like a merge sort
- *
- * @param op binary operator
- * @param values1 array of double values
- * @param cols1 ?
- * @param pos1 ?
- * @param size1 ?
- * @param values2 array of double values
- * @param cols2 ?
- * @param pos2 ?
- * @param size2 ?
- * @param resultRow ?
- * @param result matrix block
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
private static void mergeForSparseBinary(BinaryOperator op, double[] values1, int[] cols1, int pos1, int size1,
- double[] values2, int[] cols2, int pos2, int size2, int resultRow, MatrixBlock result)
+ double[] values2, int[] cols2, int pos2, int size2, int resultRow, MatrixBlock result)
throws DMLRuntimeException
{
- int p1=0, p2=0, column;
- while( p1<size1 && p2< size2 )
- {
- double value = 0;
- if(cols1[pos1+p1]<cols2[pos2+p2]) {
- value = op.fn.execute(values1[pos1+p1], 0);
- column = cols1[pos1+p1];
- p1++;
+ int p1 = 0, p2 = 0;
+ if( op.fn instanceof Multiply ) { //skip empty
+ //skip empty: merge-join (with inner join semantics)
+ //similar to sorted list intersection
+ SparseBlock sblock = result.getSparseBlock();
+ sblock.allocate(resultRow, Math.min(size1, size2), result.clen);
+ while( p1 < size1 && p2 < size2 ) {
+ int colPos1 = cols1[pos1+p1];
+ int colPos2 = cols2[pos2+p2];
+ if( colPos1 == colPos2 )
+ sblock.append(resultRow, colPos1,
+ op.fn.execute(values1[pos1+p1], values2[pos2+p2]));
+ p1 += (colPos1 <= colPos2) ? 1 : 0;
+ p2 += (colPos1 >= colPos2) ? 1 : 0;
}
- else if(cols1[pos1+p1]==cols2[pos2+p2]) {
- value = op.fn.execute(values1[pos1+p1], values2[pos2+p2]);
- column = cols1[pos1+p1];
- p1++;
- p2++;
- }
- else {
- value = op.fn.execute(0, values2[pos2+p2]);
- column = cols2[pos2+p2];
- p2++;
+ result.nonZeros += sblock.size(resultRow);
+ }
+ else {
+ //general case: merge-join (with outer join semantics)
+ while( p1 < size1 && p2 < size2 ) {
+ if(cols1[pos1+p1]<cols2[pos2+p2]) {
+ result.appendValue(resultRow, cols1[pos1+p1],
+ op.fn.execute(values1[pos1+p1], 0));
+ p1++;
+ }
+ else if(cols1[pos1+p1]==cols2[pos2+p2]) {
+ result.appendValue(resultRow, cols1[pos1+p1],
+ op.fn.execute(values1[pos1+p1], values2[pos2+p2]));
+ p1++;
+ p2++;
+ }
+ else {
+ result.appendValue(resultRow, cols2[pos2+p2],
+ op.fn.execute(0, values2[pos2+p2]));
+ p2++;
+ }
}
- result.appendValue(resultRow, column, value);
+ //add left over
+ appendLeftForSparseBinary(op, values1, cols1, pos1, size1, p1, resultRow, result);
+ appendRightForSparseBinary(op, values2, cols2, pos2, size2, p2, resultRow, result);
}
-
- //add left over
- appendLeftForSparseBinary(op, values1, cols1, pos1, size1, p1, resultRow, result);
- appendRightForSparseBinary(op, values2, cols2, pos2, size2, p2, resultRow, result);
}
private static void appendLeftForSparseBinary(BinaryOperator op, double[] values1, int[] cols1, int pos1, int size1,