You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/02/04 04:45:49 UTC
systemml git commit: [SYSTEMML-2129] Performance sparse relu_backward
operations
Repository: systemml
Updated Branches:
refs/heads/master f14255f46 -> a671883fc
[SYSTEMML-2129] Performance sparse relu_backward operations
This patch improves the performance of sparse relu_backward operations
(including the common case of dense-sparse) by (1) allocating the output
in sparse representation if at least one input is sparse, (2) dedicated
implementations for all dense-sparse combinations, and (3) thread-local
nnz maintenance to avoid unnecessary passes over the output.
On 1000 iterations of lenet over mnist, this patch improved the
relu_backward runtime (for 2000 exec calls) from 24.9s to 5.6s.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a671883f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a671883f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a671883f
Branch: refs/heads/master
Commit: a671883fc5cac2d81e0dcbd33c57abe1d45b593a
Parents: f14255f
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Feb 3 20:08:22 2018 -0800
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Feb 3 20:45:51 2018 -0800
----------------------------------------------------------------------
.../apache/sysml/api/ScriptExecutorUtils.java | 15 +-
.../cp/ConvolutionCPInstruction.java | 10 +-
.../sysml/runtime/matrix/data/LibMatrixDNN.java | 4 +-
.../runtime/matrix/data/LibMatrixDNNRelu.java | 146 ++++++++-----------
4 files changed, 70 insertions(+), 105 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/a671883f/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 253a317..a6c276f 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -113,18 +113,9 @@ public class ScriptExecutorUtils {
// display statistics (incl caching stats if enabled)
Statistics.stopRunTimer();
-
- if (!exceptionThrown) {
- if (statisticsMaxHeavyHitters > 0)
- System.out.println(Statistics.display(statisticsMaxHeavyHitters));
- else
- System.out.println(Statistics.display());
- } else {
- if (statisticsMaxHeavyHitters > 0)
- System.err.println(Statistics.display(statisticsMaxHeavyHitters));
- else
- System.err.println(Statistics.display());
- }
+ (exceptionThrown ? System.err : System.out)
+ .println(Statistics.display(statisticsMaxHeavyHitters > 0 ?
+ statisticsMaxHeavyHitters : DMLScript.STATISTICS_COUNT));
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/a671883f/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 08d220e..e32c3cf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -227,15 +227,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
// (X > 0) * dout
MatrixBlock input = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
MatrixBlock dout = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
- MatrixBlock outputBlock;
+ MatrixBlock outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(),
+ input.isInSparseFormat() || dout.isInSparseFormat() );
- if( !input.isEmpty() && !dout.isEmpty() ) {
- outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), false);
- outputBlock.allocateDenseBlock();
+ if( !input.isEmpty() && !dout.isEmpty() ) { //sparse-safe
+ outputBlock.allocateBlock();
LibMatrixDNN.reluBackward(input, dout, outputBlock, _numThreads);
}
- else
- outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true);
// release inputs/outputs
ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
http://git-wip-us.apache.org/repos/asf/systemml/blob/a671883f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index e8a88d8..1ad56b2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -273,10 +273,10 @@ public class LibMatrixDNN {
input.getNumRows() + " != " + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns());
}
- execute(LibMatrixDNNRelu.getReluBackwardWorkers(params), params);
+ long nnz = execute(LibMatrixDNNRelu.getReluBackwardWorkers(params), params);
// post-processing: maintain nnz
- outputBlock.recomputeNonZeros();
+ outputBlock.setNonZeros(nnz);
outputBlock.examSparsity();
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/a671883f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
index c8f2f41..c8f85b1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
@@ -23,17 +23,12 @@ import java.util.concurrent.Callable;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.instructions.InstructionUtils;
-import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
/**
* This class contains the different implementations of the relu backward operation
*/
public class LibMatrixDNNRelu
{
- private static ScalarOperator GT0 = InstructionUtils.parseScalarBinaryOperator(">", false, 0);
-
-
/**
* Factory method that returns list of callable tasks for performing relu backward operation
*
@@ -64,94 +59,75 @@ public class LibMatrixDNNRelu
@Override
public Long call() throws Exception {
- //note: X (m x n), dout (m x n) -> out (m x n)
- DenseBlock out = _params.output.getDenseBlock();
- final int n = _params.input1.getNumColumns();
- if(!_params.input1.isInSparseFormat() && !_params.input2.isInSparseFormat()) {
- DenseBlock x = _params.input1.getDenseBlock();
- DenseBlock dout = _params.input2.getDenseBlock();
- for(int i = _rl; i < _ru; i++) {
- double[] xvals = x.values(i), doutvals = dout.values(i), cvals = out.values(i);
- int xpos = x.pos(i), doutpos = dout.pos(i), cpos = out.pos(i);
- for(int j=0; j<n; j++)
- cvals[cpos+j] = xvals[xpos+j] > 0 ? doutvals[doutpos +j] : 0;
- }
- }
- else {
- scalarOperations(_params.input1, out, n, _rl, _ru, GT0); // (X > 0)
- binaryOperationInPlaceMult(_params.input2, out, n, _rl, _ru); // (X > 0) * dout
- }
- return 0L;
+ MatrixBlock m1 = _params.input1;
+ MatrixBlock m2 = _params.input2;
+ MatrixBlock out = _params.output;
+ final int n = m1.getNumColumns();
+ if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
+ return 0L; //nothing to do
+
+ //compute c = (a > 0) * b
+ //(if there is at least one sparse input, the output is allocated in sparse)
+ if(!m1.isInSparseFormat() && !m2.isInSparseFormat())
+ reluBackwardDenseDense(m1.getDenseBlock(), m2.getDenseBlock(), out.getDenseBlock(), n, _rl, _ru);
+ else if(!m1.isInSparseFormat() && m2.isInSparseFormat())
+ reluBackwardDenseSparse(m1.getDenseBlock(), m2.getSparseBlock(), out.getSparseBlock(), _rl, _ru);
+ else if(m1.isInSparseFormat() && !m2.isInSparseFormat())
+ reluBackwardSparseDense(m1.getSparseBlock(), m2.getDenseBlock(), out.getSparseBlock(), _rl, _ru);
+ else //sparse-sparse
+ reluBackwardSparseDense(m1.getSparseBlock(), m2.getSparseBlock(), out.getSparseBlock(), _rl, _ru);
+
+ //thread-local nnz maintenance
+ return out.recomputeNonZeros(_rl, _ru-1);
}
}
- private static void scalarOperations(MatrixBlock src, DenseBlock c,
- int destNumCols, int src_rl, int src_ru, ScalarOperator op)
- throws DMLRuntimeException
- {
- if(src.isInSparseFormat()) {
- for(int i = src_rl; i < src_ru; i++) {
- if( src.getSparseBlock().isEmpty(i) ) continue;
- int apos = src.getSparseBlock().pos(i);
- int alen = src.getSparseBlock().size(i);
- int[] aix = src.getSparseBlock().indexes(i);
- double[] avals = src.getSparseBlock().values(i);
- double[] cvals = c.values(i);
- int cix = c.pos(i);
- for(int j = apos; j < apos+alen; j++)
- cvals[ cix+aix[j] ] = op.executeScalar(avals[j]);
- }
- }
- else {
- DenseBlock a = src.getDenseBlock();
- for(int i = src_rl; i < src_ru; i++) {
- double[] avals = a.values(i), cvals = c.values(i);
- int aix = a.pos(i), cix = c.pos(i);
- for(int j=0; j<destNumCols; j++)
- cvals[cix+j] = op.executeScalar(avals[aix+j]);
- }
+ private static void reluBackwardDenseDense(DenseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) {
+ for(int i = rl; i < ru; i++) {
+ double[] avals = a.values(i), bvals = b.values(i);
+ double[] cvals = c.values(i);
+ int ix = a.pos(i);
+ for(int j=0; j<n; j++)
+ cvals[ix+j] = (avals[ix+j] > 0) ? bvals[ix +j] : 0;
}
}
- private static void binaryOperationInPlaceMult(MatrixBlock src,
- DenseBlock c, int destNumCols, int src_rl, int src_ru)
- throws DMLRuntimeException
- {
- if( src.isEmptyBlock(false) ) {
- c.set(src_rl, src_rl, 0, destNumCols, 0);
- return;
+ private static void reluBackwardDenseSparse(DenseBlock a, SparseBlock b, SparseBlock c, int rl, int ru) {
+ for(int i = rl; i < ru; i++) {
+ if( b.isEmpty(i) ) continue;
+ int bpos = b.pos(i), blen = b.size(i);
+ int[] bix = b.indexes(i);
+ double[] bvals = b.values(i), avals = a.values(i);
+ int aix = a.pos(i);
+ c.allocate(i, blen);
+ for(int k=bpos; k<bpos+blen; k++)
+ c.append(i, bix[k], (avals[aix+bix[k]] > 0) ? bvals[k] : 0);
}
-
- if(src.isInSparseFormat()) {
- for(int i = src_rl; i < src_ru; i++) {
- if( !src.getSparseBlock().isEmpty(i) ) {
- int apos = src.getSparseBlock().pos(i);
- int alen = src.getSparseBlock().size(i);
- int[] aix = src.getSparseBlock().indexes(i);
- double[] avals = src.getSparseBlock().values(i);
- double[] cvals = c.values(i);
- int cix = c.pos(i);
- int prevDestIndex = 0;
- for(int j = apos; j < apos+alen; j++) {
- c.set(i, i+1, prevDestIndex, aix[j], 0);
- prevDestIndex = aix[j]+1;
- cvals[ cix+aix[j] ] *= avals[j];
- }
- c.set(i, i+1, prevDestIndex, destNumCols, 0);
- }
- else {
- c.set(i, i+1, 0, destNumCols, 0);
- }
- }
+ }
+
+ private static void reluBackwardSparseDense(SparseBlock a, DenseBlock b, SparseBlock c, int rl, int ru) {
+ for(int i = rl; i < ru; i++) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i), alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i), bvals = b.values(i);
+ int bix = b.pos(i);
+ c.allocate(i, alen);
+ for(int k=apos; k<apos+alen; k++)
+ c.append(i, aix[k], (avals[k] > 0) ? bvals[bix+aix[k]] : 0);
}
- else { //DENSE
- DenseBlock a = src.getDenseBlock();
- for(int i = src_rl; i < src_ru; i++) {
- double[] avals = a.values(i), cvals = c.values(i);
- int aix = a.pos(i), cix = c.pos(i);
- for(int j=0; j<destNumCols; j++)
- cvals[cix+j] *= avals[aix+j];
- }
+ }
+
+ private static void reluBackwardSparseDense(SparseBlock a, SparseBlock b, SparseBlock c, int rl, int ru) {
+ //b is the driver as it has likely less non-zeros
+ for(int i = rl; i < ru; i++) {
+ if( a.isEmpty(i) || b.isEmpty(i) ) continue;
+ int bpos = b.pos(i), blen = b.size(i);
+ int[] bix = b.indexes(i);
+ double[] bvals = b.values(i);
+ c.allocate(i, blen);
+ for(int k=bpos; k<bpos+blen; k++)
+ c.append(i, bix[k], (a.get(i, bix[k]) > 0) ? bvals[k] : 0);
}
}
}