You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/22 09:04:19 UTC

systemml git commit: [SYSTEMML-1970] Performance conv2d-backward-filter (for sparse filter)

Repository: systemml
Updated Branches:
  refs/heads/master 311e4aac9 -> b261661a8


[SYSTEMML-1970] Performance conv2d-backward-filter (for sparse filter)

This patch makes a number of performance improvements to sparse and
dense conv2d backward filter:

1) Conv2d backward w/ sparse filter: So far, the rotate180 per input row
converted any dense or sparse input into dense intermediates, which were
then fed into the matrix multiplication. We now rotate sparse filters
into sparse intermediates, which gives very good mm improvements due to
sparse-dense matrix multiplication. This patch also fixes the used
sparsity of related tests, which so far never created sparse inputs.

2) Minor dense conv2d improvements: Additional improvements include the
removal of unnecessary allocations of matrix multiplication outputs,
more efficient output accumulation, and cache-conscious transpose
addition operations.

On an end-to-end cnn application, this patch improved the runtime from
610s to 498s per epoch.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b261661a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b261661a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b261661a

Branch: refs/heads/master
Commit: b261661a834bfaef1eacc7fa0a14e885811082a1
Parents: 311e4aa
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 02:04:49 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 02:04:49 2017 -0700

----------------------------------------------------------------------
 .../LibMatrixDNNConv2dBackwardDataHelper.java   |   2 +-
 .../LibMatrixDNNConv2dBackwardFilterHelper.java |  71 +++++----
 .../runtime/matrix/data/LibMatrixDNNHelper.java |   8 +-
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   |  48 +++---
 .../data/LibMatrixDNNRotate180Helper.java       |  67 ++++-----
 .../functions/tensor/Conv2DBackwardTest.java    | 147 ++++++++-----------
 .../functions/tensor/Conv2DBackwardTest.R       |   4 +-
 .../functions/tensor/Conv2DBackwardTest.dml     |   4 +-
 8 files changed, 162 insertions(+), 189 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
index 609af11..04c13e6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
@@ -81,7 +81,7 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
 			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
 			dout_reshaped.allocateDenseBlock();
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true);
+					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// rotate180(dout[n,]) => dout_reshaped

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index b89be82..de45b81 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -22,7 +22,7 @@ import java.util.concurrent.Callable;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180Helper.Rotate180Worker;
 import org.apache.sysml.utils.NativeHelper;
 
 public class LibMatrixDNNConv2dBackwardFilterHelper {
@@ -43,12 +43,13 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 		
 		@Override
 		public Long call() throws Exception {
-			int CRS = _params.C*_params.R*_params.S; 
-			double [] dout_n = new double[_params.P*_params.Q*_params.K];
+			int CRS = _params.C*_params.R*_params.S, PQ = _params.P*_params.Q, K = _params.K;
+			MatrixBlock dout_n = new MatrixBlock(PQ, K, false);
+			dout_n.allocateBlock();
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true);
-			// partialRetBlock is size: [params.C*params.R*params.S, params.K]
-			double [] partialRetBlock = new double[CRS*_params.K];
+					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true, false);
+			double [] ldout_n = dout_n.getDenseBlock();
+			double [] partRet = new double[CRS*_params.K]; //CRS x K
 			for(int n = _rl; n < _ru; n++) {
 				if( !_params.input1.getSparseBlock().isEmpty(n) ) {
 					// rotate180(dout[n,]) => dout_n
@@ -59,11 +60,11 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 					int[] aix = _params.input1.getSparseBlock().indexes(n);
 					double[] avals = _params.input1.getSparseBlock().values(n);
 					NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
-							dout_n, partialRetBlock, 1, _params.C, _params.H, _params.W, _params.K, 
+							ldout_n, partRet, 1, _params.C, _params.H, _params.W, _params.K, 
 							_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
 				}
 			}
-			inplaceTransposedAddition(partialRetBlock, _params);
+			inplaceTransAdd(partRet, _params);
 			return 0L;
 		}
 	}
@@ -72,9 +73,9 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 	 * General conv2d backward data operator
 	 */
 	public static class Conv2dBackwardFilter implements Callable<Long> {
-
-		public int _rl; public int _ru; 
+		private final int _rl, _ru; 
 		private final ConvolutionParameters _params; 
+		
 		public Conv2dBackwardFilter(int rl, int ru, ConvolutionParameters params) {
 			_rl = rl; _ru = ru;
 			_params = params;
@@ -82,15 +83,17 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 		
 		@Override
 		public Long call() throws Exception {
-			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
+			int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
 			MatrixBlock dout = _params.input2;
 			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
-			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
-			dout_reshaped.allocateDenseBlock();
+			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, dout.sparse);
+			MatrixBlock temp = new MatrixBlock(CRS, K, false);
+			dout_reshaped.allocateBlock();
+			temp.allocateBlock();
+			
 			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, false);
-			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true);
-			double [] partialRetBlock = new double[CRS*_params.K];
+			Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
+			double [] partRet = new double[CRS*_params.K];
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++) {
 				// rotate180(dout[n,]) => dout_reshaped
@@ -101,22 +104,19 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 				im2ColWorker.execute(n);
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				MatrixBlock temp = new MatrixBlock(CRS, K, false);
+				temp.reset(CRS, K, false);
 				LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				if(!temp.isEmptyBlock()) {
-					// partialRetBlock is size: [params.C*params.R*params.S, params.K]
-					ConvolutionUtils.binaryOperationInPlace(temp, partialRetBlock, 0, K, 0, CRS, 
-							LibMatrixDNN._binaryElementWiseAddition);
-				}
+				if( !temp.isEmptyBlock() ) //accumulate row results
+					LibMatrixMult.vectAdd(temp.getDenseBlock(), partRet, 0, 0, K*CRS);
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
 					time1 += t2 - t1;
 					time2 += t3 - t2;
 				}
 			}
-			inplaceTransposedAddition(partialRetBlock, _params);
+			inplaceTransAdd(partRet, _params);
 			if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
 				LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
 				LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
@@ -124,15 +124,22 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 			return 0L;
 		}
 	}
-	private static synchronized void inplaceTransposedAddition(double [] partialRetBlock, ConvolutionParameters params) {
-		// Perform transposed addition: output of size [K, CRS] += partialRetBlock of size [CRS,K]
-		int iter = 0; int CRS = params.C*params.R*params.S; int K = params.K;
-		double [] outputArr = params.output.denseBlock;
-		for(int i = 0; i < CRS; i++) {
-			for(int j = 0; j < K; j++, iter++) {
-				int index = j*CRS+i;
-				outputArr[index] += partialRetBlock[iter];
+	
+	private static synchronized void inplaceTransAdd(double[] a, ConvolutionParameters params) {
+		// Perform transposed addition: output of size [K, CRS] += input of size [CRS,K]
+		double [] c = params.output.denseBlock;
+		final int CRS = params.C*params.R*params.S, K = params.K;
+		final int blocksizeIJ = 128; //L2 cache
+		
+		//cache-conscious blocked execution
+		for( int bi=0; bi<CRS; bi+=blocksizeIJ )
+			for( int bj=0; bj<K; bj+=blocksizeIJ ) {
+				int bimin = Math.min(bi+blocksizeIJ, CRS);
+				int bjmin = Math.min(bj+blocksizeIJ, K);
+				//core transpose add operation
+				for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K)
+					for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS)
+						c[cix] += a[aix+j];
 			}
-		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index b80a786..6117b90 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -24,6 +24,8 @@ import java.util.concurrent.Callable;
 
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
@@ -169,13 +171,13 @@ public class LibMatrixDNNHelper {
 		int taskSize = (int)(Math.ceil((double)params.N / k));
 		
 		boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
-																(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+			(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)) 
-				ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput)
-				ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else
 				throw new DMLRuntimeException("Unsupported operator");
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index 7b43257..a4a6d3d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -71,12 +71,11 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy
 	 */
 	static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+		private final double [] inputArray, outputArray; 
+		private final int S, R, P, Q, CHW, H, W;
 		public DenseIm2colWorkerStride1Pad0(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
-			this.CRS = params.C * params.R * params.S;
 			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.CHW = params.C*params.H*params.W;
 		}
@@ -100,10 +99,8 @@ public class LibMatrixDNNIm2ColHelper {
 					System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q);
 					int w = Q - 1;
 					int wPadded = w + wOffset;
-					if (hPadded < H && wPadded < W)
-						outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-					else
-						outputArray[outOffset + w] = 0;
+					boolean assign = (hPadded < H && wPadded < W);
+					outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 				}
 			}
 		}
@@ -115,8 +112,8 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy
 	 */
 	static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+		private final double [] inputArray, outputArray; 
+		private final int CRS, S, R, P, Q, CHW, H, W;
 		public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
@@ -144,10 +141,8 @@ public class LibMatrixDNNIm2ColHelper {
 					System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q);
 					int w = Q - 1;
 					int wPadded = w + wOffset;
-					if (hPadded < H && wPadded < W)
-						outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-					else
-						outputArray[outOffset + w] = 0;
+					boolean assign = (hPadded < H && wPadded < W);
+					outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 				}
 			}
 		}
@@ -157,13 +152,12 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Performing dense im2col (general case)
 	 */
 	static class DenseIm2colWorker implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
-		int stride_h; int stride_w; int pad_h; int pad_w;
+		private final double [] inputArray, outputArray; 
+		private final int S, R, P, Q, CHW, H, W; 
+		private final int stride_h, stride_w, pad_h, pad_w;
 		public DenseIm2colWorker(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
-			this.CRS = params.C * params.R * params.S;
 			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.CHW = params.C*params.H*params.W;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
@@ -190,10 +184,8 @@ public class LibMatrixDNNIm2ColHelper {
 					} else {
 						for (int w = 0; w < Q; ++w) {
 							int wPadded = w * stride_w - pad_w + wOffset;
-							if (wPadded >= 0 && wPadded < W)
-								outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-							else
-								outputArray[outOffset + w] = 0;
+							boolean assign = (wPadded >= 0 && wPadded < W);
+							outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 						}
 					}
 				}
@@ -205,9 +197,9 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Performing dense im2col (general case)
 	 */
 	private static class DenseIm2colWorkerAllChannels implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
-		int stride_h; int stride_w; int pad_h; int pad_w;
+		private final double[] inputArray, outputArray; 
+		private final int CRS, S, R, P, Q, CHW, H, W; 
+		private final int stride_h, stride_w, pad_h, pad_w;
 		public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
@@ -239,10 +231,8 @@ public class LibMatrixDNNIm2ColHelper {
 					} else {
 						for (int w = 0; w < Q; ++w) {
 							int wPadded = w * stride_w - pad_w + wOffset;
-							if (wPadded >= 0 && wPadded < W)
-								outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-							else
-								outputArray[outOffset + w] = 0;
+							boolean assign = (wPadded >= 0 && wPadded < W);
+							outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 						}
 					}
 				}
@@ -314,7 +304,7 @@ public class LibMatrixDNNIm2ColHelper {
 		private final MatrixBlock input, output;
 		private final int S, R, P, Q, W, HW;
 		private final int stride_h, stride_w, pad_h, pad_w; 
-		final boolean trans;
+		private final boolean trans;
 		
 		public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
 			this.input = input;

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
index 6bc7caf..7a71ced 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
@@ -18,8 +18,6 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
-import java.util.Arrays;
-
 /**
  * This class contains the different implementation of rotate180 operation
  */
@@ -27,11 +25,12 @@ public class LibMatrixDNNRotate180Helper {
 
 	static interface Rotate180Worker {
 		public void execute(int inputN, int outputN);
-		public static Rotate180Worker getWorker(MatrixBlock input, double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) {
-			if(!input.isInSparseFormat()) 
-				return new DenseRotate180Worker(input, outputArray, params);
+		public static Rotate180Worker getWorker(MatrixBlock in, MatrixBlock out, 
+			ConvolutionParameters params, boolean zeroOutSparseOutput, boolean trans) {
+			if(!in.isInSparseFormat()) 
+				return new DenseRotate180Worker(in, out.getDenseBlock(), params);
 			else
-				return new SparseRotate180Worker(input, outputArray, params, zeroOutSparseOutput);
+				return new SparseRotate180Worker(in, out, params, trans);
 		}
 	}
 	
@@ -71,39 +70,41 @@ public class LibMatrixDNNRotate180Helper {
 	 * Because the number of rows of output (i.e. NPQ) is much larger than number of columns (i.e. K) 
 	 */
 	static class SparseRotate180Worker implements Rotate180Worker {
-
-		double [] outputArray;  MatrixBlock input;
-		ConvolutionParameters params; boolean zeroOutSparseOutput;
-		public SparseRotate180Worker(MatrixBlock input, double [] outputArray,  ConvolutionParameters params, boolean zeroOutSparseOutput) {
-			this.outputArray = outputArray;
+		private final MatrixBlock in, out;
+		private final ConvolutionParameters params;
+		private final boolean trans;
+		
+		public SparseRotate180Worker(MatrixBlock input, MatrixBlock output, 
+			ConvolutionParameters params, boolean trans) {
+			this.in = input;
+			this.out = output;
 			this.params = params;
-			this.zeroOutSparseOutput = zeroOutSparseOutput;
-			this.input = input;
-			if(outputArray == null)
-				throw new RuntimeException("Incorrect usage: empty inputs");
+			this.trans = trans;
 		}
 		
 		@Override
 		public void execute(int inputN, int outputN) {
-			if(zeroOutSparseOutput)
-				Arrays.fill(outputArray, 0);
+			out.reset();
 			
-			int outputOffset = outputN*params.K*params.P*params.Q;
-			if(!input.isEmptyBlock()) {
-				if( !input.sparseBlock.isEmpty(inputN) ) {
-					int [] tensorIndexes = new int[3];
-					int apos = input.sparseBlock.pos(inputN);
-					int alen = input.sparseBlock.size(inputN);
-					int[] aix = input.sparseBlock.indexes(inputN);
-					double[] avals = input.sparseBlock.values(inputN);
-					for(int j = apos; j < apos+alen; j++) {
-						LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q);
-						int k = tensorIndexes[0];
-						int p = tensorIndexes[1];
-						int q = tensorIndexes[2];
-						outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = avals[j];
-					}
-				}
+			SparseBlock sblock = in.sparseBlock;
+			if( sblock==null || sblock.isEmpty(inputN) )
+				return;
+			
+			int outputOffset = outputN*params.P*params.Q;
+			int [] tensorIndexes = new int[3];
+			int apos = sblock.pos(inputN);
+			int alen = sblock.size(inputN);
+			int[] aix = sblock.indexes(inputN);
+			double[] avals = sblock.values(inputN);
+			for(int j = apos; j < apos+alen; j++) {
+				LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q);
+				int k = tensorIndexes[0];
+				int p = tensorIndexes[1];
+				int q = tensorIndexes[2];
+				if( trans )
+					out.appendValue(k, outputOffset + p*params.Q + q, avals[j]);
+				else
+					out.appendValue(outputOffset + p*params.Q + q, k, avals[j]);
 			}
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
index decca59..6a9528b 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
@@ -32,7 +32,6 @@ import org.junit.Test;
 
 public class Conv2DBackwardTest extends AutomatedTestBase
 {
-	
 	private final static String TEST_NAME = "Conv2DBackwardTest";
 	private final static String TEST_DIR = "functions/tensor/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DBackwardTest.class.getSimpleName() + "/";
@@ -40,214 +39,188 @@ public class Conv2DBackwardTest extends AutomatedTestBase
 	
 	@Override
 	public void setUp() {
-		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
-				new String[] {"B"}));
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B"}));
 	}
 	
 	
 	@Test
-	public void testConv2DBackwardFilterDense1() 
-	{
+	public void testConv2DBackwardFilterDense1() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense2() 
-	{
+	public void testConv2DBackwardFilterDense2() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense3() 
-	{
+	public void testConv2DBackwardFilterDense3() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense4() 
-	{
+	public void testConv2DBackwardFilterDense4() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense5() 
-	{
+	public void testConv2DBackwardFilterDense5() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse1() 
-	{
+	public void testConv2DBackwardFilterSparse1() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse2() 
-	{
+	public void testConv2DBackwardFilterSparse2() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse3() 
-	{
+	public void testConv2DBackwardFilterSparse3() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse4() 
-	{
+	public void testConv2DBackwardFilterSparse4() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse5() 
-	{
+	public void testConv2DBackwardFilterSparse5() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse6() 
-	{
+	public void testConv2DBackwardFilterSparse6() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse7() 
-	{
+	public void testConv2DBackwardFilterSparse7() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse8() 
-	{
+	public void testConv2DBackwardFilterSparse8() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse9() 
-	{
+	public void testConv2DBackwardFilterSparse9() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse10() 
-	{
+	public void testConv2DBackwardFilterSparse10() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse11() 
-	{
+	public void testConv2DBackwardFilterSparse11() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse12() 
-	{
+	public void testConv2DBackwardFilterSparse12() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse13() 
-	{
+	public void testConv2DBackwardFilterSparse13() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse14() 
-	{
+	public void testConv2DBackwardFilterSparse14() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse15() 
-	{
+	public void testConv2DBackwardFilterSparse15() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
-	/**
-	 * 
-	 * @param et
-	 * @param sparse
-	 */
+	@Test
+	public void testConv2DBackwardFilterSparse16() {
+		int numImg = 10; int imgSize = 40; int numChannels = 4; int numFilters = 30; int filterSize = 25; int stride = 1; int pad = 0;
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DBackwardFilterSparse17() {
+		int numImg = 10, imgSize = 40, numChannels = 4, numFilters = 30, filterSize = 25, stride = 1, pad = 0;
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
 	public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
-			int filterSize, int stride, int pad, boolean sparse1, boolean sparse2) 
+		int filterSize, int stride, int pad, boolean sparse1, boolean sparse2) 
 	{
-		RUNTIME_PLATFORM oldRTP = rtplatform;
-			
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+		
 		try
 		{
-			String sparseVal1 = (""+sparse1).toUpperCase();
-			String sparseVal2 = (""+sparse2).toUpperCase();
-			
-	    TestConfiguration config = getTestConfiguration(TEST_NAME);
-	    if(et == ExecType.SPARK) {
-	    	rtplatform = RUNTIME_PLATFORM.SPARK;
-	    }
-	    else {
-	    	rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
-	    }
-			if( rtplatform == RUNTIME_PLATFORM.SPARK )
-				DMLScript.USE_LOCAL_SPARK_CONFIG = true;
-			
+			String sparseVal1 = String.valueOf(sparse1).toUpperCase();
+			String sparseVal2 = String.valueOf(sparse2).toUpperCase();
+			long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
 			loadTestConfiguration(config);
-	        
-			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			
 			String RI_HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-			
-			
-			long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
-			
-			programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
-				"" + numChannels, "" + numFilters, 
-				"" + filterSize, "" + stride, "" + pad,
-				"" + P, "" + P, 
-				output("B"), sparseVal1, sparseVal2};
-			        
-			boolean exceptionExpected = false;
-			int expectedNumberOfJobs = -1;
-			runTest(true, exceptionExpected, null, expectedNumberOfJobs);
-			
+			programArgs = new String[]{"-explain", "-args", 
+				String.valueOf(imgSize), String.valueOf(numImg), 
+				String.valueOf(numChannels), String.valueOf(numFilters), 
+				String.valueOf(filterSize), String.valueOf(stride), String.valueOf(pad), 
+				String.valueOf(P), String.valueOf(P), output("B"), sparseVal1, sparseVal2};
 			fullRScriptName = RI_HOME + TEST_NAME + ".R";
 			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
 					" " + numChannels + " " + numFilters + 
 					" " + filterSize + " " + stride + " " + pad + " " + P + " " + P + " " + expectedDir() +
 					" " + sparseVal1 + " " + sparseVal2;
-			// Run comparison R script
+			
+			// Run DML and R scripts
+			runTest(true, false, null, -1);
 			runRScript(true);
-			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
 			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "NumPy");
-			
 		}
-		finally
-		{
-			rtplatform = oldRTP;
+		finally {
+			rtplatform = platformOld;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
 		}
 	}
-	
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
index 7319da7..3964e17 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
@@ -35,13 +35,13 @@ x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize
 dout=matrix(seq(1, numImg*numFilters*P*Q), numImg, numFilters*P*Q, byrow=TRUE)
 
 if(as.logical(args[11])) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 } else {
 	x = x - mean(x)
 }
 if(as.logical(args[12])) {
-	zero_mask = (dout - mean(dout)) > 0 
+	zero_mask = (dout - mean(dout)*1.5) > 0 
 	dout = dout * zero_mask
 } else {
 	dout = dout - mean(dout)

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
index fb14c1c..b17281e 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
@@ -33,14 +33,14 @@ Q = $9
 x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, cols=numChannels*imgSize*imgSize)
 dout=matrix(seq(1, numImg*numFilters*P*Q), rows=numImg, cols=numFilters*P*Q)
 if($11) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 }
 else {
 	x = x - mean(x)
 }
 if($12) {
-	zero_mask = (dout - mean(dout)) > 0 
+	zero_mask = (dout - mean(dout)*1.5) > 0 
 	dout = dout * zero_mask
 }
 else {