You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/14 09:30:11 UTC

[1/3] systemml git commit: [SYSTEMML-1958] Performance sparse conv2d via transposed im2col-mm

Repository: systemml
Updated Branches:
  refs/heads/master c69bd7f30 -> 8ca61ae26


[SYSTEMML-1958] Performance sparse conv2d via transposed im2col-mm

This patch improves the performance of sparse conv2d operations, where
the input matrix is sparse. The traditional approach performs im2col on
Xi to get dXi and computes the partial result with F %*% dXi, which
relies on dense-sparse matrix multiplications that cause unnecessary
overhead due to scattered writes in the inner loop. Instead, we now
perform this operation as t(t(dXi) %*% t(F)), which uses sparse-dense
matrix multiplications that exploit sparsity in the outer loop (skipping
entire rows of the rhs) and operate in a cache-conscious manner. The overhead
for transpose operations is reduced by piggybacking t(dXi) and t(out)
into the im2col and output copy as well as doing t(F) just once for all
rows in the batch.

On a CNN scoring scenario, with varying sparsity between 0.02 and 0.1,
this patch improved the matrix multiplication performance by ~2.5x and
the end-to-end performance over the entire dataset from 1587s to 1086s. 

Furthermore, this patch also fixes and extends the existing sparse
conv2d tests. So far, the respective scripts used a sparsity of 0.5
which left the data and weights in dense representation. We now use a
sparsity of ~0.25. Additionally, this also includes a hardening of the
output format handling of unary and binary operations (i.e., a more
accurate decision on whether the released inputs cover the memory
requirements of a potential format change).


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d641c224
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d641c224
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d641c224

Branch: refs/heads/master
Commit: d641c22499b1f620d8cf4b6b8975bb1b9e3d1602
Parents: c69bd7f
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 13 19:29:40 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 14 02:30:36 2017 -0700

----------------------------------------------------------------------
 .../cp/ComputationCPInstruction.java            |  10 +-
 .../cp/MatrixMatrixArithmeticCPInstruction.java |   4 +-
 .../cp/MatrixScalarBuiltinCPInstruction.java    |   2 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |   3 +-
 .../LibMatrixDNNConv2dBackwardFilterHelper.java |   3 +-
 .../matrix/data/LibMatrixDNNConv2dHelper.java   | 103 +++++++++--
 .../runtime/matrix/data/LibMatrixDNNHelper.java |  19 ++-
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 161 ++++++++---------
 .../runtime/matrix/data/LibMatrixReorg.java     |   4 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  17 +-
 .../functions/tensor/Conv2DTest.java            | 171 ++++++++++++-------
 src/test/scripts/functions/tensor/Conv2DTest.R  |   4 +-
 .../scripts/functions/tensor/Conv2DTest.dml     |   4 +-
 13 files changed, 321 insertions(+), 184 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/instructions/cp/ComputationCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ComputationCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ComputationCPInstruction.java
index 67dc051..572290d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ComputationCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ComputationCPInstruction.java
@@ -19,7 +19,6 @@
 
 package org.apache.sysml.runtime.instructions.cp;
 
-import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 
@@ -54,11 +53,12 @@ public abstract class ComputationCPInstruction extends CPInstruction {
 		return checkGuardedRepresentationChange(in1, null, out);
 	}
 
-	protected boolean checkGuardedRepresentationChange( MatrixBlock in1, MatrixBlock in2, MatrixBlock out )
-	{
-		double memDense = OptimizerUtils.estimateSize(out.getNumRows(), out.getNumColumns());
+	protected boolean checkGuardedRepresentationChange( MatrixBlock in1, MatrixBlock in2, MatrixBlock out ) {
 		double memIn1 = (in1 != null) ? in1.getInMemorySize() : 0;
 		double memIn2 = (in2 != null) ? in2.getInMemorySize() : 0;
-		return ( memDense < memIn1 + memIn2 );	
+		double memReq = out.isInSparseFormat() ? 
+			MatrixBlock.estimateSizeDenseInMemory(out.getNumRows(), out.getNumColumns()) :
+			MatrixBlock.estimateSizeSparseInMemory(out.getNumRows(), out.getNumColumns(), out.getSparsity());
+		return ( memReq < memIn1 + memIn2 );
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixMatrixArithmeticCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixMatrixArithmeticCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixMatrixArithmeticCPInstruction.java
index 309fe07..6333f9b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixMatrixArithmeticCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixMatrixArithmeticCPInstruction.java
@@ -37,8 +37,8 @@ public class MatrixMatrixArithmeticCPInstruction extends ArithmeticBinaryCPInstr
 		throws DMLRuntimeException
 	{
 		// Read input matrices
-        MatrixBlock inBlock1 = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-        MatrixBlock inBlock2 = ec.getMatrixInput(input2.getName(), getExtendedOpcode());
+		MatrixBlock inBlock1 = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
+		MatrixBlock inBlock2 = ec.getMatrixInput(input2.getName(), getExtendedOpcode());
 		
 		// Perform computation using input matrices, and produce the result matrix
 		BinaryOperator bop = (BinaryOperator) _optr;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixScalarBuiltinCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixScalarBuiltinCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixScalarBuiltinCPInstruction.java
index dcacb21..35c5177 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixScalarBuiltinCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixScalarBuiltinCPInstruction.java
@@ -43,7 +43,7 @@ public class MatrixScalarBuiltinCPInstruction extends BuiltinBinaryCPInstruction
 		MatrixBlock inBlock = ec.getMatrixInput(mat.getName(), getExtendedOpcode());
 		ScalarObject constant = (ScalarObject) ec.getScalarInput(scalar.getName(), scalar.getValueType(), scalar.isLiteral());
 		
-		ScalarOperator sc_op = (ScalarOperator)	_optr;
+		ScalarOperator sc_op = (ScalarOperator) _optr;
 		sc_op.setConstant(constant.getDoubleValue());
 		
 		MatrixBlock retBlock = (MatrixBlock) inBlock.scalarOperations(sc_op, new MatrixBlock());

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 0378c7a..f49433d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -25,7 +25,6 @@ import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-import jcuda.jcudnn.cudnnConvolutionBwdDataPreference;
 import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
 import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
@@ -218,7 +217,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardDataAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, 
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException {
-		long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		//long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index a135f62..b89be82 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 import java.util.concurrent.Callable;
 
 import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
 
@@ -86,7 +87,7 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
 			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
 			dout_reshaped.allocateDenseBlock();
-			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true);
+			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, false);
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
 					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true);
 			double [] partialRetBlock = new double[CRS*_params.K];

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index 876996f..dd44de2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.concurrent.Callable;
 
 import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
 import org.apache.sysml.utils.NativeHelper;
 
 /**
@@ -33,11 +34,13 @@ public class LibMatrixDNNConv2dHelper {
 	 * Performs convolution via: partialCopy1(filter %*% im2col(input)) = output.
 	 * This operator has less memory pressure than LoopedIm2ColConv2dAllChannels.
 	 */
-	public static class LoopedIm2ColConv2dOneChannel implements Callable<Long> 
+	public static class LoopedIm2ColConv2dOneChan implements Callable<Long> 
 	{
-		public int _rl; public int _ru; 
-		private final ConvolutionParameters _params; ArrayList<MatrixBlock> _filters;
-		public LoopedIm2ColConv2dOneChannel(int rl, int ru, ConvolutionParameters params, ArrayList<MatrixBlock> filters) {
+		protected final int _rl, _ru; 
+		protected final ConvolutionParameters _params; 
+		protected final ArrayList<MatrixBlock> _filters;
+		
+		public LoopedIm2ColConv2dOneChan(int rl, int ru, ConvolutionParameters params, ArrayList<MatrixBlock> filters) {
 			_rl = rl; _ru = ru;
 			_params = params; 
 			_filters = filters;
@@ -48,7 +51,7 @@ public class LibMatrixDNNConv2dHelper {
 			int PQ = _params.P*_params.Q; int K = _params.K;
 			int RS = _params.R*_params.S;
 			MatrixBlock im2ColOutBlock = new MatrixBlock(RS, PQ, false);
-			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, false);
+			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, false, false);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				for(int c = 0; c < _params.C; c++)  {
@@ -115,22 +118,22 @@ public class LibMatrixDNNConv2dHelper {
 	/**
 	 * Performs convolution via: partialCopy1(filter %*% im2col(input)) = output
 	 */
-	public static class LoopedIm2ColConv2dAllChannels implements Callable<Long> 
+	public static class LoopedIm2ColConv2dAllChan implements Callable<Long> 
 	{
-		public int _rl; public int _ru; 
-		private final ConvolutionParameters _params;
-		public LoopedIm2ColConv2dAllChannels(int rl, int ru, ConvolutionParameters params) {
+		protected final int _rl, _ru; 
+		protected final ConvolutionParameters _params;
+		
+		public LoopedIm2ColConv2dAllChan(int rl, int ru, ConvolutionParameters params) {
 			_rl = rl; _ru = ru;
 			_params = params;
 		}
 
 		@Override
 		public Long call() throws Exception {
-			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
+			final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
 			MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false);
 			MatrixBlock outMM = new MatrixBlock(K, PQ, false);
-			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
-					LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, outIm2col, _params, true);
+			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, true, false);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// im2col(input) => _im2ColOutBlock
@@ -189,6 +192,82 @@ public class LibMatrixDNNConv2dHelper {
 		}
 	}
 	
+	/**
+	 * This implementation is similar to LoopedIm2ColConv2dAllChan, except for using a 
+	 * sparse-dense matrix multiplication with t(t(Xi) %*% t(F)) instead of a 
+	 * dense-sparse matrix multiplication with Xi %*% F.
+	 * 
+	 * NOTE: this implementation assumes that the filter is passed in transposed form
+	 * in order to share this temporary matrix (and its creation cost) across threads.
+	 */
+	public static class LoopedIm2ColConv2dTransAllChan extends LoopedIm2ColConv2dAllChan
+	{
+		public LoopedIm2ColConv2dTransAllChan(int rl, int ru, ConvolutionParameters params) {
+			super(rl, ru, params);
+		}
+
+		@Override
+		public Long call() throws Exception {
+			final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
+			MatrixBlock outIm2col = new MatrixBlock(PQ, CRS, false);
+			MatrixBlock outMM = new MatrixBlock(PQ, K, false);
+			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, true, true);
+			
+			for(int n = _rl; n < _ru; n++)  {
+				// im2col(input) => _im2ColOutBlock
+				im2ColWorker.execute(n);
+				
+				// t(_im2ColOutBlock) %*% t(filter) => t(matMultOutBlock)
+				outMM.reset(outMM.rlen, outMM.clen, false);
+				LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, false, true, _params);
+				
+				// Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos
+				partialCopyTrans(outMM, _params.output, n*K*PQ, K, PQ);
+				
+				// Add bias to current row if necessary, always dense
+				if(_params.bias != null)
+					LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
+			}
+			
+			//multi-threaded nnz maintenance of current working set
+			return _params.output.recomputeNonZeros(_rl, _ru-1);
+		}
+		
+		private static void partialCopyTrans(MatrixBlock src, MatrixBlock dest, int destPos, int K, int PQ) {
+			if( src.isEmptyBlock() )
+				return;
+			//copy src into its destination row w/ piggybacked transpose
+			//src is [PQ x K] -> [K x PQ] -> [1 x KPQ]
+			if(src.isInSparseFormat()) {
+				SparseBlock sblock = src.sparseBlock;
+				double[] c = dest.denseBlock;
+				for(int i = 0; i < src.getNumRows(); i++) {
+					if( sblock.isEmpty(i) ) continue;
+					int apos = sblock.pos(i);
+					int alen = sblock.size(i);
+					int[] aix = sblock.indexes(i);
+					double[] avals = sblock.values(i);
+					int desPosK = destPos + i;
+					for(int j = apos; j < apos+alen; j++)
+						c[desPosK+aix[j]*PQ] = avals[j];
+				}
+			}
+			else {
+				double[] a = src.denseBlock;
+				double[] c = dest.denseBlock;
+				final int blocksizeIJ = 128; //128KB for L2
+				//cache-conscious blocked execution
+				for( int bi = 0; bi < PQ; bi+=blocksizeIJ )
+					for( int bj = 0; bj < K; bj+=blocksizeIJ ) {
+						int bimin = Math.min(bi+blocksizeIJ, PQ);
+						int bjmin = Math.min(bj+blocksizeIJ, K);
+						//core transpose operation
+						for(int i=bi, aix=bi*K+bj, cix=bj*PQ+bi; i<bimin; i++, aix+=K, cix++)
+							LibMatrixReorg.transposeRow(a, c, aix, destPos+cix, PQ, bjmin-bj);
+					}
+			}
+		}
+	}
 	
 	/**
 	 * This operator is used only if native is enabled, filter is dense and input is sparse

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index 276a78e..b80a786 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -125,15 +125,28 @@ public class LibMatrixDNNHelper {
 			filters = splitFilter(params);
 		}
 		
-		boolean isEmptyDenseInput = !params.input1.isInSparseFormat() && params.input1.denseBlock == null;
+		MatrixBlock in1 = params.input1;
+		boolean isEmptyDenseInput = !in1.isInSparseFormat() && in1.denseBlock == null;
+		boolean isTransPref = in1.sparse && !params.input2.sparse && 
+			MatrixBlock.evalSparseFormatInMemory(in1.clen, in1.rlen, in1.nonZeros);
+		
+		//transpose filter once for efficient sparse-dense multiplies in LoopedIm2ColConv2dTransAllChan
+		//in order to share the temporary object and its creation costs across threads
+		if( !LibMatrixDNN.isEligibleForConv2dSparse(params) 
+			&& !isEmptyDenseInput && allChannels && isTransPref ) {
+			params.input2 = LibMatrixReorg.transpose(params.input2, 
+				new MatrixBlock(params.input2.clen, params.input2.rlen, false), k);
+		}
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(LibMatrixDNN.isEligibleForConv2dSparse(params)) 
 				ret.add(new LibMatrixDNNConv2dHelper.SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+			else if(!isEmptyDenseInput && allChannels && isTransPref)
+				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput && allChannels)
-				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dAllChannels(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput && !allChannels)
-				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dOneChannel(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters));
+				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dOneChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters));
 			else
 				throw new DMLRuntimeException("Unsupported operator");
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index 3296c7f..7b43257 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -31,7 +31,7 @@ public class LibMatrixDNNIm2ColHelper {
 	static interface Im2colWorker {
 		public void execute(int n);
 		public void execute(int n, int c);
-		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean allChannels) {
+		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean allChannels, boolean trans) {
 			if(!input.isInSparseFormat()) {
 				boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 
 						&& params.pad_h == 0 && params.pad_w == 0;
@@ -58,10 +58,11 @@ public class LibMatrixDNNIm2ColHelper {
 				//preallocate sparse-rows
 				double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, 
 					(input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
+				int estnnz = (int)Math.ceil((trans ? params.C*params.R*params.S : params.P*params.Q)*sparsity);
 				for(int r = 0; r < out.rlen; r++)
-					out.getSparseBlock().allocate(r, (int)Math.ceil(params.P*params.Q*sparsity));
+					out.getSparseBlock().allocate(r, estnnz);
 				
-				return new SparseSparseIm2colWorkerAllChannels(input, out, params);
+				return new SparseSparseIm2colWorkerAllChan(input, out, params, trans);
 			}
 		}
 	}
@@ -203,7 +204,7 @@ public class LibMatrixDNNIm2ColHelper {
 	/**
 	 * Performing dense im2col (general case)
 	 */
-	static class DenseIm2colWorkerAllChannels implements Im2colWorker {
+	private static class DenseIm2colWorkerAllChannels implements Im2colWorker {
 		double [] inputArray; double [] outputArray; 
 		int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
 		int stride_h; int stride_w; int pad_h; int pad_w;
@@ -252,17 +253,19 @@ public class LibMatrixDNNIm2ColHelper {
 	/**
 	 * Performing sparse im2col for all channels for a given row n of the input matrix.
 	 */
-	private static class SparseSparseIm2colWorkerAllChannels implements Im2colWorker {
-		final MatrixBlock input, output;
-		final int S, R, P, Q, W, HW;
-		final int stride_h, stride_w, pad_h, pad_w;
-		public SparseSparseIm2colWorkerAllChannels(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
+	private static class SparseSparseIm2colWorkerAllChan implements Im2colWorker {
+		private final MatrixBlock input, output;
+		private final int S, R, P, Q, W, HW;
+		private final int stride_h, stride_w, pad_h, pad_w;
+		private final boolean trans;
+		public SparseSparseIm2colWorkerAllChan(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
 			this.input = input;
 			this.output = im2ColOutBlock;
 			this.HW = params.H * params.W;
 			this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
 			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+			this.trans = trans;
 			if(!input.isInSparseFormat()) 
 				throw new RuntimeException("Incorrect operator selection. Expected dense input for SparseIm2colWorkerAllChannels");
 		}
@@ -275,16 +278,13 @@ public class LibMatrixDNNIm2ColHelper {
 		@Override
 		public void execute(int n) {
 			output.reset();
-			
 			SparseBlock sblock = input.sparseBlock;
-			if( sblock.isEmpty(n) ) {
+			if( sblock.isEmpty(n) )
 				return;
-			}
-			
-			int apos = input.sparseBlock.pos(n);
-			int alen = input.sparseBlock.size(n);
-			int[] aix = input.sparseBlock.indexes(n);
-			double[] avals = input.sparseBlock.values(n);
+			int apos = sblock.pos(n);
+			int alen = sblock.size(n);
+			int[] aix = sblock.indexes(n);
+			double[] avals = sblock.values(n);
 			
 			// Iterate over the sparse block
 			for(int j=apos; j<apos+alen; j++) {
@@ -297,9 +297,69 @@ public class LibMatrixDNNIm2ColHelper {
 				int wInput = chw % W; 
 				
 				appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
-						R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
+						R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans);
 			}
 			// Since the chw are appended in sorted order, no need to sort the output rows
+			// unless in trans mode, then sorting is needed
+			if( trans )
+				output.sortSparseRows();
+		}
+	}
+	
+	/**
+	 * Performing sparse im2col for a given channel c and for a given row n of the input matrix.
+	 */
+	@SuppressWarnings("unused")
+	private static class SparseSparseIm2colWorker implements Im2colWorker {
+		private final MatrixBlock input, output;
+		private final int S, R, P, Q, W, HW;
+		private final int stride_h, stride_w, pad_h, pad_w; 
+		final boolean trans;
+		
+		public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
+			this.input = input;
+			this.output = im2ColOutBlock;
+			this.HW = params.H*params.W;
+			this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+			this.trans = trans;
+		}
+		
+		@Override
+		public void execute(int n) {
+			throw new RuntimeException("Not supported");
+		}
+
+		@Override
+		public void execute(int n, int cInput) {
+			output.reset();
+			SparseBlock sblock = input.sparseBlock;
+			if( sblock.isEmpty(n) )
+				return;
+			int apos = sblock.pos(n);
+			int alen = sblock.size(n);
+			int[] aix = sblock.indexes(n);
+			double[] avals = sblock.values(n);
+				
+			// Iterate over the sparse block
+			for(int j=apos; j<apos+alen; j++) {
+				// Note: the input is of shape [N, CHW]
+				int chw = aix[j];
+				
+				if(cInput == (chw / HW)) {
+					// Get individual zero-based c,h,w indexes from zero-based 'chw'
+					int hInput = (chw - cInput*HW)/W;
+					int wInput = chw % W; 
+					
+					appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
+							R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans);
+				}
+			}
+			// Since the chw are appended in sorted order, no need to sort the output rows
+			// unless in trans mode, then sorting is needed
+			if( trans )
+				output.sortSparseRows();
 		}
 	}
 	
@@ -321,7 +381,7 @@ public class LibMatrixDNNIm2ColHelper {
 	 * @param pad_w pad width
 	 */
 	private static void appendInputValueToIm2colOutput(MatrixBlock output, int cInput, int hInput, int wInput, double value, 
-			int R, int S, int P, int Q, int stride_h, int stride_w, int pad_h, int pad_w) {
+			int R, int S, int P, int Q, int stride_h, int stride_w, int pad_h, int pad_w, boolean trans) {
 		if(value == 0) 
 			return;
 		int RS = R*S;
@@ -350,67 +410,12 @@ public class LibMatrixDNNIm2ColHelper {
 			for(int s = sMin; s <= sMax; s += stride_w) {
 				int q = (wInput - s + pad_w)  / stride_w;
 				// chw -> [crs, pq]
-				output.appendValue(outRowIndex + s, pQ + q, value);
-				// Since the chw are appended in sorted order, no need to sort the output rows
-			}
-		}
-	}
-	
-	/**
-	 * Performing sparse im2col for a given channel c and for a given row n of the input matrix.
-	 */
-	static class SparseSparseIm2colWorker implements Im2colWorker {
-		MatrixBlock input; MatrixBlock output;
-		int CRS; int S; int R; int P; int Q; int H; int W; int HW; int RS;
-		int stride_h; int stride_w; int pad_h; int pad_w; 
-		public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
-			this.input = input;
-			this.output = im2ColOutBlock;
-			this.CRS = params.C * params.R * params.S;
-			this.HW = params.H*params.W;
-			this.RS = params.R*params.S;
-			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
-			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
-			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-		}
-		
-		@Override
-		public void execute(int n) {
-			throw new RuntimeException("Not supported");
-		}
-
-		@Override
-		public void execute(int n, int cInput) {
-			if( !input.sparseBlock.isEmpty(n) ) {
-				output.sparseBlock.reset();
-				output.setNonZeros(0);
-				int apos = input.sparseBlock.pos(n);
-				int alen = input.sparseBlock.size(n);
-				int[] aix = input.sparseBlock.indexes(n);
-				double[] avals = input.sparseBlock.values(n);
-				
-				// Iterate over the sparse block
-				for(int j=apos; j<apos+alen; j++) {
-					// Note: the input is of shape [N, CHW]
-					int chw = aix[j];
-					
-					if(cInput == (chw / HW)) {
-						// Get individual zero-based c,h,w indexes from zero-based 'chw'
-						int hInput = (chw - cInput*HW)/W;
-						int wInput = chw % W; 
-						
-						appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
-								R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
-					}
-				}
+				if( trans )
+					output.appendValue(pQ + q, outRowIndex + s, value);
+				else
+					output.appendValue(outRowIndex + s, pQ + q, value);
 				// Since the chw are appended in sorted order, no need to sort the output rows
-				// if(meta.sortRows) output.sortSparseRows();
-			}
-			else {
-				output.setNonZeros(0);
 			}
 		}
-		
 	}
-
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 76002fe..8d7d4a5 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -948,7 +948,7 @@ public class LibMatrixReorg
 		
 		//compute rest (not aligned to 8-blocks)
 		for( int j=0; j<bn; j++, aix++, cix+=n2 )
-			c[ cix ] = a[ aix+0 ];	
+			c[ cix ] = a[ aix+0 ];
 		
 		//unrolled 8-blocks
 		for( int j=bn; j<len; j+=8, aix+=8, cix+=8*n2 )
@@ -960,7 +960,7 @@ public class LibMatrixReorg
 			c[ cix + 4*n2 ] = a[ aix+4 ];
 			c[ cix + 5*n2 ] = a[ aix+5 ];
 			c[ cix + 6*n2 ] = a[ aix+6 ];
-			c[ cix + 7*n2 ] = a[ aix+7 ];	
+			c[ cix + 7*n2 ] = a[ aix+7 ];
 		}
 	}
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 66a5d9a..8117c04 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -112,7 +112,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		EMPTY_BLOCK,  
 		ULTRA_SPARSE_BLOCK, //ultra sparse representation, in-mem same as sparse
 		SPARSE_BLOCK, //sparse representation, see sparseRows 
-		DENSE_BLOCK, //dense representation, see denseBlock			
+		DENSE_BLOCK, //dense representation, see denseBlock
 	}
 	
 	//matrix meta data
@@ -470,6 +470,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		return (nonZeros = nnz);
 	}
 	
+	public double getSparsity() {
+		return OptimizerUtils.getSparsity(rlen, clen, nonZeros);
+	}
+	
 	public boolean isVector() {
 		return (rlen == 1 || clen == 1);
 	}
@@ -776,7 +780,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 */
 	public void sortSparseRows() {
 		if( !sparse || sparseBlock==null )
-			return;		
+			return;
 		sparseBlock.sort();
 	}
 	
@@ -2413,10 +2417,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	////////
 	// Estimates size and sparsity
 
-	public long estimateSizeInMemory() 
-	{
-		double sp = OptimizerUtils.getSparsity(rlen, clen, nonZeros);
-		return estimateSizeInMemory(rlen, clen, sp);
+	public long estimateSizeInMemory() {
+		return estimateSizeInMemory(rlen, clen, getSparsity());
 	}
 
 	public static long estimateSizeInMemory(long nrows, long ncols, double sparsity)
@@ -2607,9 +2609,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		if( !isAllocated() ) 
 			return 44;
 		//in-memory size of dense/sparse representation
-		double sp = OptimizerUtils.getSparsity(rlen, clen, nonZeros);
 		return !sparse ? estimateSizeDenseInMemory(rlen, clen) :
-			estimateSizeSparseInMemory(rlen, clen, sp,
+			estimateSizeSparseInMemory(rlen, clen, getSparsity(),
 			SparseBlockFactory.getSparseBlockType(sparseBlock));
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
index 0a6bbb1..1341212 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -31,7 +31,6 @@ import org.junit.Test;
 
 public class Conv2DTest extends AutomatedTestBase
 {
-	
 	private final static String TEST_NAME = "Conv2DTest";
 	private final static String TEST_DIR = "functions/tensor/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DTest.class.getSimpleName() + "/";
@@ -39,77 +38,134 @@ public class Conv2DTest extends AutomatedTestBase
 	
 	@Override
 	public void setUp() {
-		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
-				new String[] {"B"}));
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B"}));
 	}
 	
 	@Test
-	public void testConv2DDense1() 
-	{
+	public void testConv2DDense1() {
 		int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	
 	@Test
-	public void testConv2DDense2() 
-	{
+	public void testConv2DDense2() {
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DDense3() 
-	{
+	public void testConv2DDense3() {
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DDense4() 
-	{
+	public void testConv2DDense4() {
 		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DDense5() 
-	{
+	public void testConv2DDense5() {
 		int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DDense6() 
-	{
+	public void testConv2DDense6() {
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 1; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DDense7() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 1; int pad = 0;
+	public void testConv2DDense7() {
+		int numImg = 3; int imgSize = 64; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DSparse1() 
-	{
-		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	public void testConv2DSparse1a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DSparse2() 
-	{
-		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
+	public void testConv2DSparse2a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
-	public void testConv2DSparse3() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+	@Test
+	public void testConv2DSparse3a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DSparse4a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DSparse5a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DSparse6a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 1; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DSparse7a() {
+		int numImg = 64; int imgSize = 16; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 1; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DSparse1b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse2b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse3b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse4b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse5b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse6b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 1; int pad = 0;
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
+	@Test
+	public void testConv2DSparse7b() {
+		int numImg = 64; int imgSize = 16; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
@@ -192,66 +248,49 @@ public class Conv2DTest extends AutomatedTestBase
 		runConv2DTest(ExecType.SPARK, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
-	/**
-	 * 
-	 * @param et
-	 * @param sparse
-	 */
 	public void runConv2DTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
 			int filterSize, int stride, int pad, boolean sparse1, boolean sparse2) 
 	{
-		RUNTIME_PLATFORM oldRTP = rtplatform;
-			
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
 		
 		try
 		{
-			String sparseVal1 = (""+sparse1).toUpperCase();
-			String sparseVal2 = (""+sparse2).toUpperCase();
-			
-	    TestConfiguration config = getTestConfiguration(TEST_NAME);
-	    if(et == ExecType.SPARK) {
-	    	rtplatform = RUNTIME_PLATFORM.SPARK;
-	    }
-	    else {
-	    	rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
-	    }
-			if( rtplatform == RUNTIME_PLATFORM.SPARK )
-				DMLScript.USE_LOCAL_SPARK_CONFIG = true;
-			
+			String sparseVal1 = String.valueOf(sparse1).toUpperCase();
+			String sparseVal2 = String.valueOf(sparse2).toUpperCase();
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
 			loadTestConfiguration(config);
-	        
-			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			
 			String RI_HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-			
-			
-			programArgs = new String[]{"-explain", "recompile_runtime", "-args",  "" + imgSize, "" + numImg, 
-				"" + numChannels, "" + numFilters, 
-				"" + filterSize, "" + stride, "" + pad, 
+			programArgs = new String[]{"-explain", "recompile_runtime", "-args", 
+				String.valueOf(imgSize), String.valueOf(numImg), 
+				String.valueOf(numChannels), String.valueOf(numFilters), 
+				String.valueOf(filterSize), String.valueOf(stride), String.valueOf(pad), 
 				output("B"), sparseVal1, sparseVal2};
-			
 			fullRScriptName = RI_HOME + TEST_NAME + ".R";
 			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
 					" " + numChannels + " " + numFilters + 
 					" " + filterSize + " " + stride + " " + pad + " " + expectedDir() +
 					" " + sparseVal1 + " " + sparseVal2; 
 			
-			boolean exceptionExpected = false;
-			int expectedNumberOfJobs = -1;
-			runTest(true, exceptionExpected, null, expectedNumberOfJobs);
-
-			// Run comparison R script
+			// Run DML and R scripts
+			runTest(true, false, null, -1);
 			runRScript(true);
-			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
 			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
-			
 		}
-		finally
-		{
-			rtplatform = oldRTP;
+		finally {
+			rtplatform = platformOld;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/test/scripts/functions/tensor/Conv2DTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.R b/src/test/scripts/functions/tensor/Conv2DTest.R
index 6c6641f..dbe9ea4 100644
--- a/src/test/scripts/functions/tensor/Conv2DTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DTest.R
@@ -33,13 +33,13 @@ x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize
 w=matrix(seq(1, numFilters*numChannels*filterSize*filterSize), numFilters, numChannels*filterSize*filterSize, byrow=TRUE)
 
 if(as.logical(args[9])) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 } else {
 	x = x - mean(x)
 }
 if(as.logical(args[10])) {
-	zero_mask = (w - mean(w)) > 0 
+	zero_mask = (w - mean(w)*1.5) > 0 
 	w = w * zero_mask
 } else {
 	w = w - mean(w)

http://git-wip-us.apache.org/repos/asf/systemml/blob/d641c224/src/test/scripts/functions/tensor/Conv2DTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.dml b/src/test/scripts/functions/tensor/Conv2DTest.dml
index 2eedca8..8ba3dcf 100644
--- a/src/test/scripts/functions/tensor/Conv2DTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DTest.dml
@@ -32,14 +32,14 @@ w=matrix(seq(1, numFilters*numChannels*filterSize*filterSize), rows=numFilters,
 b=matrix(seq(1, numFilters), rows=numFilters, cols=1) 
 
 if($9) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 }
 else {
 	x = x - mean(x)
 }
 if($10) {
-	zero_mask = (w - mean(w)) > 0 
+	zero_mask = (w - mean(w)*1.5) > 0 
 	w = w * zero_mask
 }
 else {

[2/3] systemml git commit: [SYSTEMML-1959] Extended shallow-serialize for sparse matrices

Posted by mb...@apache.org.

[SYSTEMML-1959] Extended shallow-serialize for sparse matrices 

Originally, all dense and sparse matrices were serialized into byte
arrays on write into the buffer pool. Later, we introduced the concept
of shallow-serialize where we simply hold a strong reference to all
matrices whose in-memory size is equal or close to their size in
serialized form, which greatly reduced the bufferpool overhead.

For tall and skinny sparse matrices in MCSR format (our default), this
is not the case, which causes serialization overhead and potentially
unnecessary evictions if the serialized representation is larger than
2GB (max integer size of byte arrays). This patch overcomes these
issues, by deciding upon a potential conversion from MCSR to CSR, which
allows for shallow serialize. Since this only happens in cases where we
would have serialized or evicted the MCSR block, this conversion is safe
with regard to potentially unnecessary overhead.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/db725dec
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/db725dec
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/db725dec

Branch: refs/heads/master
Commit: db725dec447be39b4e2f16ec66d669ad48217874
Parents: d641c22
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 13 22:44:09 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 14 02:30:37 2017 -0700

----------------------------------------------------------------------
 .../controlprogram/caching/ByteBuffer.java      | 17 +++---
 .../controlprogram/caching/CacheBlock.java      | 20 +++++++
 .../sysml/runtime/matrix/data/FrameBlock.java   | 11 +++-
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 56 ++++++++++++++------
 4 files changed, 81 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java
index 807fdc4..a87e4b4 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java
@@ -52,7 +52,7 @@ public class ByteBuffer
 	public void serializeBlock( CacheBlock cb ) 
 		throws IOException
 	{	
-		_shallow = cb.isShallowSerialize();
+		_shallow = cb.isShallowSerialize(true);
 		_matrix = (cb instanceof MatrixBlock);
 		
 		try
@@ -69,6 +69,10 @@ public class ByteBuffer
 			}
 			else //SPARSE/DENSE -> DENSE
 			{
+				//convert to shallow serialize block if necessary
+				if( !cb.isShallowSerialize() )
+					cb.toShallowSerializeBlock();
+				
 				//shallow serialize
 				_cdata = cb;
 			}
@@ -160,14 +164,15 @@ public class ByteBuffer
 	 */
 	public static boolean isValidCapacity( long size, CacheBlock cb )
 	{
-		if( !cb.isShallowSerialize() ) { //SPARSE matrix blocks
+		if( !cb.isShallowSerialize(true) ) { //SPARSE matrix blocks
 			// since cache blocks are serialized into a byte representation
 			// the buffer buffer can hold at most 2GB in size 
-			return ( size <= Integer.MAX_VALUE );	
+			return ( size <= Integer.MAX_VALUE );
 		}
-		else {//DENSE matrix / frame blocks
-			// since for dense matrix blocks we use a shallow serialize (strong reference), 
-			// the byte buffer can hold any size (currently upper bounded by 16GB) 
+		else {//DENSE/SPARSE matrix / frame blocks
+			// for dense and under special conditions also sparse matrix blocks 
+			// we use a shallow serialize (strong reference), there is no additional
+			// capacity constraint for serializing these blocks into byte arrays
 			return true;
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java
index eb34b09..bd03ead 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java
@@ -59,6 +59,26 @@ public interface CacheBlock extends Writable
 	public boolean isShallowSerialize();
 	
 	/**
+	 * Indicates if the cache block is subject to shallow serialize,
+	 * which is generally true if in-memory size and serialized size
+	 * are almost identical allowing to avoid unnecessary deep serialize.
+	 * 
+	 * @param inclConvert if true report blocks as shallow serialize that are
+	 * currently not amenable but can be brought into an amenable form
+	 * via {@link #toShallowSerializeBlock() toShallowSerializeBlock}.
+	 * 
+	 * @return true if shallow serialized
+	 */
+	public boolean isShallowSerialize(boolean inclConvert);
+	
+	/**
+	 * Converts a cache block that is not shallow serializable into
+	 * a form that is shallow serializable. This method has no effect
+	 * if the given cache block is not amenable.
+	 */
+	public void toShallowSerializeBlock();
+	
+	/**
 	 * Free unnecessarily allocated empty block.
 	 */
 	public void compactEmptyBlock();

http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index a79f6c2..a56fd6a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -753,15 +753,24 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	
 	@Override
 	public boolean isShallowSerialize() {
+		return isShallowSerialize(false);
+	}
+	
+	@Override
+	public boolean isShallowSerialize(boolean inclConvert) {
 		//shallow serialize if non-string schema because a frame block
 		//is always dense but strings have large array overhead per cell
 		boolean ret = true;
 		for( int j=0; j<_schema.length && ret; j++ )
 			ret &= (_schema[j] != ValueType.STRING);
-		
 		return ret;
 	}
 	
+	@Override 
+	public void toShallowSerializeBlock() {
+		//do nothing (not applicable).
+	}
+	
 	@Override
 	public void compactEmptyBlock() {
 		//do nothing

http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8117c04..e563c60 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -105,6 +105,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	public static final SparseBlock.Type DEFAULT_INPLACE_SPARSEBLOCK = SparseBlock.Type.CSR;
 	//allowed overhead for shallow serialize in terms of in-memory-size/x <= serialized-size 
 	public static final double MAX_SHALLOW_SERIALIZE_OVERHEAD = 1.3;
+	//flag if MCSR blocks that do not qualify for shallow serialize should be converted to CSR
+	public static final boolean CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE = true;
 	//basic header (int rlen, int clen, byte type)
 	public static final int HEADER_SIZE = 9;
 	
@@ -947,6 +949,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			isPM &= sblock.isEmpty(i) || sblock.size(i) == 1;
 		return isPM;
 	}
+	
+	private boolean isUltraSparseSerialize(boolean sparseDst) {
+		return nonZeros<rlen && sparseDst;
+	}
 
 	/**
 	 * Evaluates if this matrix block should be in sparse format in
@@ -1004,7 +1010,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		//ensure exact size estimates for write
 		if( nonZeros <= 0 ) {
 			recomputeNonZeros();
-		}	
+		}
 		
 		//decide on in-memory representation
 		return evalSparseFormatOnDisk(lrlen, lclen, nonZeros);
@@ -1674,8 +1680,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 				if( !a.isEmpty(i) ) {
 					boolean update = a.set(i, cl, 0);
 					if( updateNNZ )
-						nonZeros -= update ? 1 : 0;							
-				}			
+						nonZeros -= update ? 1 : 0;
+				}
 		}
 		else
 		{
@@ -1685,7 +1691,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 					a.deleteIndexRange(i, cl, cu+1);
 					if( updateNNZ )
 						nonZeros += (a.size(i)-lnnz);
-				}	
+				}
 		}
 	}
 	
@@ -2028,7 +2034,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			for(long i=0; i<nonZeros; i++) {
 				int r = in.readInt();
 				int c = in.readInt();
-				double val = in.readDouble();			
+				double val = in.readDouble();
 				denseBlock[r*clen+c] = val;
 			}
 		}
@@ -2037,7 +2043,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			//col: read iv-pairs
 			for(long i=0; i<nonZeros; i++) {
 				int r = in.readInt();
-				double val = in.readDouble();			
+				double val = in.readDouble();
 				denseBlock[r] = val;
 			}
 		}
@@ -2060,7 +2066,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			//write sparse to *
 			if( sparseBlock==null || nonZeros==0 ) 
 				writeEmptyBlock(out);
-			else if( nonZeros<rlen && sparseDst ) 
+			else if( isUltraSparseSerialize(sparseDst) ) 
 				writeSparseToUltraSparse(out); 
 			else if( sparseDst ) 
 				writeSparseBlock(out);
@@ -2072,7 +2078,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			//write dense to *
 			if( denseBlock==null || nonZeros==0 ) 
 				writeEmptyBlock(out);
-			else if( nonZeros<rlen && sparseDst )
+			else if( isUltraSparseSerialize(sparseDst) )
 				writeDenseToUltraSparse(out);
 			else if( sparseDst )
 				writeDenseToSparse(out);
@@ -2127,8 +2133,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 					for(int j=pos; j<pos+nr; j++) {
 						out.writeInt(cols[j]);
 						out.writeDouble(values[j]);
-					}					
-				}	
+					}
+				}
 			}
 			for(;r<rlen; r++)
 				out.writeInt(0);
@@ -2206,14 +2212,14 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 						for( ; j<aix[apos+j2]; j++ )
 							out.writeDouble( 0 );
 						out.writeDouble( avals[apos+j2] );
-					}					
+					}
 					//remaining 0 values in row
 					for( int j=aix[apos+alen-1]+1; j<clen; j++)
 						out.writeDouble( 0 );
 				}
 				else //empty row
 					for( int j=0; j<clen; j++ )
-						out.writeDouble( 0 );	
+						out.writeDouble( 0 );
 			}
 		}
 	}
@@ -2361,7 +2367,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		}
 		else {
 			//default serialize (general case)
-			write(os);	
+			write(os);
 		}
 	}
 	
@@ -2621,12 +2627,30 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	
 	@Override
 	public boolean isShallowSerialize() {
+		return isShallowSerialize(false);
+	}
+	
+	@Override
+	public boolean isShallowSerialize(boolean inclConvert) {
 		//shallow serialize if dense, dense in serialized form or already in CSR
-		return !sparse || !evalSparseFormatOnDisk()
+		boolean sparseDst = evalSparseFormatOnDisk();
+		return !sparse || !sparseDst
 			|| (sparse && sparseBlock instanceof SparseBlockCSR)
 			|| (sparse && sparseBlock instanceof SparseBlockMCSR
-				&& getInMemorySize()/MAX_SHALLOW_SERIALIZE_OVERHEAD 
-				<= getExactSerializedSize());
+				&& getInMemorySize() / MAX_SHALLOW_SERIALIZE_OVERHEAD 
+				<= getExactSerializedSize())
+			|| (sparse && sparseBlock instanceof SparseBlockMCSR
+				&& nonZeros < Integer.MAX_VALUE //CSR constraint
+				&& inclConvert && CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE
+				&& !isUltraSparseSerialize(sparseDst));
+	}
+	
+	@Override 
+	public void toShallowSerializeBlock() {
+		if( isShallowSerialize() || !isShallowSerialize(true) )
+			return;
+		sparseBlock = SparseBlockFactory.copySparseBlock(
+			SparseBlock.Type.CSR, sparseBlock, false);
 	}
 	
 	@Override

[3/3] systemml git commit: [SYSTEMML-1960] Shallow copy on right-indexing of sparse row batches

Posted by mb...@apache.org.

[SYSTEMML-1960] Shallow copy on right-indexing of sparse row batches

This patch improves the performance of right-indexing row batches from
sparse matrices with X[rl:ru, ]. In case of our default MCSR sparse
block representation, we now use a shallow copy (pointer move) of rows
because the physical row representation remains unchanged. Result
correctness is ensured because we only use this in the CP matrix
indexing instruction, where other instructions are guaranteed to adhere
to copy on write semantics.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8ca61ae2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8ca61ae2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8ca61ae2

Branch: refs/heads/master
Commit: 8ca61ae269ce4e365c57f74ff0487d483766ccf6
Parents: db725de
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 13 23:29:40 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 14 02:30:38 2017 -0700

----------------------------------------------------------------------
 .../cp/MatrixIndexingCPInstruction.java         | 16 +++++++------
 .../data/BinaryBlockToRowBlockConverter.java    |  3 +--
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 25 +++++++++++++-------
 3 files changed, 26 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/8ca61ae2/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixIndexingCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixIndexingCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixIndexingCPInstruction.java
index 85c2a3d..bee0434 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixIndexingCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/MatrixIndexingCPInstruction.java
@@ -48,7 +48,7 @@ public final class MatrixIndexingCPInstruction extends IndexingCPInstruction {
 	@Override
 	public void processInstruction(ExecutionContext ec)
 			throws DMLRuntimeException 
-	{	
+	{
 		String opcode = getOpcode();
 		IndexRange ixrange = getIndexRange(ec);
 		
@@ -64,17 +64,19 @@ public final class MatrixIndexingCPInstruction extends IndexingCPInstruction {
 				resultBlock = mo.readMatrixPartition(ixrange.add(1));
 			else //via slicing the in-memory matrix
 			{
-				//execute right indexing operation
+				//execute right indexing operation (with shallow row copies for range
+				//of entire sparse rows, which is safe due to copy on update)
 				MatrixBlock matBlock = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-				resultBlock = matBlock.sliceOperations(ixrange, new MatrixBlock());	
+				resultBlock = matBlock.sliceOperations((int)ixrange.rowStart, (int)ixrange.rowEnd, 
+					(int)ixrange.colStart, (int)ixrange.colEnd, false, new MatrixBlock());
 				
 				//unpin rhs input
 				ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
 				
 				//ensure correct sparse/dense output representation
-				//(memory guarded by release of input)
-				resultBlock.examSparsity(getExtendedOpcode());
-			}	
+				if( checkGuardedRepresentationChange(matBlock, resultBlock) )
+					resultBlock.examSparsity(getExtendedOpcode());
+			}
 			
 			//unpin output
 			ec.setMatrixOutput(output.getName(), resultBlock, getExtendedOpcode());
@@ -119,6 +121,6 @@ public final class MatrixIndexingCPInstruction extends IndexingCPInstruction {
 			ec.setMatrixOutput(output.getName(), resultBlock, updateType, getExtendedOpcode());
 		}
 		else
-			throw new DMLRuntimeException("Invalid opcode (" + opcode +") encountered in MatrixIndexingCPInstruction.");		
+			throw new DMLRuntimeException("Invalid opcode (" + opcode +") encountered in MatrixIndexingCPInstruction.");
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/8ca61ae2/src/main/java/org/apache/sysml/runtime/matrix/data/BinaryBlockToRowBlockConverter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/BinaryBlockToRowBlockConverter.java b/src/main/java/org/apache/sysml/runtime/matrix/data/BinaryBlockToRowBlockConverter.java
index 68bf038..cd133da 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/BinaryBlockToRowBlockConverter.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/BinaryBlockToRowBlockConverter.java
@@ -92,8 +92,7 @@ public class BinaryBlockToRowBlockConverter implements Converter<MatrixIndexes,
 			//rowlow, rowup, collow, colup (1-based specification)
 			_srcBlock.sliceOperations( _pos, _pos, 0, _srcBlock.getNumColumns()-1, _destBlock );
 		}
-		catch(DMLException ex)
-		{
+		catch(DMLException ex) {
 			throw new RuntimeException("Failed to slice matrix block into row blocks.", ex);
 		}
 			

http://git-wip-us.apache.org/repos/asf/systemml/blob/8ca61ae2/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index e563c60..dac4315 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3833,10 +3833,15 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		return ret;
 	}
 
-	public final MatrixBlock sliceOperations(IndexRange ixrange, MatrixBlock ret) throws DMLRuntimeException {
+	public MatrixBlock sliceOperations(IndexRange ixrange, MatrixBlock ret) throws DMLRuntimeException {
 		return sliceOperations(
-				(int)ixrange.rowStart, (int)ixrange.rowEnd, 
-				(int)ixrange.colStart, (int)ixrange.colEnd, ret);
+			(int)ixrange.rowStart, (int)ixrange.rowEnd, 
+			(int)ixrange.colStart, (int)ixrange.colEnd, true, ret);
+	}
+	
+	@Override
+	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, CacheBlock ret) throws DMLRuntimeException {
+		return sliceOperations(rl, ru, cl, cu, true, ret);
 	}
 	
 	/**
@@ -3851,7 +3856,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * @return matrix block
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, CacheBlock ret) 
+	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) 
 		throws DMLRuntimeException 
 	{	
 		// check the validity of bounds
@@ -3880,7 +3885,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		{
 			//core slicing operation (nnz maintained internally)
 			if (sparse) 
-				sliceSparse(rl, ru, cl, cu, result);
+				sliceSparse(rl, ru, cl, cu, deep, result);
 			else 
 				sliceDense(rl, ru, cl, cu, result);
 		}
@@ -3888,7 +3893,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		return result;
 	}
 
-	private void sliceSparse(int rl, int ru, int cl, int cu, MatrixBlock dest) 
+	private void sliceSparse(int rl, int ru, int cl, int cu, boolean deep, MatrixBlock dest) 
 		throws DMLRuntimeException
 	{
 		//check for early abort
@@ -3909,10 +3914,12 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 				}
 			}
 		}
-		else if( rl==ru && cl==0 && cu==clen-1 ) //ROW VECTOR 
+		else if( cl==0 && cu==clen-1 ) //ROW batch
 		{
 			//note: always sparse dest, but also works for dense
-			dest.appendRow(0, sparseBlock.get(rl));
+			boolean ldeep = (deep && sparseBlock instanceof SparseBlockMCSR);
+			for(int i = rl; i <= ru; i++)
+				dest.appendRow(i-rl, sparseBlock.get(i), ldeep);
 		}
 		else //general case (sparse/dense dest)
 		{
@@ -3926,7 +3933,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 					int astart = (cl>0)?sparseBlock.posFIndexGTE(i, cl) : apos;
 					if( astart != -1 )
 						for( int j=astart; j<apos+alen && aix[j] <= cu; j++ )
-							dest.appendValue(i-rl, aix[j]-cl, avals[j]);	
+							dest.appendValue(i-rl, aix[j]-cl, avals[j]);
 				}
 		}
 	}