Posted to commits@systemml.apache.org by ni...@apache.org on 2016/08/14 20:02:51 UTC

incubator-systemml git commit: [SYSTEMML-540] Removed poorly performing operators and avoided unnecessary sparse conversions

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 623779912 -> e9aa58414


[SYSTEMML-540] Removed poorly performing operators and avoided unnecessary sparse conversions

- Removed im2col, col2im, rotate180, reshape_col as instructions
- Improved performance of conv2d, conv2d_backward, conv2d_backward_filter
- Converted sparse filters to dense
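
For reference, a minimal sketch (not part of this commit) of the dense-filter
conversion mentioned in the last bullet. The class and method names are
hypothetical; the sketch assumes the org.apache.sysml.runtime.matrix.data
package, since MatrixBlock.sparseToDense() is package-private after this change.

    package org.apache.sysml.runtime.matrix.data;

    import org.apache.sysml.runtime.DMLRuntimeException;

    class DenseFilterSketch {
        // Filters are small relative to the image input, so keeping them dense
        // avoids repeated sparse conversions inside conv2d and its backward passes.
        static void ensureDenseFilter(MatrixBlock filter) throws DMLRuntimeException {
            if (filter.isInSparseFormat()) {
                filter.sparseToDense();
            }
        }
    }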


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e9aa5841
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e9aa5841
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e9aa5841

Branch: refs/heads/master
Commit: e9aa58414fcbcc39b9099e8722ab40e7c60a159f
Parents: 6237799
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Sun Aug 14 12:58:54 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Sun Aug 14 12:58:54 2016 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 133 +-----------
 src/main/java/org/apache/sysml/hops/Hop.java    |   5 -
 .../java/org/apache/sysml/hops/ReorgOp.java     |   7 -
 .../apache/sysml/lops/ConvolutionTransform.java |  21 +-
 .../sysml/parser/BuiltinFunctionExpression.java |   4 +-
 .../org/apache/sysml/parser/DMLTranslator.java  |  41 +---
 .../instructions/CPInstructionParser.java       |   4 -
 .../cp/ConvolutionCPInstruction.java            |  78 +------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 202 ++-----------------
 .../runtime/matrix/data/LibMatrixMult.java      |  31 ++-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |   2 +-
 .../sysml/runtime/util/ConvolutionUtils.java    | 201 ------------------
 .../functions/tensor/Conv2DBackwardTest.java    |  51 +----
 .../functions/tensor/Conv2DTest.java            |  53 +----
 14 files changed, 74 insertions(+), 759 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index fe277d1..8c38a48 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -33,15 +33,12 @@ import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 public class ConvolutionOp extends Hop  implements MultiThreadedHop
 {	
 	private Hop.ConvOp op;
 
 	private int _maxNumThreads = -1; //-1 for unlimited
-	
-	public static boolean FORCE_NON_IM2COL = false;
 
 	private ConvolutionOp() {
 		//default constructor for clone
@@ -94,41 +91,14 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		ExecType et = optFindExecType();
 		
-		Lop ret = ConvolutionUtils.constructConvolutionLops(this, et);
-		if(ret != null) {
-			setLops(ret);
-			return ret;
-		}
-		ret = ConvolutionUtils.constructConvolutionBackwardDataLops(this, et);
-		if(ret != null) {
-			setLops(ret);
-			return ret;
-		}
-		
 		ArrayList<Hop> inputs = getInput();
 		switch( op )
 		{
-			case IM2COL:
-			case RESHAPE_COL:
-			case ROTATE180:
-			case COL2IM:
-			{	
-				et = ExecType.CP; // TODO: Since max_backwards and other Convolution Ops only implemented for CP
-				
-				if( et == ExecType.CP  )
-				{
-					setLops(constructConvolutionLops(et, inputs));
-					break;
-				}
-				else {
-					// TODO: Add support for SPARK/MR backends once we are happy with the performance of
-					// single node Lenet script. 
-					throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name());
-				}
-				// break;
-			}
 			case MAX_POOLING:
 			case MAX_POOLING_BACKWARD:
+			case DIRECT_CONV2D:
+			case DIRECT_CONV2D_BACKWARD_DATA:
+			case DIRECT_CONV2D_BACKWARD_FILTER:
 			{	
 				//TODO: Fix me. Currently forcing the instruction to GPU if gpu flag is set
 				if(DMLScript.USE_ACCELERATOR) {
@@ -147,22 +117,6 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 				}
 				// break;
 			}
-			case DIRECT_CONV2D:
-			case DIRECT_CONV2D_BACKWARD_DATA:
-			case DIRECT_CONV2D_BACKWARD_FILTER:
-			{	
-				if( et == ExecType.GPU )
-				{
-					setLops(constructConvolutionLops(et, inputs));
-					break;
-				}
-				else {
-					// TODO: Add support for SPARK/MR backends once we are happy with the performance of
-					// single node Lenet script. 
-					throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name());
-				}
-				// break;
-			}
 			default: 
 				throw new HopsException("Unsupported lops construction for operation type '"+op+"'.");
 		}
@@ -261,24 +215,6 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	protected double computeOutputMemEstimate( long dim1, long dim2, long nnz )
 	{		
 		double sparsity = 1.0;
-		switch(op) 
-		{
-			case RESHAPE_COL:
-			case ROTATE180:
-			{
-				sparsity = OptimizerUtils.getSparsity(dim1, dim2, nnz);
-				break;
-			}
-			case IM2COL:
-			case COL2IM: 
-			case MAX_POOLING: 
-			case MAX_POOLING_BACKWARD:
-			case DIRECT_CONV2D: 
-			case DIRECT_CONV2D_BACKWARD_FILTER: 
-			case DIRECT_CONV2D_BACKWARD_DATA:
-				sparsity = 1.0; // worst-case estimate
-				break;
-		}
 		return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity);
 	}
 	
@@ -306,38 +242,6 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		switch(op) 
 		{
-			case RESHAPE_COL:
-			{				
-				ret = new long[3];
-				ret[0] = params.N;
-				ret[1] = getExtractedVal(params.K, params.P, params.Q);
-				ret[2] = mc.getNonZeros(); // exact estimates
-				break;
-			}
-			case ROTATE180:
-			{
-				ret = new long[3];
-				ret[0] = getExtractedVal(params.N, params.P, params.Q);
-				ret[1] = params.K;
-				ret[2] = mc.getNonZeros(); // exact estimates
-				break;
-			}
-			case IM2COL:
-			{
-				ret = new long[3];
-				ret[0] = getExtractedVal(params.C, params.R, params.S);
-				ret[1] = getExtractedVal(params.N, params.P, params.Q);
-				ret[2] = -1;
-				break;
-			}
-			case COL2IM:
-			{
-				ret = new long[3];
-				ret[0] = params.N;
-				ret[1] = getExtractedVal(params.C, params.H, params.W);
-				ret[2] = -1;
-				break;
-			}
 			case MAX_POOLING:
 			{
 				ret = new long[3];
@@ -496,8 +400,6 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	@Override
 	public void refreshSizeInformation()
 	{
-		Hop input1 = getInput().get(0);
-		
 		ConvolutionParameters params;
 		try {
 			params = parseInput();
@@ -507,35 +409,6 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		switch(op) 
 		{
-			case IM2COL:
-			{
-				_dim1 = getExtractedVal(params.C, params.R, params.S);
-				_dim2 = getExtractedVal(params.N, params.P, params.Q);
-				_nnz = -1;
-				break;
-			}
-			case COL2IM:
-			{
-				// Set _dim1, _dim2 and if possible _nnz (use input1.getNnz())
-				_dim1 = params.N;
-				_dim2 = getExtractedVal(params.C, params.H, params.W);
-				_nnz = -1; // cannot infer stats
-				break;
-			}
-			case RESHAPE_COL:
-			{
-				_dim1 = params.N;
-				_dim2 = getExtractedVal(params.K, params.P, params.Q);
-				_nnz = input1.getNnz(); // exact estimates
-				break;
-			}
-			case ROTATE180:
-			{
-				_dim1 = getExtractedVal(params.N, params.P, params.Q);
-				_dim2 = params.K;
-				_nnz = input1.getNnz(); // exact estimates
-				break;
-			}
 			case MAX_POOLING:
 			{	
 				_dim1 = params.N;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 7d69940..6afe60e 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -1147,7 +1147,6 @@ public abstract class Hop
 	};
 	
 	public enum ConvOp {
-		IM2COL, RESHAPE_COL, ROTATE180, COL2IM, 
 		MAX_POOLING, MAX_POOLING_BACKWARD,
 		DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA
 	};
@@ -1220,10 +1219,6 @@ public abstract class Hop
 	protected static final HashMap<ConvOp, org.apache.sysml.lops.ConvolutionTransform.OperationTypes> HopsConv2Lops;
 	static {
 		HopsConv2Lops = new HashMap<ConvOp, org.apache.sysml.lops.ConvolutionTransform.OperationTypes>();
-		HopsConv2Lops.put(ConvOp.IM2COL, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.IM2COL);
-		HopsConv2Lops.put(ConvOp.RESHAPE_COL, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.RESHAPE_COL);
-		HopsConv2Lops.put(ConvOp.ROTATE180, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.ROTATE180);
-		HopsConv2Lops.put(ConvOp.COL2IM, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.COL2IM);
 		HopsConv2Lops.put(ConvOp.MAX_POOLING, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.MAX_POOLING);
 		HopsConv2Lops.put(ConvOp.MAX_POOLING_BACKWARD, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.MAX_POOLING_BACKWARD);
 		HopsConv2Lops.put(ConvOp.DIRECT_CONV2D, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/hops/ReorgOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java
index 5f5138b..7c87a76 100644
--- a/src/main/java/org/apache/sysml/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java
@@ -35,7 +35,6 @@ import org.apache.sysml.lops.Transform.OperationTypes;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 /**
  *  Reorg (cell) operation: aij
@@ -120,12 +119,6 @@ public class ReorgOp extends Hop implements MultiThreadedHop
 		if( getLops() != null )
 			return getLops();
 
-		Lop ret = ConvolutionUtils.constructConvolutionBackwardFilterLops(this);
-		if(ret != null) {
-			setLops( ret );
-			return ret;
-		}
-		
 		ExecType et = optFindExecType();
 		
 		switch( op )

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index fdf280d..9164d36 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -30,12 +30,7 @@ public class ConvolutionTransform extends Lop
 
 	
 	public enum OperationTypes {
-		IM2COL,
-		RESHAPE_COL,
-		ROTATE180,
-		COL2IM,
-		MAX_POOLING,
-		MAX_POOLING_BACKWARD,
+		MAX_POOLING, MAX_POOLING_BACKWARD,
 		DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA
 	};
 	
@@ -101,19 +96,7 @@ public class ConvolutionTransform extends Lop
 
 	private String getOpcode() {
 		switch(operation) {
-			
-		case IM2COL:
-			return "im2col";
-			
-		case RESHAPE_COL:
-			return "reshape_col";
-		
-		case ROTATE180:
-			return "rotate180";
-		
-		case COL2IM:
-			return "col2im";
-			
+				
 		case MAX_POOLING:
 			return "maxpooling";
 			

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
index 3bb7b0a..bf31347 100644
--- a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
@@ -1109,8 +1109,8 @@ public class BuiltinFunctionExpression extends DataIdentifier
 		case MAX_POOL_BACKWARD:
 		{
 			// At DML level:
-			// output = conv2d(input, filter, input_shape=[3, 2, 2], filter_shape=[3, 2, 2], 
-			// strides=[1, 1], border_mode="valid")
+			// output = conv2d(input, filter, input_shape=[1, 3, 2, 2], filter_shape=[1, 3, 2, 2], 
+			// strides=[1, 1], padding=[1,1])
 			// 
 			// Converted to following in constructor (only supported NCHW):
 			// output = conv2d(input, filter, stride1, stride2, padding1,padding2,  

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index b5bb7c3..f3cb0b1 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -2803,18 +2803,9 @@ public class DMLTranslator
 			
 		case CONV2D:
 		{
-			Hop filter = expr2;
-			// Step 1: IM2COL
 			Hop image = expr;
-			ArrayList<Hop> inHops1 = getALHopsForConvOp(image, source, 2, hops);
-			Hop loweredMat = new ConvolutionOp(image.getName(), image.getDataType(), image.getValueType(), Hop.ConvOp.IM2COL, inHops1);
-
-			// Step 2: Matrix multiplication
-			Hop temp = new AggBinaryOp("temp" + target.getName(), target.getDataType(), target.getValueType(), OpOp2.MULT, AggOp.SUM, filter, loweredMat);
-
-			// Step 3: Reshape col
-			ArrayList<Hop> inHops2 = getALHopsForConvOp(temp, source, 2, hops);
-			currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.RESHAPE_COL, inHops2);
+			ArrayList<Hop> inHops1 = getALHopsForConvOp(image, source, 1, hops);
+			currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.DIRECT_CONV2D, inHops1);
 			setBlockSizeAndRefreshSizeInfo(image, currBuiltinOp);
 			break;
 		}
@@ -2841,33 +2832,17 @@ public class DMLTranslator
 		case CONV2D_BACKWARD_FILTER:
 		{
 			Hop image = expr;
-			Hop dout = expr2;
-
-			ArrayList<Hop> inHops1 = getALHopsForConvOp(image, source, 2, hops);
-			Hop x_col = new ConvolutionOp(image.getName(), image.getDataType(), image.getValueType(), Hop.ConvOp.IM2COL, inHops1);
-
-			ArrayList<Hop> inHops2 = getALHopsForConvOp(dout, source, 2, hops);
-			Hop dout_reshaped = new ConvolutionOp(dout.getName(), dout.getDataType(), dout.getValueType(), Hop.ConvOp.ROTATE180, inHops2);
-
-			Hop dfilter1 = new AggBinaryOp(target.getName(), target.getDataType(), target.getValueType(), OpOp2.MULT, AggOp.SUM, x_col, dout_reshaped);
-			currBuiltinOp = new ReorgOp("tempTranspose" + image.getName(), image.getDataType(), image.getValueType(), Hop.ReOrgOp.TRANSPOSE, dfilter1);
+			ArrayList<Hop> inHops1 = getALHopsForConvOp(image, source, 1, hops);
+			currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, inHops1);
 			setBlockSizeAndRefreshSizeInfo(image, currBuiltinOp);
 			break;
 		}
 		case CONV2D_BACKWARD_DATA:
 		{
-			Hop filter = expr;
-			Hop dout = expr2;
-
-			ArrayList<Hop> inHops1 = getALHopsForConvOp(dout, source, 2, hops);
-			Hop dout_reshaped = new ConvolutionOp(dout.getName(), dout.getDataType(), dout.getValueType(), Hop.ConvOp.ROTATE180, inHops1);
-
-			Hop temp1 = new AggBinaryOp("temp" + target.getName(), target.getDataType(), target.getValueType(), OpOp2.MULT, AggOp.SUM, dout_reshaped, filter);
-			// Hop temp2 = new ReorgOp("tempTranspose" + target.getName(), target.getDataType(), target.getValueType(), Hop.ReOrgOp.TRANSPOSE, temp1);
-
-			ArrayList<Hop> inHops2 = getALHopsForConvOp(temp1, source, 2, hops);
-			currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.COL2IM, inHops2);
-			setBlockSizeAndRefreshSizeInfo(filter, currBuiltinOp);
+			Hop image = expr;
+			ArrayList<Hop> inHops1 = getALHopsForConvOp(image, source, 1, hops);
+			currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.DIRECT_CONV2D_BACKWARD_DATA, inHops1);
+			setBlockSizeAndRefreshSizeInfo(image, currBuiltinOp);
 			break;
 		}
 			 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index ae13d3d..909525f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -218,10 +218,6 @@ public class CPInstructionParser extends InstructionParser
 		String2CPInstructionType.put( "rsort"      , CPINSTRUCTION_TYPE.Reorg);
 
 		// Opcodes related to convolutions
-		String2CPInstructionType.put( "im2col"      , CPINSTRUCTION_TYPE.Convolution);
-		String2CPInstructionType.put( "reshape_col"      , CPINSTRUCTION_TYPE.Convolution);
-		String2CPInstructionType.put( "rotate180"      , CPINSTRUCTION_TYPE.Convolution);
-		String2CPInstructionType.put( "col2im"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "maxpooling"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "maxpooling_backward"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "conv2d"      , CPINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 4b04eca..5e83ffa 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -80,13 +80,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 		String opcode = parts[0];
-		if (opcode.equalsIgnoreCase("reshape_col")
-				|| opcode.equalsIgnoreCase("rotate180")
-				|| opcode.equalsIgnoreCase("im2col")
-				|| opcode.equalsIgnoreCase("col2im")
-				|| opcode.equalsIgnoreCase("pooling_pre_reshape")
-				|| opcode.equalsIgnoreCase("pooling_post_reshape")
-				|| opcode.equalsIgnoreCase("maxpooling")) {
+		if (opcode.equalsIgnoreCase("maxpooling")) {
 			InstructionUtils.checkNumFields(parts, 15);
 			// stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
@@ -115,8 +109,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			return new ConvolutionCPInstruction(in, out, opcode, str, stride,
 					padding, input_shape, filter_shape, k);
 		} 
-		else if (opcode.equalsIgnoreCase("pooling_backward_reshape")
-				|| opcode.equalsIgnoreCase("maxpooling_backward")
+		else if (opcode.equalsIgnoreCase("maxpooling_backward")
 				|| opcode.equalsIgnoreCase("conv2d")
 				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
 				|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
@@ -186,38 +179,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
 		
 		ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
-		
-		if (instOpcode.equalsIgnoreCase("im2col")) {
-			checkHeightWidth(ec, params);
-			checkInputDimensionForIm2col(matBlock, params);
-			outputBlock = getDenseOutputBlock(ec, C * R * S, N * P * Q, true);
-			params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-			LibMatrixDNN.im2col(matBlock, outputBlock, params);
-		}
-		else if (instOpcode.equalsIgnoreCase("reshape_col")) {
-			checkHeightWidth(ec, params);
-			// Is eligible for REUSE_NONZEROED_OUTPUT but cannot guarantee that previous output has been rmvar-ed
-			// without somewhat expensive HashMap checks
-			outputBlock = getDenseOutputBlock(ec, N, K * P * Q, true);
-			params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-			LibMatrixDNN.reshape_col(matBlock, outputBlock, params);
-		}
-		else if (instOpcode.equalsIgnoreCase("rotate180")) {
-			checkHeightWidth(ec, params);
-			// Is eligible for REUSE_NONZEROED_OUTPUT and always an intermediate instruction
-			outputBlock = getDenseOutputBlock(ec, N * P * Q, K, true);
-			params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-			LibMatrixDNN.rotate180(matBlock, outputBlock, params);
-		}
-		else if (instOpcode.equalsIgnoreCase("col2im")) {
-			checkHeightWidth(ec, params);
-			checkInputDimensionForCol2im(matBlock, params);
-			// needs to be zeroed-out
-			outputBlock = getDenseOutputBlock(ec, N, C * H * W, false);
-			params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-			LibMatrixDNN.col2im(matBlock, outputBlock, params);
-		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
+		if (instOpcode.equalsIgnoreCase("maxpooling")) {
 			// Is eligible for REUSE_NONZEROED_OUTPUT but cannot guarantee that previous output has been rmvar-ed
 			// without somewhat expensive HashMap checks
 			outputBlock = getDenseOutputBlock(ec, N, C*P*Q, true);
@@ -284,38 +246,4 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			Statistics.incrementAllocationTime(System.nanoTime()-start, false);
 		return outputBlock;
 	}
-	
-	private void checkHeightWidth(ExecutionContext ec, ConvolutionParameters params) throws DMLRuntimeException {
-		int numChannelsInFilter = getScalarInput(ec, _filter_shape, 1);
-		
-		if (numChannelsInFilter != params.C) { 
-			throw new DMLRuntimeException("The number of channels of input and filter should match");
-		}
-		if((params.W + 2 * params.pad_w - params.S) % params.stride_w != 0) {
-			throw new DMLRuntimeException("The width does not work (Hint: (W + 2 * pad_w - S) % stride_w should be 0 [ ==> (" + params.W + "+" + " 2*" + params.pad_w + "-" +  params.S + ") % " + params.stride_w + "!= 0] ");
-		}
-		if((params.H + 2 * params.pad_h - params.R) % params.stride_h != 0) {
-			throw new DMLRuntimeException("The height does not work (Hint: (H + 2 * pad_h - R) % stride_h should be 0 [ ==> (" + params.H + "+" + " 2*" + params.pad_h + "-" +  params.R + ") % " + params.stride_h + "!= 0] ");
-		}
-		if(params.H <= 0) {
-			throw new DMLRuntimeException("Height of output patch should be zero");
-		}
-		if(params.Q <= 0) {
-			throw new DMLRuntimeException("Width of output patch should be zero");
-		}
-	}
-
-
-
-	private void checkInputDimensionForIm2col(MatrixBlock matBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		if((params.N != matBlock.getNumRows() || params.C*params.H*params.W != matBlock.getNumColumns())) {
-			throw new DMLRuntimeException("Incorrect input shape in im2col");
-		}
-	}
-	
-	private void checkInputDimensionForCol2im(MatrixBlock matBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		if((params.N*params.P*params.Q != matBlock.getNumRows() || params.C*params.R*params.S != matBlock.getNumColumns())) {
-			throw new DMLRuntimeException("Incorrect input shape in col2im");
-		}
-	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 59a6a47..c2b3f7d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -74,7 +74,7 @@ public class LibMatrixDNN {
 	}
 	
 	enum TaskType {
-		ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, MaxPooling_Backward, 
+		MaxPooling_Forward, MaxPooling_Backward, 
 		LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData
 	}
 	
@@ -250,6 +250,11 @@ public class LibMatrixDNN {
 			throw new DMLRuntimeException("Only positive strides supported");
 		}
 		
+		// Convert the filter (which is a relatively small matrix) to dense
+		if(params.input1.isInSparseFormat()) {
+			params.input1.sparseToDense();
+		}
+		
 		if(DMLScript.STATISTICS) {
 			if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
 				conv2dBwdDataSparseCount.addAndGet(1);
@@ -375,7 +380,7 @@ public class LibMatrixDNN {
 		
 		MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
 		long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-		LibMatrixMult.matrixMult(dout_reshaped, filter, temp);
+		LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
 		long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
 		doCol2imOverSingleImage(n, temp, params);
 		long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
@@ -400,7 +405,7 @@ public class LibMatrixDNN {
 		
 		MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
 		long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
-		LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp);
+		LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
 		long t4 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
 		if(DMLScript.STATISTICS) {
 			loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
@@ -427,6 +432,11 @@ public class LibMatrixDNN {
 			throw new DMLRuntimeException("Incorrect input to conv2d");
 		}
 		
+		// Convert the filter (which is a relatively small matrix) to dense
+		if(params.input2.isInSparseFormat()) {
+			params.input2.sparseToDense();
+		}
+		
 		if(DMLScript.STATISTICS) {
 			if(input.isInSparseFormat() || filter.isInSparseFormat()) {
 				conv2dSparseCount.addAndGet(1);
@@ -461,7 +471,7 @@ public class LibMatrixDNN {
 		
 		im2ColOutBlock.setNonZeros(nnz);
 		MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
-		LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock);
+		LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
 		long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		
 		if(DMLScript.STATISTICS) {
@@ -751,37 +761,6 @@ public class LibMatrixDNN {
 		}
 		params.outputNNZ.addAndGet(tmpNNZ);
 	}
-		
-	// Reshape a 4D tensor of dimension (N, K, P, Q) to matrix of dimension (NPQ, K)
-	public static void rotate180(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
-		params.output = outputBlock;
-		
-		if(input.getNumColumns() != params.K*params.P*params.Q || input.getNumRows() != params.N) {
-			throw new DMLRuntimeException("Incorrect input dimensions in rotate180:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q);
-		}
-		
-		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-		if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-			warnSingleThreaded();
-			for (int n = 0; n < params.N; n++) {
-				doRotate180(n, params);
-			}
-		}
-		else {
-			runConvTask(constrainedNumThreads, 1, TaskType.Rotate180, params);
-		}
-		outputBlock.setNonZeros(input.getNonZeros()); // As number of non-zeros doesnot change for rotate180
-	}
-	
-	private static void doRotate180(int n, ConvolutionParameters params) throws DMLRuntimeException {
-		double [] outputArray = null;
-		if (!params.output.isInSparseFormat())
-			outputArray = params.output.getDenseBlock();
-		else
-			throw new DMLRuntimeException("Sparse output is not supported for rotate180");
-		doRotate180(n, n, params.input1, outputArray, params, false);
-	}
 	
 	private static void doRotate180(int inputN, int outputN, MatrixBlock input, 
 			double [] outputArray,  ConvolutionParameters params, boolean zeroOutSparseOutput) throws DMLRuntimeException {
@@ -818,29 +797,6 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	
-	// Reshape a matrix of dimension (K, NPQ) to 4D tensor of dimension (N, K, P, params.Q)
-	public static void reshape_col(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
-		params.output = outputBlock;
-		
-		if(input.getNumColumns() != params.N*params.P*params.Q || input.getNumRows() != params.K) {
-			throw new DMLRuntimeException("Incorrect input dimensions in reshape_col:" + input.getNumRows() + " " + input.getNumColumns());
-		}
-		
-		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-		if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-			warnSingleThreaded();
-			for (int n = 0; n < params.N; n++) { 
-				doReshapeCol(n, params);
-			}
-		}
-		else {
-			runConvTask(constrainedNumThreads, 1, TaskType.ReshapeCol, params);
-		}
-		outputBlock.setNonZeros(input.getNonZeros()); // As number of non-zeros doesnot change for reshape_col
-	}
-	
 	private static int [] getTaskSize(int constrainedNumThreads, int maxNumTaskSize1, int maxNumTaskSize2) {
 		int taskSize1 = 1; int taskSize2 = 1;
 		// Why this heuristics ? To reduce the impact of the thread-creation overhead in case of small tasks
@@ -939,30 +895,6 @@ public class LibMatrixDNN {
 		@Override
 		public Object call() throws DMLRuntimeException {
 			switch(type) {
-				case ReshapeCol:
-					for (int n = n1; n < n2; n++) {
-						doReshapeCol(n, params);
-					}
-					break;
-				case Rotate180:
-					for (int n = n1; n < n2; n++) {
-						doRotate180(n, params);
-					}
-					break;
-				case Im2Col:
-					long nnz = 0;
-					for (int n = n1; n < n2; n++) {
-						for (int z = z1; z < z2; z++) {
-							nnz += doIm2colOverInputPath_NCHW(n, z, params);
-						}
-					}
-					params.outputNNZ.addAndGet(nnz);
-					break;
-				case Col2Im:
-					for (int n = n1; n < n2; n++) {
-						doCol2imOverMultipleImages(n, params);
-					}
-					break;
 				case MaxPooling_Forward:
 					for (int n = n1; n < n2; n++) {
 						for (int z = z1; z < z2; z++) {
@@ -1011,84 +943,6 @@ public class LibMatrixDNN {
 		}
 	}
 		
-	private static void doReshapeCol(int n, ConvolutionParameters params) {
-		double [] inputArray = null;
-		if (!params.input1.isInSparseFormat())
-			inputArray = params.input1.getDenseBlock();
-		double [] outputArray = null;
-		if (!params.output.isInSparseFormat())
-			outputArray = params.output.getDenseBlock();
-		
-		if(inputArray != null) {
-			for (int k = 0; k < params.K; k++)  {
-				System.arraycopy(inputArray, k*params.N*params.P*params.Q + n*params.P*params.Q, outputArray, n*params.K*params.P*params.Q + k*params.P*params.Q, params.P*params.Q);
-			}
-		}
-		else {
-			for (int k = 0; k < params.K; k++) {
-				for (int p = 0; p < params.P; p++) { 
-					for (int q = 0; q < params.Q; q++) {
-						outputArray[n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q] = params.input1.quickGetValue(k, n*params.P*params.Q + p*params.Q + q);
-					}
-				}
-			}
-		}
-	}
-	
-	// Converts a 4D tensor (N, C, R, S) to a matrix of dimension (CRS, NPQ)
-	public static void im2col(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
-		params.output = outputBlock;
-		
-		params.outputNNZ.set(0);
-		
-		if(DMLScript.STATISTICS) {
-			if(input.isInSparseFormat()) {
-				im2colSparseCount.addAndGet(1);
-			}
-			else {
-				im2colDenseCount.addAndGet(1);
-			}
-		}
-		
-		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-		if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-			warnSingleThreaded();
-			long nnz = 0;
-			for (int n = 0; n < params.N; n++) { // Do following for all images
-				for (int c = 0; c < params.C; c++) { // Since format is NCHW
-					nnz += doIm2colOverInputPath_NCHW(n, c, params);
-				}
-			}
-			outputBlock.setNonZeros(nnz);
-		}
-		else {
-			runConvTask(constrainedNumThreads, params.C, TaskType.Im2Col, params);
-			outputBlock.setNonZeros(params.outputNNZ.get());
-		}
-		
-	}
-	
-	// Converts a matrix of dimension (CRS, NPQ) to a 4D tensor (N, C, H, W)
-	public static void col2im(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
-		params.output = outputBlock;
-		
-		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-		if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-			warnSingleThreaded();
-			// Sequential col2im
-			for (int n = 0; n < params.N; n++) { // Do following for all images
-				doCol2imOverMultipleImages(n, params);
-			}
-		}
-		else {
-			// Parallel col2im
-			runConvTask(constrainedNumThreads, 1, TaskType.Col2Im, params);
-		}
-	}
-	
-	
 	// Converts input: PQ X CRS matrix and writes to 1 X CHW
 	private static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
 		if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
@@ -1169,34 +1023,6 @@ public class LibMatrixDNN {
 		}
 	}
 		
-	// NPQ X CRS
-	private static void doCol2imOverMultipleImages(int n, ConvolutionParameters params) throws DMLRuntimeException {
-		MatrixBlock input = params.input1;
-		
-		if(input.rlen != params.N*params.P*params.Q || input.clen != params.C*params.R*params.S) {
-			throw new DMLRuntimeException("Incorrect input dimensions");
-		}
-		
-		double [] outputArray = null;
-		if (!params.output.isInSparseFormat())
-			outputArray = params.output.getDenseBlock();
-		else {
-			throw new DMLRuntimeException("Only dense output is implemented");
-		}
-		
-		if(!input.isInSparseFormat()) {
-			double [] inputArray = input.getDenseBlock();
-			doCol2IMDenseInput(n, n, inputArray, outputArray, params);
-		}
-		else {
-			doCol2IMSparseInput(n, n, input.getSparseBlockIterator(n*params.P*params.Q, (n+1)*params.P*params.Q), outputArray, params);
-		}
-	}
-	
-	private static long doIm2colOverInputPath_NCHW(int n, int c, ConvolutionParameters params) throws DMLRuntimeException {
-		return doIm2colOverInputPath_NCHW(n, c, null, params);
-	}
-	
 	private static long doIm2colOverInputPath_NCHW(int n, int c, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException {
 		double [] inputArray = null;
 		if (!params.input1.isInSparseFormat())

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 9d878be..6902d40 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -100,6 +100,31 @@ public class LibMatrixMult
 	}
 	
 	/**
+	 * This method allows one to disable the examSparsity check on the output. This is useful if matrixMult is used
+	 * as an intermediate operation (for example, in LibMatrixDNN), where the output is internally
+	 * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
+	 * This should only be used in rare cases; if you are unsure,
+	 * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
+	 * 
+	 * @param m1 left input matrix
+	 * @param m2 right input matrix
+	 * @param ret result matrix block, allocated by the caller
+	 * @param examSparsity if true, re-evaluate the output representation (dense/sparse) after the multiply
+	 * @throws DMLRuntimeException
+	 */
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean examSparsity) 
+			throws DMLRuntimeException
+	{	
+		matrixMult(m1, m2, ret, 0, m1.rlen, examSparsity);
+	}
+	
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) 
+			throws DMLRuntimeException
+	{
+		matrixMult(m1, m2, ret, rl, ru, true);
+	}
+	
+	/**
 	 * 
 	 * @param m1
 	 * @param m2
@@ -108,7 +133,7 @@ public class LibMatrixMult
 	 * @param ru
 	 * @throws DMLRuntimeException
 	 */
-	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) 
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) 
 		throws DMLRuntimeException
 	{	
 		//check inputs / outputs
@@ -146,7 +171,9 @@ public class LibMatrixMult
 		//post-processing: nnz/representation
 		if( !ret.sparse )
 			ret.recomputeNonZeros();
-		ret.examSparsity();
+		
+		if(examSparsity)
+			ret.examSparsity();
 		
 		//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
 		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());

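For illustration, a minimal usage sketch of the new examSparsity overload of
LibMatrixMult.matrixMult (the wrapper class and method below are hypothetical,
not part of the commit). Passing false skips the sparse re-evaluation of the
output, which is the pattern LibMatrixDNN now uses for its dense intermediates.

    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
    import org.apache.sysml.runtime.matrix.data.MatrixBlock;

    class MatMultSketch {
        static MatrixBlock multiplyDenseIntermediate(MatrixBlock m1, MatrixBlock m2)
                throws DMLRuntimeException {
            // Allocate a dense result block of the proper shape.
            MatrixBlock ret = new MatrixBlock(m1.getNumRows(), m2.getNumColumns(), false);
            // examSparsity=false: keep the result in its dense representation.
            LibMatrixMult.matrixMult(m1, m2, ret, false);
            return ret;
        }
    }
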
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8f84bd7..1316ad8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -1224,7 +1224,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * 
 	 * @throws DMLRuntimeException
 	 */
-	private void sparseToDense() 
+	void sparseToDense() 
 		throws DMLRuntimeException 
 	{	
 		//set target representation

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index ac19816..80b20cd 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -19,20 +19,6 @@
 
 package org.apache.sysml.runtime.util;
 
-import java.util.ArrayList;
-
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.AggBinaryOp;
-import org.apache.sysml.hops.ConvolutionOp;
-import org.apache.sysml.hops.Hop;
-import org.apache.sysml.hops.HopsException;
-import org.apache.sysml.hops.ReorgOp;
-import org.apache.sysml.hops.Hop.ConvOp;
-import org.apache.sysml.hops.Hop.ReOrgOp;
-import org.apache.sysml.lops.Lop;
-import org.apache.sysml.lops.LopsException;
-import org.apache.sysml.lops.LopProperties.ExecType;
-
 
 public class ConvolutionUtils {
 	
@@ -54,191 +40,4 @@ public class ConvolutionUtils {
 		return ret;
 	}
 	
-	private static boolean isMatMult(Hop hop) {
-		if(hop != null && hop instanceof AggBinaryOp) {
-			return true;
-		}
-		return false;
-	}
-	private static boolean isTranspose(Hop hop) {
-		if(hop != null && hop instanceof ReorgOp && ((ReorgOp)hop).getOp() == ReOrgOp.TRANSPOSE) {
-			return true;
-		}
-		return false;
-	}
-	private static boolean isConvolutionOp(Hop hop, Hop.ConvOp op) {
-		if(hop != null && hop instanceof ConvolutionOp && ((ConvolutionOp) hop).getOp() == op) {
-			return true;
-		}
-		return false;
-	}
-	
-	// Simple heuristic that prefers im2col for non-test/non-validation cases.
-	private static boolean preferIm2Col(ExecType et, long N, long K, long C, long R, long S, long P, long Q) throws HopsException {
-		if(et == ExecType.CP && ConvolutionOp.FORCE_NON_IM2COL) {
-			return false;
-		}
-//		else if(et == ExecType.CP && N < 256 ) {
-//			return true; // Prefer im2col to non-test/non-validation
-//		}
-		return false;
-	}
-	
-	public static Lop constructConvolutionBackwardFilterLops(Hop currentHop) throws HopsException, LopsException {
-		ExecType et = ExecType.CP; // TODO: Check memory estimates
-		if(DMLScript.USE_ACCELERATOR)
-			et = ExecType.GPU; // TODO: Add memory estimate checks
-		else if(et == ExecType.MR || et == ExecType.SPARK)
-			return null;
-		
-		if(currentHop != null && isTranspose(currentHop)) {
-			Hop matMult = currentHop.getInput().get(0);
-			if(matMult != null && isMatMult(matMult)) {
-				Hop x_col = matMult.getInput().get(0);
-				Hop right = matMult.getInput().get(1);
-				if(isConvolutionOp(x_col, ConvOp.IM2COL) && isConvolutionOp(right, ConvOp.ROTATE180)) {
-					Hop image = x_col.getInput().get(0);
-					Hop dout = right.getInput().get(0);
-					ArrayList<Hop> inputs = new ArrayList<Hop>();
-					inputs.add(image);
-					inputs.add(dout);
-					for(int i = 1; i < x_col.getInput().size(); i++) {
-						inputs.add(x_col.getInput().get(i));
-					}
-					
-					// K, C * R * S
-					long N = currentHop.computeSizeInformation(inputs.get(6));
-					long C = currentHop.computeSizeInformation(inputs.get(7));
-					long H = currentHop.computeSizeInformation(inputs.get(8));
-					long W = currentHop.computeSizeInformation(inputs.get(9));
-					long K = currentHop.computeSizeInformation(inputs.get(10));
-					long R = currentHop.computeSizeInformation(inputs.get(12));
-					long S = currentHop.computeSizeInformation(inputs.get(13));
-					long stride_h = currentHop.computeSizeInformation(inputs.get(2));
-					long stride_w = currentHop.computeSizeInformation(inputs.get(3));
-					long pad_h = currentHop.computeSizeInformation(inputs.get(4));
-					long pad_w = currentHop.computeSizeInformation(inputs.get(5));
-					long P = -1; long Q = -1;
-					if(H > 0 && R > 0 && stride_h > 0 && pad_h > 0)
-						P = ConvolutionUtils.getP(H, R, stride_h, pad_h);
-					if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0)
-						Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w);
-					
-					if(preferIm2Col(et, N, K, C, R, S, P, Q)) {
-						return null;
-					}
-					
-					long rlen = K;
-					long clen = ConvolutionOp.getExtractedVal(C, R, S);
-					return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, (ConvolutionOp) x_col, rlen, clen);
-				}
-			}
-		}
-		return null;
-	}
-	
-	public static Lop constructConvolutionLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
-		if(DMLScript.USE_ACCELERATOR)
-			et = ExecType.GPU; // TODO: Add memory estimate checks
-		else if(et == ExecType.MR || et == ExecType.SPARK)
-			return null;
-		
-		if(currentHop != null && isConvolutionOp(currentHop, ConvOp.RESHAPE_COL)) {
-			Hop matMult = currentHop.getInput().get(0);
-			if(matMult != null && isMatMult(matMult)) {
-				Hop filter = matMult.getInput().get(0);
-				Hop x_col = matMult.getInput().get(1);
-				if(isConvolutionOp(x_col, ConvOp.IM2COL)) {
-					Hop image = x_col.getInput().get(0);
-					ArrayList<Hop> inputs = new ArrayList<Hop>();
-					inputs.add(image);
-					inputs.add(filter);
-					for(int i = 1; i < x_col.getInput().size(); i++) {
-						inputs.add(x_col.getInput().get(i));
-					}
-					
-					// N, K * P * Q
-					long N = currentHop.computeSizeInformation(inputs.get(6));
-					long C = currentHop.computeSizeInformation(inputs.get(7));
-					long H = currentHop.computeSizeInformation(inputs.get(8));
-					long W = currentHop.computeSizeInformation(inputs.get(9));
-					long K = currentHop.computeSizeInformation(inputs.get(10));
-					long R = currentHop.computeSizeInformation(inputs.get(12));
-					long S = currentHop.computeSizeInformation(inputs.get(13));
-					long stride_h = currentHop.computeSizeInformation(inputs.get(2));
-					long stride_w = currentHop.computeSizeInformation(inputs.get(3));
-					long pad_h = currentHop.computeSizeInformation(inputs.get(4));
-					long pad_w = currentHop.computeSizeInformation(inputs.get(5));
-					long P = -1; long Q = -1;
-					if(H > 0 && R > 0 && stride_h > 0 && pad_h > 0)
-						P = ConvolutionUtils.getP(H, R, stride_h, pad_h);
-					if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0)
-						Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w);
-					
-					if(preferIm2Col(et, N, K, C, R, S, P, Q)) {
-						return null;
-					}
-					
-					long rlen = N;
-					long clen = ConvolutionOp.getExtractedVal(K, P, Q);
-					return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D, (ConvolutionOp) x_col, rlen, clen);
-				}
-			}
-		}
-		
-		return null;
-	}
-	
-	public static Lop constructConvolutionBackwardDataLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
-		if(DMLScript.USE_ACCELERATOR)
-			et = ExecType.GPU; // TODO: Add memory estimate checks
-		else if(et == ExecType.MR || et == ExecType.SPARK)
-			return null;
-		
-		if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
-			Hop matMult = currentHop.getInput().get(0);
-			if(matMult != null && isMatMult(matMult)) {
-				Hop rotate180 = matMult.getInput().get(0);
-				Hop filter = matMult.getInput().get(1);
-				if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
-					ArrayList<Hop> inputs = new ArrayList<Hop>();
-					inputs.add(filter);
-					inputs.add(rotate180.getInput().get(0));
-					for(int i = 1; i < rotate180.getInput().size(); i++) {
-						inputs.add(rotate180.getInput().get(i));
-					}
-					
-					// N, C * H * W
-					long N = currentHop.computeSizeInformation(inputs.get(6));
-					long C = currentHop.computeSizeInformation(inputs.get(7));
-					long H = currentHop.computeSizeInformation(inputs.get(8));
-					long W = currentHop.computeSizeInformation(inputs.get(9));
-					long K = currentHop.computeSizeInformation(inputs.get(10));
-					long R = currentHop.computeSizeInformation(inputs.get(12));
-					long S = currentHop.computeSizeInformation(inputs.get(13));
-					long stride_h = currentHop.computeSizeInformation(inputs.get(2));
-					long stride_w = currentHop.computeSizeInformation(inputs.get(3));
-					long pad_h = currentHop.computeSizeInformation(inputs.get(4));
-					long pad_w = currentHop.computeSizeInformation(inputs.get(5));
-					long P = -1; long Q = -1;
-					if(H > 0 && R > 0 && stride_h > 0 && pad_h > 0)
-						P = ConvolutionUtils.getP(H, R, stride_h, pad_h);
-					if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0)
-						Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w);
-					
-					if(preferIm2Col(et, N, K, C, R, S, P, Q)) {
-						return null;
-					}
-					long rlen = N;
-					long clen = ConvolutionOp.getExtractedVal(C, H, W);
-					return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen);					
-				}
-			}
-			
-		}
-		
-		return null;
-	}
-	
-	
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
index c213b55..74d3d14 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
@@ -22,7 +22,6 @@ import java.util.HashMap;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
-import org.apache.sysml.hops.ConvolutionOp;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -50,70 +49,35 @@ public class Conv2DBackwardTest extends AutomatedTestBase
 	public void testConv2DBackwardFilterDense1() 
 	{
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DBackwardFilterDense2() 
 	{
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DBackwardFilterDense3() 
 	{
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DBackwardFilterDense4() 
 	{
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DBackwardFilterDense5() 
 	{
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
-	}
-	
-	@Test
-	public void testConv2DBackwardFilterDense6() 
-	{
-		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DBackwardFilterDense7() 
-	{
-		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DBackwardFilterDense8() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DBackwardFilterDense9() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DBackwardFilterDense10() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
-		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	/**
@@ -122,13 +86,11 @@ public class Conv2DBackwardTest extends AutomatedTestBase
 	 * @param sparse
 	 */
 	public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
-			int filterSize, int stride, int pad, boolean forceNonIm2Col) 
+			int filterSize, int stride, int pad) 
 	{
 		RUNTIME_PLATFORM oldRTP = rtplatform;
 			
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
-		boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL;
-		ConvolutionOp.FORCE_NON_IM2COL = forceNonIm2Col;
 		try
 		{
 		    TestConfiguration config = getTestConfiguration(TEST_NAME);
@@ -176,7 +138,6 @@ public class Conv2DBackwardTest extends AutomatedTestBase
 		{
 			rtplatform = oldRTP;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
-			ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col;
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e9aa5841/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
index 8b87372..e247d08 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -22,7 +22,6 @@ import java.util.HashMap;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
-import org.apache.sysml.hops.ConvolutionOp;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysml.test.integration.AutomatedTestBase;
@@ -48,88 +47,49 @@ public class Conv2DTest extends AutomatedTestBase
 	public void testConv2DDense1() 
 	{
 		int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DDense2() 
 	{
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DDense3() 
 	{
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DDense4() 
 	{
 		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
 	@Test
 	public void testConv2DDense5() 
 	{
 		int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false);
+		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
 	}
 	
-	@Test
-	public void testConv2DDense6() 
-	{
-		int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DDense7() 
-	{
-		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DDense8() 
-	{
-		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DDense9() 
-	{
-		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	@Test
-	public void testConv2DDense10() 
-	{
-		int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true);
-	}
-	
-	
 	/**
 	 * 
 	 * @param et
 	 * @param sparse
 	 */
 	public void runConv2DTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
-			int filterSize, int stride, int pad, boolean FORCE_NON_IM2COL) 
+			int filterSize, int stride, int pad) 
 	{
 		RUNTIME_PLATFORM oldRTP = rtplatform;
 			
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
 		
-		boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL; 
-		ConvolutionOp.FORCE_NON_IM2COL = FORCE_NON_IM2COL;
-		
 		try
 		{
 		    TestConfiguration config = getTestConfiguration(TEST_NAME);
@@ -175,7 +135,6 @@ public class Conv2DTest extends AutomatedTestBase
 		{
 			rtplatform = oldRTP;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
-			ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col;
 		}
 	}
 }