Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/07 19:50:00 UTC

[3/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 2b9335c..59ac29e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -191,7 +191,7 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 //		// TODO: Inserting reblock requires knowing columns apriori
 //		ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
 //		setReblockedOutputDimension(et, transform1);
-		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k);
+		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, -1, -1 ));
 		setOutputDimensions(transform1);
 		
 		setLineNumbers(transform1);
@@ -223,13 +223,171 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity);
 	}
 	
+	// ---------------------------------------------------------------
+	// Utility methods to guard the computation of memory estimates in the presence of unknowns
+	private static class IntermediateDimensions {
+		int dim1; int dim2; double sp;
+		public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str, double sp) {
+			dim1 = (int) h.getDim(dim1Str);
+			dim2 = (int) h.getDim(dim2Str);
+			this.sp = sp;
+		}
+		public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str) {
+			dim1 = (int) h.getDim(dim1Str);
+			dim2 = (int) h.getDim(dim2Str);
+			sp = 1;
+		}
+		public IntermediateDimensions(ConvolutionOp h, int dim1, String dim2Str) {
+			this.dim1 = dim1;
+			dim2 = (int) h.getDim(dim2Str);
+			sp = 1;
+		}
+		
+		/**
+		 * Add two computed memory estimates
+		 * 
+		 * @param val1 memory estimate 1
+		 * @param val2 memory estimate 2
+		 * @return sum of memory estimates
+		 */
+		static double guardedAdd(double val1, double val2) {
+			if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+			double ret = val1 + val2;
+			if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+			else return ret;
+		}
+		
+		/**
+		 * Compute memory estimates for given intermediate matrices 
+		 * 
+		 * @param intermediates list of intermediates
+		 * @param numWorkers number of workers
+		 * @return memory estimate
+		 */
+		public static double addEstimateSizes(ArrayList<IntermediateDimensions> intermediates, int numWorkers) {
+			double memBudget = 0; 
+			for(int i = 0; i < intermediates.size(); i++) {
+				memBudget = guardedAdd(memBudget, OptimizerUtils.estimateSizeExactSparsity(
+						intermediates.get(i).dim1, intermediates.get(i).dim2, intermediates.get(i).sp)*numWorkers);
+			}
+			return memBudget;
+		}
+		
+		/**
+		 * Compute max of two computed memory estimates
+		 * @param val1 memory estimate 1
+		 * @param val2 memory estimate 2
+		 * @return max of memory estimates
+		 */
+		public static double guardedMax(double val1, double val2) {
+			if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+			double ret = Math.max(val1, val2);
+			if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+			else return ret;
+		}
+	}
+	
+	/**
+	 * Helper utility to compute intermediate memory estimate
+	 * 
+	 * @param gpuIntermediates intermediates for GPU
+	 * @param cpIntermediates intermediates for CP
+	 * @return memory estimate
+	 */
+	private double computeIntermediateMemEstimateHelper(
+			ArrayList<IntermediateDimensions> gpuIntermediates,
+			ArrayList<IntermediateDimensions> cpIntermediates) {
+		// Since CP operators use row-level parallelism by default
+		int numWorkers = (int) Math.min(OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), Math.max(getDim("N"), 1));
+		if(DMLScript.USE_ACCELERATOR) {
+			// Account for potential sparse-to-dense conversion
+			double gpuMemBudget = IntermediateDimensions.addEstimateSizes(gpuIntermediates, 1);
+			double cpMemoryBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+			if(cpMemoryBudget > gpuMemBudget) {
+				double oneThreadCPMemBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, 1);
+				if(oneThreadCPMemBudget <= gpuMemBudget) {
+				// Why limit CPU? In order to give more opportunity to compile GPU operators
+					cpMemoryBudget = oneThreadCPMemBudget;
+				}
+			}
+			// Finally, use the maximum of CP and GPU memory budget
+			return IntermediateDimensions.guardedMax(cpMemoryBudget, gpuMemBudget);
+		}
+		else {
+			// When the -gpu flag is not provided, the memory estimates for CP are not affected.
+			return IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+		}
+	}
+	
 	@Override
-	protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz )
+	protected double computeIntermediateMemEstimate( long ignoreDim1, long ignoreDim2, long ignoreNnz )
 	{	
-		//default: no intermediate memory requirements
-		return 0;
+		ArrayList<IntermediateDimensions> gpuIntermediates = new ArrayList<IntermediateDimensions>();
+		ArrayList<IntermediateDimensions> cpIntermediates = new ArrayList<IntermediateDimensions>();
+		if(getOp() == ConvOp.DIRECT_CONV2D) {
+			// Assumption: To compile a GPU conv2d operator, the following should fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. input in any format
+			// 3. at least one input row in dense format
+			// 4. filter in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 input row and the filter
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+			
+			// im2col operation preserves the worst-case sparsity of the input.
+			cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+			// Assumption: To compile a GPU conv2d_backward_data operator, the following should fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. dout in any format
+			// 3. at least one dout row in dense format
+			// 4. filter in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 dout row and the filter
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+			gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+			
+			// There are 2 intermediates: rotate180 and input to col2im for conv2d_backward_data
+			// rotate180 preserves the "exact" sparsity of the dout matrix
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+			// Note: worst-case sparsity for the input of col2im (of size NPQ x CRS where N is determined by degree of parallelism)
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "CRS"));
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+			// Assumption: To compile a GPU conv2d_backward_filter operator, the following should fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. dout in any format
+			// 3. at least one dout and input row in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 input + dout row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+			
+			// There are 2 intermediates: im2col and rotate180 for conv2d_backward_filter
+			// rotate180 preserves the "exact" sparsity of the dout matrix
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+			// im2col operation preserves the worst-case sparsity of the input.
+			cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+		}
+		else if(getOp() == ConvOp.MAX_POOLING) {
+			// Account for potential sparse-to-dense conversion of at least 1 input row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+		}
+		else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+			// Account for potential sparse-to-dense conversion of at least 1 input + dout row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CPQ"));
+		}
+		
+		if(gpuIntermediates.size() > 0 || cpIntermediates.size() > 0)
+			return computeIntermediateMemEstimateHelper(gpuIntermediates, cpIntermediates);
+		else
+			return 0;
 	}
 	
+	
 	@Override
 	protected long[] inferOutputCharacteristics( MemoTable memo )
 	{
@@ -243,65 +401,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			ret[2] = -1;
 			return (ret[0]>0 && ret[1]>0) ? ret : null;
 		}
-	
-		ConvolutionParameters params;
-		try {
-			params = parseInput();
-		} catch (DMLRuntimeException e) {
-			throw new RuntimeException(e);
-		}
 		
-		switch(op) 
-		{
-			case MAX_POOLING: {
-				// input
-				long N = getInput().get(0)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.C, params.P, params.Q);
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D: {
-				// input, filter
-				long N = getInput().get(0)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.K, params.P, params.Q);
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D_BACKWARD_FILTER: {
-				// input, dout	
-				ret[0] = params.K;
-				ret[1] = getExtractedVal(params.C, params.R, params.S);
-				ret[2] = -1;
-				break;
-			}
-			case MAX_POOLING_BACKWARD: {
-				// input, dout
-				ret[0] = getInput().get(0)._dim1;
-				ret[1] = getInput().get(0)._dim2;
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D_BACKWARD_DATA: {
-				// filter, dout
-				long N = getInput().get(1)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.C, params.H, params.W);
-				ret[2] = -1;
-				break;
-			}
-			default:
-				throw new RuntimeException("Unsupported op:" + op.name());
-		}
-		
-		if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) {
-			LOG.debug("Unknown dimensions for ConvolutionOp in inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] + 
-					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
-					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
-					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-					" pad=[" + params.pad_h + " " + params.pad_w + "]");
-		}
+		refreshSizeInformation();
+		ret[0] = _dim1; ret[1] = _dim2; ret[2] = _nnz;
 		
 		//safe return (create entry only if at least dims known)
 		return (ret[0]>0 && ret[1]>0) ? ret : null;
@@ -347,50 +449,44 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return _etype;
 	}
 	
+	// Caching parameters speeds up dynamic recompilation by avoiding unnecessary computeSizeInformation calls
+	private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
 	// stride1, stride2, padding1, padding2  
 	// input_shape1, input_shape2, input_shape3, input_shape4, 
 	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
 	ConvolutionParameters parseInput() throws DMLRuntimeException {
-		ConvolutionParameters params = null;
 		if(op == ConvOp.MAX_POOLING_BACKWARD 
 				|| op == ConvOp.DIRECT_CONV2D 
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
-			params = new ConvolutionParameters(
-					computeSizeInformation(getInput().get(6)),
-					computeSizeInformation(getInput().get(7)), 
-					computeSizeInformation(getInput().get(8)), 
-					computeSizeInformation(getInput().get(9)), 
-					computeSizeInformation(getInput().get(10)), 
-					computeSizeInformation(getInput().get(12)), 
-					computeSizeInformation(getInput().get(13)), 
-					computeSizeInformation(getInput().get(2)), 
-					computeSizeInformation(getInput().get(3)), 
-					computeSizeInformation(getInput().get(4)), 
-					computeSizeInformation(getInput().get(5)), _maxNumThreads);
+			_cachedParams.setIfUnknown(
+					getInput().get(6),
+					getInput().get(7), 
+					getInput().get(8), 
+					getInput().get(9), 
+					getInput().get(10), 
+					getInput().get(12), 
+					getInput().get(13), 
+					getInput().get(2), 
+					getInput().get(3), 
+					getInput().get(4), 
+					getInput().get(5), _maxNumThreads);
 		}
 		else {
-			params = new ConvolutionParameters(
-					computeSizeInformation(getInput().get(5)),
-					computeSizeInformation(getInput().get(6)), 
-					computeSizeInformation(getInput().get(7)), 
-					computeSizeInformation(getInput().get(8)), 
-					computeSizeInformation(getInput().get(9)), 
-					computeSizeInformation(getInput().get(11)), 
-					computeSizeInformation(getInput().get(12)), 
-					computeSizeInformation(getInput().get(1)), 
-					computeSizeInformation(getInput().get(2)), 
-					computeSizeInformation(getInput().get(3)), 
-					computeSizeInformation(getInput().get(4)), _maxNumThreads);
-		}
-		return params;
-	}
-
-	public static long getExtractedVal(long val1, long val2, long val3) {
-		if(val1 == -1 || val2 == -1 || val3 == -1) {
-			return -1;
+			_cachedParams.setIfUnknown(
+					getInput().get(5),
+					getInput().get(6), 
+					getInput().get(7), 
+					getInput().get(8), 
+					getInput().get(9), 
+					getInput().get(11), 
+					getInput().get(12), 
+					getInput().get(1), 
+					getInput().get(2), 
+					getInput().get(3), 
+					getInput().get(4), _maxNumThreads);
 		}
-		return val1*val2*val3;
+		return _cachedParams;
 	}
 	
 	@Override
@@ -400,72 +496,50 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			Hop input1 = getInput().get(0);
 			setDim1(input1.getDim1());
 			setDim2(input1.getDim2());
+			_nnz = -1; // cannot infer stats
 			return;
 		}
 		
-		ConvolutionParameters params;
-		try {
-			params = parseInput();
-		} catch (DMLRuntimeException e) {
-			throw new RuntimeException(e);
-		}
-		
 		switch(op) 
 		{
 			case MAX_POOLING:
 			{	
-				// input
-				long N = getInput().get(0)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.C, params.P, params.Q);
+				_dim1 = getDim("N");
+				_dim2 = getDim("CPQ");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case MAX_POOLING_BACKWARD:
 			{
-				// input, dout
-				_dim1 = getInput().get(0)._dim1;
-				_dim2 = getInput().get(0)._dim2;
+				_dim1 = getDim("N");
+				_dim2 = getDim("CHW");
 				_nnz = -1;
 				break;
 			}
 			case DIRECT_CONV2D:
 			{
-				// input, filter
-				long N = getInput().get(0)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.K, params.P, params.Q);
+				_dim1 = getDim("N");
+				_dim2 = getDim("KPQ");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case DIRECT_CONV2D_BACKWARD_DATA:
 			{
-				// filter, dout
-				long N = getInput().get(1)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.C, params.H, params.W);
+				_dim1 = getDim("N");
+				_dim2 = getDim("CHW");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case DIRECT_CONV2D_BACKWARD_FILTER:
 			{
-				// input, dout	
-				_dim1 = params.K;
-				_dim2 = getExtractedVal(params.C, params.R, params.S);
+				_dim1 = getDim("K");
+				_dim2 = getDim("CRS");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			default:
 				throw new RuntimeException("The sizes are not refreshed for " + op.name());
 		}
-		
-		if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) {
-			LOG.debug("Unknown dimensions for ConvolutionOp in refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 + 
-					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
-					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
-					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-					" pad=[" + params.pad_h + " " + params.pad_w + "]");
-		}
 	}
 	
 	@Override
@@ -511,4 +585,132 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	public int getMaxNumThreads() {
 		return _maxNumThreads;
 	}
+	
+	
+	// ------------------------------------------------------------------------------------------------------
+	// Utility methods to get the dimensions taking into account unknown dimensions
+	
+	/**
+	 * Convenience method to get the dimensions required by ConvolutionOp.
+	 * 
+	 * @param dimString can be K, CRS, N, CHW, KPQ, PQ or CPQ
+	 * @return either -1 or value associated with the dimString
+	 */
+	private long getDim(String dimString) {
+		if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
+			throw new RuntimeException("getDim method should not be invoked for bias_add and bias_multiply");
+		}
+		ConvolutionParameters params;
+		try {
+			params = parseInput();
+		} catch (DMLRuntimeException e) {
+			throw new RuntimeException(e);
+		}
+		Hop filter = null; 	// shape: K x CRS 
+		Hop input = null; 	// shape: N x CHW
+		Hop dout = null;	// shape: N x KPQ
+		Hop dout1 = null;	// shape: N x CPQ
+		
+		if(getOp() == ConvOp.DIRECT_CONV2D) {
+			input  = getInput().get(0);
+			filter = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+			filter = getInput().get(0);
+			dout  = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+			input = getInput().get(0);
+			dout  = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.MAX_POOLING) {
+			input = getInput().get(0);
+		}
+		else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+			input = getInput().get(0);
+			dout1  = getInput().get(1);
+		}
+		
+		long ret = -1;
+		if(dimString.equals("K") && filter != null) {
+			ret = getNonNegative(ret, getNonNegative(params.K, filter._dim1));
+		}
+		else if(dimString.equals("CRS") && filter != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), filter._dim2));
+		}
+		else if(dimString.equals("N") && input != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, input._dim1));
+		}
+		else if(dimString.equals("CHW") && input != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2));
+		}
+		else if(dimString.equals("N") && dout != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, dout._dim1));
+		}
+		else if(dimString.equals("KPQ") && dout != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2));
+		}
+		else if(dimString.equals("N") && dout1 != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, dout1._dim1));
+		}
+		else if(dimString.equals("CPQ") && dout1 != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2));
+		}
+		else if(dimString.equals("K")) {
+			ret = getNonNegative(ret, params.K >= 0 ? params.K : -1);
+		}
+		else if(dimString.equals("CRS")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.R, params.S));
+		}
+		else if(dimString.equals("N")) {
+			ret = getNonNegative(ret, params.N >= 0 ? params.N : -1);
+		}
+		else if(dimString.equals("CHW")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.H, params.W));
+		}
+		else if(dimString.equals("KPQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.K, params.P, params.Q));
+		}
+		else if(dimString.equals("PQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.P, params.Q));
+		}
+		else if(dimString.equals("CPQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.P, params.Q));
+		}
+		else {
+			throw new RuntimeException("Unsupported dimension:" + dimString + " for operator " + getOp().name());
+		}
+		
+		if(LOG.isDebugEnabled() && ret < 0) {
+			LOG.debug("Unknown dimension " + dimString + " for ConvolutionOp:" + op.name() + 
+					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
+					" filter_dim=[" + params.K + " " + params.C + " " + params.R + " " + params.S + "]" + 
+					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
+					" pad=[" + params.pad_h + " " + params.pad_w + "]");
+		}
+		return ret;
+	}
+	
+	private long nonNegativeMultiply(long val1, long val2, long val3) {
+		if(val1 >= 0 && val2 >= 0 && val3 >= 0) {
+			return val1 * val2 * val3;
+		}
+		else return -1;
+	}
+	private long nonNegativeMultiply(long val1, long val2) {
+		if(val1 >= 0 && val2 >= 0) {
+			return val1 * val2;
+		}
+		else return -1;
+	}
+	private long getNonNegative(long val1, long val2) {
+		if(val1 >= 0 && val2 >= 0) {
+			if(val1 == val2) return val1;
+			else throw new RuntimeException("Incorrect dimensions in Convolution Hop: " + val1 + " != " + val2);
+		}
+		else if(val1 >= 0) return val1;
+		else if(val2 >= 0) return val2;
+		else return -1;
+	}
+	// ------------------------------------------------------------------------------------------------------
 }
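
For readers skimming the diff above: the guarded utilities treat any unknown dimension (encoded as -1) or any overflowing sum as the worst-case OptimizerUtils.DEFAULT_SIZE. A minimal standalone sketch of that semantics (the DEFAULT_SIZE value below is a hypothetical stand-in, not the actual SystemML constant):

public class GuardedEstimateDemo {
	// Hypothetical stand-in for OptimizerUtils.DEFAULT_SIZE (worst-case estimate in bytes).
	static final double DEFAULT_SIZE = 8e9;

	// Mirrors IntermediateDimensions.guardedAdd: an unknown (< 0) operand or an
	// overflowing sum collapses the estimate to the worst-case default.
	static double guardedAdd(double val1, double val2) {
		if (val1 < 0 || val2 < 0) return DEFAULT_SIZE;
		double ret = val1 + val2;
		return (ret >= DEFAULT_SIZE) ? DEFAULT_SIZE : ret;
	}

	public static void main(String[] args) {
		System.out.println(guardedAdd(1e6, 2e6)); // 3000000.0 (both known)
		System.out.println(guardedAdd(-1, 2e6));  // 8.0E9 (unknown dimension)
		System.out.println(guardedAdd(7e9, 2e9)); // 8.0E9 (capped at DEFAULT_SIZE)
	}
}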

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index eeaa5f1..b454771 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -708,31 +708,8 @@ public abstract class Hop implements ParseInfo
 		_validCPSizeEstimate = (wstats!=null) ? OptimizerUtils.isValidCPMatrixSize(
 			wstats[0], wstats[1], OptimizerUtils.getSparsity(wstats[0], wstats[1], wstats[2])) : false;
 	}
-
 	
 	/**
-	 * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
-	 * applicable. 
-	 * 
-	 * @param dim1 dimension 1
-	 * @param dim2 dimension 2
-	 * @param nnz number of non-zeros
-	 * @return memory estimate
-	 */
-	protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
-
-	/**
-	 * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
-	 * applicable.
-	 * 
-	 * @param dim1 dimension 1
-	 * @param dim2 dimension 2
-	 * @param nnz number of non-zeros
-	 * @return memory estimate
-	 */
-	protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
-
-	/**
 	 * Computes the output matrix characteristics (rows, cols, nnz) based on worst-case output
 	 * and/or input estimates. Should return null if dimensions are unknown.
 	 * 
@@ -849,6 +826,21 @@ public abstract class Hop implements ParseInfo
 	
 	public abstract String getOpString();
 
+	// ========================================================================================
+	// Design doc: Memory estimation of GPU
+	// 1. Since not all operators are supported on GPU, isGPUEnabled indicates whether an operation 
+	// is enabled for GPU. This method does not take into account any memory estimates.
+	// 2. To simplify the memory estimation logic, the methods computeOutputMemEstimate and computeIntermediateMemEstimate
+	// should return the maximum of the memory required for the GPU and CP operators. 
+	// 3. Additionally, these methods are guarded so that when the -gpu flag is not provided, additional memory overheads
+	// specific to GPU (for example, sparse-to-dense conversion on the GPU) are ignored. 
+	// 4. (WIP) Every GPU operator should respect the memory returned by computeIntermediateMemEstimate (and computeOutputMemEstimate - see the next point).
+	// 5. (WIP) Every GPU operator should create its output in the same format as the corresponding CP operator. That is, computeOutputMemEstimate
+	// is consistent across both CP and GPU in terms of the worst case.
+	// 6. The drawbacks of using the maximum memory (mem = Math.max(mem_cpu, mem_gpu)) are:
+	// - a GPU operator is not selected when mem_gpu < total memory available on GPU < mem
+	// - a CP operator is not selected (i.e. a distributed operator is compiled) when mem_cpu < driver memory budget < mem
+	
 	/**
 	 * In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()), 
 	 * the exectype is determined by checking this method as well as memory budget of this Hop. 
@@ -861,6 +853,31 @@ public abstract class Hop implements ParseInfo
 	 */
 	public abstract boolean isGPUEnabled();
 	
+	/**
+	 * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
+	 * applicable. 
+	 * 
+	 * @param dim1 dimension 1
+	 * @param dim2 dimension 2
+	 * @param nnz number of non-zeros
+	 * @return memory estimate
+	 */
+	protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
+
+	/**
+	 * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
+	 * applicable.
+	 * 
+	 * @param dim1 dimension 1
+	 * @param dim2 dimension 2
+	 * @param nnz number of non-zeros
+	 * @return memory estimate
+	 */
+	protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
+	
+	// ========================================================================================
+
+	
 	protected boolean isVector() {
 		return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) );
 	}
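
To make point 6 of the design note above concrete, here is a toy sketch of how taking mem = Math.max(mem_cpu, mem_gpu) can reject a GPU operator that would have fit on its own; all budget values below are hypothetical:

public class MaxEstimateDemo {
	public static void main(String[] args) {
		double memCpu = 6e9;  // assumed CP intermediate estimate (bytes)
		double memGpu = 2e9;  // assumed GPU intermediate estimate (bytes)
		double mem = Math.max(memCpu, memGpu); // combined worst-case estimate

		double gpuBudget = 4e9; // assumed total memory available on the GPU
		// The GPU operator is rejected because mem (6e9) > gpuBudget (4e9),
		// even though the GPU-only estimate memGpu (2e9) would have fit.
		System.out.println("GPU operator selected? " + (mem <= gpuBudget)); // false
	}
}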

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 8784956..121112b 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -37,6 +37,7 @@ public class ConvolutionTransform extends Lop
 	
 	private OperationTypes operation = null;
 	private int numThreads = -1;
+	private double intermediateMemBudget = 0;
 	
 	/**
 	 * Constructor when we have one input.
@@ -47,12 +48,14 @@ public class ConvolutionTransform extends Lop
 	 * @param vt value type
 	 * @param et execution type
 	 * @param k number of threads
+	 * @param intermediateMemBudget intermediate memory budget
 	 */
-	public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) 
+	public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k, double intermediateMemBudget) 
 	{
 		super(Lop.Type.Transform, dt, vt);		
 		init(input, op, dt, vt, et);
 		numThreads = k;
+		this.intermediateMemBudget = intermediateMemBudget;
 	}
 	
 	public ConvolutionTransform(Lop input1, Lop input2, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) 
@@ -165,6 +168,9 @@ public class ConvolutionTransform extends Lop
 				sb.append( OPERAND_DELIMITOR );
 				sb.append( numThreads );
 			}
+			
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( intermediateMemBudget );
 			return sb.toString();
 		}
 		else {
@@ -210,6 +216,9 @@ public class ConvolutionTransform extends Lop
 			sb.append( OPERAND_DELIMITOR );
 			sb.append( numThreads );
 		}
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( intermediateMemBudget );
 	}
 
 }
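
The getInstructions() changes above simply append the intermediate memory budget as one more delimited operand, which the matching parseInstruction() methods in the CP/GPU instruction classes later read back as the last field (e.g. Double.parseDouble(parts[16])). A minimal sketch of that round trip, assuming the degree sign as the operand delimiter (an assumption for illustration; the real code uses Lop.OPERAND_DELIMITOR):

public class OperandAppendDemo {
	// Assumed operand delimiter standing in for Lop.OPERAND_DELIMITOR.
	static final String OPERAND_DELIMITOR = "\u00b0";

	public static void main(String[] args) {
		// Writer side (cf. ConvolutionTransform.getInstructions): append the
		// budget as the final operand of an otherwise-complete instruction string.
		double intermediateMemBudget = 1.5e9; // bytes, hypothetical value
		String inst = "CP" + OPERAND_DELIMITOR + "conv2d" + OPERAND_DELIMITOR
				+ "...existing operands..." + OPERAND_DELIMITOR + intermediateMemBudget;

		// Reader side (cf. ConvolutionCPInstruction.parseInstruction): the
		// budget is recovered from the last delimited field.
		String[] parts = inst.split(OPERAND_DELIMITOR);
		double parsed = Double.parseDouble(parts[parts.length - 1]);
		System.out.println(parsed); // 1.5E9
	}
}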

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 629b688..e91029e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -22,6 +22,10 @@ package org.apache.sysml.runtime.instructions.cp;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
@@ -41,24 +45,25 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 	private ArrayList<CPOperand> _filter_shape;
 	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
 	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-	private int _numThreads = -1;
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
-			int numThreads) throws DMLRuntimeException {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
-		if (!(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply"))) {
-			throw new DMLRuntimeException(
-					"Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found "
-							+ opcode);
+	private int _numThreads = -1;
+	private double _intermediateMemoryBudget = 0;
+	private static final Log LOG = LogFactory.getLog(ConvolutionCPInstruction.class.getName());
+	private static boolean warnedUnderUtilitization = false;
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
+		if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
+			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
 		}
 		_in2 = in2;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr,
 			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_stride = stride;
@@ -66,12 +71,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
-			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
 		_in2 = in2;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_stride = stride;
@@ -79,12 +87,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
-			String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
 		_in2 = in2;
 		_in3 = in3;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -93,6 +104,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	public static ConvolutionCPInstruction parseInstruction(String str)
@@ -101,7 +113,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 		String opcode = parts[0];
 		if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
-			InstructionUtils.checkNumFields(parts, 15);
+			InstructionUtils.checkNumFields(parts, 16);
 			// stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -127,13 +139,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[15]);
 
 			return new ConvolutionCPInstruction(in, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[16]));
 		} 
 		else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
 				|| opcode.equalsIgnoreCase("conv2d")
 				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
 				|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
-			InstructionUtils.checkNumFields(parts, 16);
+			InstructionUtils.checkNumFields(parts, 17);
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -160,10 +172,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[16]);
 
 			return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[17]));
 		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-			InstructionUtils.checkNumFields(parts, 17);
+			InstructionUtils.checkNumFields(parts, 18);
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -191,15 +203,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[17]);
 
 			return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[18]));
 		}
 		else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
-			InstructionUtils.checkNumFields(parts, 4);
+			InstructionUtils.checkNumFields(parts, 5);
 			CPOperand in = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[3]);
 			int k = Integer.parseInt(parts[4]);
-			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k);
+			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5]));
 		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str);
@@ -363,6 +375,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d")) {
+			resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / ((double)matBlock.getNumRows()*matBlock.getNumColumns()));
 			MatrixBlock filter = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
 			if(filter.isEmpty() || matBlock.isEmpty()) {
 				outputBlock = new MatrixBlock(N, K*P*Q, true);
@@ -377,6 +390,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
+			resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / ((double)matBlock.getNumRows()*matBlock.getNumColumns()));
 			MatrixBlock filter = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
 			MatrixBlock bias = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
 			if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) {
@@ -446,6 +460,27 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
 	}
 	
+	/**
+	 * Reset the number of threads to respect the intermediate CP memory budget
+	 * 
+	 * @param params convolution parameters
+	 * @param numRows number of rows of the intermediate matrix used per thread
+	 * @param numCols number of columns of the intermediate matrix used per thread
+	 * @param sparsity sparsity of intermediate matrix used per thread
+	 */
+	private void resetNumThreads(ConvolutionParameters params, int numRows, int numCols, double sparsity) {
+		if(DMLScript.USE_ACCELERATOR) {
+			double memBudget1Thread = OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity);
+			int limitedDegreeOfParallelism = (int) Math.floor(_intermediateMemoryBudget / memBudget1Thread);
+			if(params.numThreads > limitedDegreeOfParallelism) {
+				params.numThreads = limitedDegreeOfParallelism;
+				if(!warnedUnderUtilitization)
+					LOG.warn("Under-utilizing the CPU to respect the intermediate memory budget. To avoid this, please try reducing the mini-batch size or forcing GPU execution.");
+				warnedUnderUtilitization = true;
+			}
+		}
+	}
+	
 	private MatrixBlock getDenseOutputBlock(int numRows, int numCols) throws DMLRuntimeException {
 		MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false);
 		outputBlock.allocateDenseBlock();
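
The arithmetic inside resetNumThreads() caps the degree of parallelism at floor(intermediateMemoryBudget / perThreadIntermediateSize). A self-contained sketch of that capping rule, with hypothetical sizes:

public class ThreadCapDemo {
	public static void main(String[] args) {
		double intermediateMemoryBudget = 1e9; // assumed budget passed down from the hop (bytes)
		// Hypothetical per-thread im2col buffer: a dense CRS x PQ block of doubles,
		// e.g. CRS = 2304, PQ = 3136 -> about 58 MB per thread.
		double memBudget1Thread = 2304.0 * 3136.0 * 8.0;
		int requestedThreads = 32; // assumed configured degree of parallelism

		// Same capping rule as resetNumThreads(): stay within the budget.
		int limitedDegreeOfParallelism = (int) Math.floor(intermediateMemoryBudget / memBudget1Thread);
		int numThreads = Math.min(requestedThreads, limitedDegreeOfParallelism);
		System.out.println(numThreads); // 17 - capped below the requested 32
	}
}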

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 5b37576..b25f787 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.GPUStatistics;
@@ -40,9 +41,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 	private ArrayList<CPOperand> _filter_shape;
 	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
 	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr)
-			throws DMLRuntimeException {
+	private double _intermediateMemoryBudget = 0;
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
 		if (!(opcode.equals("bias_add") || opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) {
 			throw new DMLRuntimeException(
@@ -53,18 +54,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_input2 = in2;
 		_gputype = GPUINSTRUCTION_TYPE.Convolution;
 		_output = out;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
-			String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape) {
-		this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape);
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) 
+	{
+		this(in1, in2, out, opcode, istr, stride, padding,  input_shape, filter_shape, intermediateMemoryBudget);
 		_input3 = in3;
 	}
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr,
-			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape) {
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) 
+	{
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
 		_gputype = GPUINSTRUCTION_TYPE.Convolution;
 
@@ -75,6 +81,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_padding = padding;
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	public static ConvolutionGPUInstruction parseInstruction(String str)
@@ -87,7 +94,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			 || opcode.equalsIgnoreCase("conv2d_backward_filter")
 			 || opcode.equalsIgnoreCase("conv2d_backward_data")
 			 || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
-			InstructionUtils.checkNumFields(parts, 15);
+			InstructionUtils.checkNumFields(parts, 16);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[15]);
@@ -110,10 +117,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[14]));
 
 			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-			InstructionUtils.checkNumFields(parts, 16);
+			InstructionUtils.checkNumFields(parts, 17);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand in3 = new CPOperand(parts[3]);
@@ -137,10 +144,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[15]));
 
 			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[17]));
 		}
 		else if (opcode.equalsIgnoreCase("maxpooling")) {
-			InstructionUtils.checkNumFields(parts, 14);
+			InstructionUtils.checkNumFields(parts, 15);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
 		
@@ -162,14 +169,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[13]));
 
 			return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[15]));
 		}
 		else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") || opcode.equalsIgnoreCase("bias_multiply")  ) {
-			InstructionUtils.checkNumFields(parts, 3);
+			InstructionUtils.checkNumFields(parts, 4);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[3]);
-			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str);
+			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4]));
 		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);	
@@ -251,8 +258,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
 			
-			LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -266,8 +273,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
 			
-			LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
-						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -281,8 +288,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S);
 			
-			LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 			// TODO: For now always copy the device data to host
 			// ec.gpuCtx.copyDeviceToHost(outputBlock);
 		}
@@ -298,8 +305,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -311,8 +318,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
 			
 			if(instOpcode.equalsIgnoreCase("maxpooling"))
-				LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				LibMatrixCuDNN.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -326,8 +333,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else {
 			throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
@@ -345,6 +352,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
 	}
 
+
 	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) 
 		throws DMLRuntimeException 
 	{

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index af27dc6..5096566 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -24,6 +24,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
 
@@ -44,7 +45,7 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 
 		switch(opcode) {
 			case "sel+":
-				LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "exp":
 				LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "sqrt":

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index c6b82c4..197daaf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -49,6 +49,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -151,6 +152,11 @@ public class GPUContext {
 		LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
 				+ this);
 
+		if(GPUContextPool.initialGPUMemBudget() > OptimizerUtils.getLocalMemBudget()) {
+			LOG.warn("Potential under-utilization: GPU memory (" + GPUContextPool.initialGPUMemBudget() 
+					+ ") > driver memory budget (" + OptimizerUtils.getLocalMemBudget() + "). "
+					+ "Consider increasing the driver memory budget.");
+		}
 	}
 
 	private void initializeCudaLibraryHandles() throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 78b6e3b..6d06ee5 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 
 import java.io.Serializable;
 
+import org.apache.sysml.hops.Hop;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
@@ -34,7 +35,9 @@ public class ConvolutionParameters implements Serializable {
 	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
 	public int P; public int Q; public int numThreads;
 	
+	// Optional variables used by ConvolutionCPInstruction
 	public boolean enableNative = false;
+	
 	public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
 	
 	public MatrixBlock bias;
@@ -62,6 +65,28 @@ public class ConvolutionParameters implements Serializable {
 				"], pad=[" + pad_h + "," + pad_w + "])";  
 	}
 	
+	public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+			Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+		if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
+		if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
+		if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
+		if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
+		if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
+		if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
+		if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
+		if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
+		if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
+		if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
+		if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
+		if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
+			this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
+		}
+		if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
+			this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
+		}
+		this.numThreads = numThreads;
+	}
+	
 	public ConvolutionParameters(long N, long C, long H, long W,
 			long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
 		this.N = convertToInt(N);
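
The P and Q computations delegated to ConvolutionUtils.getP/getQ above presumably implement the standard convolution output-size formula P = (H + 2*pad_h - R)/stride_h + 1 (an assumption from context, not verified against the SystemML source). A minimal sketch:

public class OutputShapeDemo {
	// Standard convolution output-size formula (assumed to match
	// ConvolutionUtils.getP/getQ): P = (H + 2*pad_h - R) / stride_h + 1.
	static long getP(long H, long R, long strideH, long padH) {
		return (H + 2 * padH - R) / strideH + 1;
	}

	public static void main(String[] args) {
		// e.g. a 28x28 image, 5x5 filter, stride 1, pad 2 -> 28x28 output
		System.out.println(getP(28, 5, 1, 2)); // 28
	}
}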