Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/28 20:48:30 UTC

systemml git commit: [SYSTEMML-540] Avoid redundant computation of cudnnPoolingForward in max_pool_backward

Repository: systemml
Updated Branches:
  refs/heads/master 118e3c0f6 -> 06d5bb073


[SYSTEMML-540] Avoid redundant computation of cudnnPoolingForward in max_pool_backward

- If max_pool is invoked in the forward pass, its output can be
  reused by max_pool_backward rather than calling cudnnPoolingForward
  again. For a sentence CNN with 2 epochs, this reduces the time for
  max_pool_backward from 6.361 to 2.966 seconds.
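
The gist of the runtime change, as a simplified, hedged sketch (see the
LibMatrixCuDNN hunk below for the actual code): cuDNN's cudnnPoolingBackward
takes the forward-pass output y as an input, so when a matching max_pool lop
has already produced y, the backward operator can consume it directly instead
of recomputing it with cudnnPoolingForward. Statistics timers and error
handling are omitted here for brevity; variable and helper names follow the patch.

    // Illustrative sketch only, not the literal patch code.
    Pointer y;
    if (maxpoolOutput != null) {
        // forward output produced earlier in the DAG -- reuse it
        y = getDensePointerForCuDNN(gCtx, maxpoolOutput, instName);
    } else {
        // old behavior: allocate y and recompute the forward pass
        y = gCtx.allocate(N * C * P * Q * sizeOfDataType);
        cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc,
                one(), desc.xDesc, x, zero(), desc.yDesc, y);
    }
    cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc,
            one(), desc.yDesc, y, desc.dyDesc, dy,
            desc.xDesc, x, zero(), desc.dxDesc, dx);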

Closes #691.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06d5bb07
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06d5bb07
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06d5bb07

Branch: refs/heads/master
Commit: 06d5bb073792345f7c4b7ecd0fb4454a335cc421
Parents: 118e3c0
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Sat Oct 28 13:44:37 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Sat Oct 28 13:45:52 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 163 +++++++++++++------
 .../gpu/ConvolutionGPUInstruction.java          |  43 ++++-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |  51 +++---
 .../sysml/test/gpu/NeuralNetworkOpTests.java    |  82 ++++++++++
 4 files changed, 260 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 50a7ca3..16a8b63 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -47,14 +47,23 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
 	// -------------------------------------------------------------------------
 	
+	// Specifies the type of this hop
 	private Hop.ConvOp op;
-
 	private int _maxNumThreads = -1; //-1 for unlimited
 
 	private ConvolutionOp() {
 		//default constructor for clone
 	}
 
+	/**
+	 * Create a hop from the builtin expression
+	 * 
+	 * @param l name of the hop
+	 * @param dt datatype (only supports matrix datatype)
+	 * @param vt valuetype  (only supports matrix valuetype) 
+	 * @param o type of this hop
+	 * @param inp input hops
+	 */
 	public ConvolutionOp(String l, DataType dt, ValueType vt, ConvOp o, ArrayList<Hop> inp) 
 	{
 		super(l, dt, vt);
@@ -75,8 +84,7 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		HopsException.check(_input.size() >= 1, this, "should have at least one input but has %d inputs", _input.size());
 	}
 
-	public ConvOp getOp()
-	{
+	public ConvOp getOp() {
 		return op;
 	}
 	
@@ -163,77 +171,129 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return input instanceof ConvolutionOp && ((ConvolutionOp) input).getOp() == ConvOp.DIRECT_CONV2D;
 	}
 	
+	/**
+	 * Compares the input parameters for max_pool/max_pool_backward operations
+	 * 
+	 * @return true if the following parameters match: stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2]
+	 */
+	private static boolean isPoolingParametersEqualAndKnown(ConvolutionParameters param1, ConvolutionParameters param2) {
+		return isEqualAndKnown(param1.stride_h, param2.stride_h) && isEqualAndKnown(param1.stride_w, param2.stride_w) && 
+			isEqualAndKnown(param1.pad_h, param2.pad_h) && isEqualAndKnown(param1.pad_w, param2.pad_w) &&
+			isEqualAndKnown(param1.R, param2.R) && isEqualAndKnown(param1.S, param2.S) &&
+			isEqualAndKnown(param1.N, param2.N) && isEqualAndKnown(param1.C, param2.C) &&
+			isEqualAndKnown(param1.H, param2.H) && isEqualAndKnown(param1.W, param2.W);
+	}
+	
+	private static boolean isEqualAndKnown(int val1, int val2) {
+		return val1 >= 0 && val2 >= 0 && val1 == val2;
+	}
+	
+	/**
+	 * Returns the output lop of a max_pool operation with the same parameters as this hop.
+	 * If no such lop is found, or if this is not a max_pool_backward operation, this method returns null.
+	 * 
+	 * @return output lop of a max_pool operation with the same parameters as this hop, else null
+	 * @throws HopsException if error 
+	 * @throws LopsException if error
+	 */
+	private Lop getMaxPoolOutputLop() throws HopsException, LopsException {
+		if(op != ConvOp.MAX_POOLING_BACKWARD)
+			return null;
+		
+		Hop inputImage = getInput().get(0);
+		for(Hop tmpParent : inputImage.getParent()) {
+			if(!(tmpParent instanceof ConvolutionOp))
+				continue;
+			ConvolutionOp parent = (ConvolutionOp) tmpParent;
+			if(parent.getOp() == ConvOp.MAX_POOLING && isPoolingParametersEqualAndKnown(parent._cachedParams, _cachedParams)) {
+				return parent.constructLops();
+			}
+		}
+		return null;
+	}
+	
 	public Lop constructConvolutionLops(ExecType et, ArrayList<Hop> inputs) throws HopsException, LopsException {
 		if(inputs.size() != getNumExpectedInputs()) 
 			throw new HopsException("Incorrect number of inputs for " + op.name());
 		
-		Lop in = null; Lop in2 = null;
-		ArrayList<Hop> inputs1 = inputs;
-		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+		// ---------------------------------------------------------------
+		// Deal with fused operators and construct lhsInputLop/optionalRhsInputLop
+		Lop lhsInputLop = null; Lop optionalRhsInputLop = null;
+		ArrayList<Hop> inputsOfPotentiallyFusedOp = inputs;
 		OperationTypes lopOp = HopsConv2Lops.get(op);
-
+		
 		// RELU_MAX_POOLING and RELU_MAX_POOLING_BACKWARD is extremely useful for CP backend 
 		// by reducing unnecessary sparse-to-dense-to-sparse conversion.
 		// For other backends, this operators is not necessary as it reduces an additional relu operator.
 		if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			lopOp = OperationTypes.RELU_MAX_POOLING;
 		}
 		else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING_BACKWARD && isInputReLU(inputs.get(0))) {
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
 		}
 		else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && op == ConvOp.BIAS_ADD && isInputConv2d(inputs.get(0))) {
 			lopOp = OperationTypes.DIRECT_CONV2D_BIAS_ADD;
 			
 			// the first lop is image 
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			// the second lop is bias
-			in2 = inputs.get(1).constructLops();
+			optionalRhsInputLop = inputs.get(1).constructLops();
 			
 			// Use the inputs from conv2d rather than bias_add
-			inputs1 = inputs.get(0).getInput();
+			inputsOfPotentiallyFusedOp = inputs.get(0).getInput();
 		}
 		else {
-			in = inputs.get(0).constructLops();
+			lhsInputLop = inputs.get(0).constructLops();
 		}
+		// ---------------------------------------------------------------
 		
-//		// TODO: Inserting reblock requires knowing columns apriori
-//		ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
-//		setReblockedOutputDimension(et, transform1);
-		double cpIntermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
+		// ---------------------------------------------------------------
+		// Compute intermediate memory budget that can be passed to GPU operators 
+		// for better CuDNN operator selection at runtime
+		double intermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
 		if(et == ExecType.GPU && _dim1 > 0 && _dim2 > 0) {
 			// This enables us to compile more efficient matrix-matrix CuDNN operation instead of 
 			// row-by-row invocation of multiple vector-matrix CuDNN operations.
 			// This is possible as the operations on GPU are single-threaded
 			double optimisticIntermediateMemEstimate = GPUContextPool.initialGPUMemBudget() - getOutputMemEstimate() - inputs.get(0).getOutputMemEstimate();
-			if(in2 != null) {
+			if(optionalRhsInputLop != null) {
 				optimisticIntermediateMemEstimate -= inputs.get(1).getOutputMemEstimate();
 			}
-			cpIntermediateMemEstimate = Math.max(cpIntermediateMemEstimate, optimisticIntermediateMemEstimate);
+			intermediateMemEstimate = Math.max(intermediateMemEstimate, optimisticIntermediateMemEstimate);
 		}
-		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, cpIntermediateMemEstimate);
-		setOutputDimensions(transform1);
+		// ---------------------------------------------------------------
 		
-		setLineNumbers(transform1);
-		in.addOutput(transform1);
+		// Construct the lop
+		ConvolutionTransform convolutionLop = new ConvolutionTransform(lhsInputLop, lopOp, 
+				getDataType(), getValueType(), et, OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), intermediateMemEstimate);
 		
-		if(in2 != null) {
-			transform1.addInput(in2);
-			in2.addOutput(transform1);
-		}
+		// Propagate the output dimensions and the line number of ConvolutionOp to ConvolutionTransform
+		setOutputDimensions(convolutionLop);
+		setLineNumbers(convolutionLop);
 		
-		// stride1, stride2, padding1, padding2  
-		// input_shape1, input_shape2, input_shape3, input_shape4, 
-		// filter_shape1, filter_shape2, filter_shape3, filter_shape4
-		for( int i=1; i < inputs1.size(); i++ )
-		{
-			Lop ltmp = inputs1.get(i).constructLops();
-			transform1.addInput(ltmp);
-			ltmp.addOutput(transform1);
+		// ---------------------------------------------------------------
+		// Add input/output for parent lops of convolutionLop
+		lhsInputLop.addOutput(convolutionLop);
+		if(optionalRhsInputLop != null) {
+			convolutionLop.addInput(optionalRhsInputLop);
+			optionalRhsInputLop.addOutput(convolutionLop);
+		}
+		for( int i=1; i < inputsOfPotentiallyFusedOp.size(); i++ ) {
+			Lop ltmp = inputsOfPotentiallyFusedOp.get(i).constructLops();
+			convolutionLop.addInput(ltmp);
+			ltmp.addOutput(convolutionLop);
 		}
-		transform1.setLevel(); //force order of added lops
-		return transform1;
+		// Only valid for MAX_POOLING_BACKWARD on GPU
+		Lop optionalMaxPoolOutput = (et == ExecType.GPU) ? getMaxPoolOutputLop() : null; 
+		if(optionalMaxPoolOutput != null) {
+			convolutionLop.addInput(optionalMaxPoolOutput);
+			optionalMaxPoolOutput.addOutput(convolutionLop);
+		}
+		convolutionLop.setLevel(); //force order of added lops
+		// ---------------------------------------------------------------
+		return convolutionLop;
 	}
 
 			
@@ -453,12 +513,10 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;
 		
-		if( _etypeForced != null ) 			
-		{
+		if( _etypeForced != null ) {
 			_etype = _etypeForced;
 		}
-		else 
-		{	
+		else {	
 			if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
 				_etype = findExecTypeByMemEstimate();
 			}
@@ -479,8 +537,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return _etype;
 	}
 	
-	// Caching parameters speed-ups dynamic recompilation time by avoiding unnecessary computeSizeInformation
+	// Parameters recomputed in refreshSizeInformation and passed across many calls of getDim
 	private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
+	
 	// stride1, stride2, padding1, padding2  
 	// input_shape1, input_shape2, input_shape3, input_shape4, 
 	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
@@ -494,16 +553,16 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			imageHeightHop = getInput().get(8);
 			filterHeightHop = getInput().get(12);
 			_cachedParams.setIfUnknown(
-					getInput().get(6),
-					getInput().get(7), 
-					imageHeightHop, 
-					getInput().get(9), 
-					getInput().get(10), 
-					filterHeightHop, 
-					getInput().get(13), 
-					getInput().get(2), 
-					getInput().get(3), 
-					getInput().get(4), 
+					getInput().get(6),  // N
+					getInput().get(7),  // C
+					imageHeightHop,     // H
+					getInput().get(9),  // W
+					getInput().get(10), // K
+					filterHeightHop,    // R
+					getInput().get(13), // S
+					getInput().get(2),  // stride_h
+					getInput().get(3),  // stride_w
+					getInput().get(4),  // pad_h
 					getInput().get(5), _maxNumThreads);
 		}
 		else {

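A note on the rewrite guarded above: hop sizes in SystemML use -1 to mean
"unknown", so getMaxPoolOutputLop only reuses a forward max_pool lop when
every pooling parameter is both known and identical on the forward and
backward hops. A minimal, self-contained sketch of the guard's semantics
(mirrors isEqualAndKnown from the hunk above; the demo class is hypothetical):

    public class IsEqualAndKnownDemo {
        // -1 encodes an unknown size in SystemML's hop metadata
        static boolean isEqualAndKnown(int val1, int val2) {
            return val1 >= 0 && val2 >= 0 && val1 == val2;
        }
        public static void main(String[] args) {
            System.out.println(isEqualAndKnown(2, 2));   // true: known and equal
            System.out.println(isEqualAndKnown(-1, -1)); // false: unknown never matches
            System.out.println(isEqualAndKnown(2, 3));   // false: mismatched configs
        }
    }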
http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 354ea63..8565b5a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -92,8 +92,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		
 		if( ( opcode.equalsIgnoreCase("conv2d")
 			 || opcode.equalsIgnoreCase("conv2d_backward_filter")
-			 || opcode.equalsIgnoreCase("conv2d_backward_data")
-			 || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
+			 || opcode.equalsIgnoreCase("conv2d_backward_data")) ) {
 			InstructionUtils.checkNumFields(parts, 16);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
@@ -119,6 +118,39 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
 					padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
+		else if( opcode.equalsIgnoreCase("maxpooling_backward") ) {
+			boolean withMaxPoolOut = false;
+			if(parts.length == 18) {
+				withMaxPoolOut = true;
+			}
+			else
+				InstructionUtils.checkNumFields(parts, 16);
+			CPOperand in1 = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = withMaxPoolOut ? new CPOperand(parts[15]) : null;
+			CPOperand out = withMaxPoolOut ? new CPOperand(parts[16]) : new CPOperand(parts[15]);
+			double memBudget = withMaxPoolOut ? Double.parseDouble(parts[17]) : Double.parseDouble(parts[16]);
+		
+			ArrayList<CPOperand> stride = new ArrayList<>();
+			ArrayList<CPOperand> padding = new ArrayList<>();
+			ArrayList<CPOperand> input_shape = new ArrayList<>();
+			ArrayList<CPOperand> filter_shape = new ArrayList<>();
+			stride.add(new CPOperand(parts[3]));
+			stride.add(new CPOperand(parts[4]));
+			padding.add(new CPOperand(parts[5]));
+			padding.add(new CPOperand(parts[6]));
+			input_shape.add(new CPOperand(parts[7]));
+			input_shape.add(new CPOperand(parts[8]));
+			input_shape.add(new CPOperand(parts[9]));
+			input_shape.add(new CPOperand(parts[10]));
+			filter_shape.add(new CPOperand(parts[11]));
+			filter_shape.add(new CPOperand(parts[12]));
+			filter_shape.add(new CPOperand(parts[13]));
+			filter_shape.add(new CPOperand(parts[14]));
+
+			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
+					padding, input_shape, filter_shape, memBudget);
+		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
 			InstructionUtils.checkNumFields(parts, 17);
 			CPOperand in1 = new CPOperand(parts[1]);
@@ -324,7 +356,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
-			
+			MatrixObject maxPoolOutput = _input3 != null ? getMatrixInputForGPUInstruction(ec, _input3.getName()) : null;
 			if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q) 
 				throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward");
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -333,7 +365,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else {
@@ -346,7 +378,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		if ( !instOpcode.equalsIgnoreCase("maxpooling") )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
-		if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
+		if (instOpcode.equalsIgnoreCase("conv2d_bias_add") || 
+			(instOpcode.equalsIgnoreCase("maxpooling_backward") && _input3 != null))
 			ec.releaseMatrixInputForGPUInstruction(_input3.getName());
 
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());

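For reference, the operand layout implied by the parser above for the GPU
maxpooling_backward instruction (field names here are descriptive, not
official SystemML terminology):

    // 16 fields, without the reused forward output:
    //   parts[1]=image  parts[2]=dout  parts[3..4]=stride  parts[5..6]=padding
    //   parts[7..10]=input_shape  parts[11..14]=filter_shape
    //   parts[15]=out  parts[16]=memBudget
    // 18 fields, with the reused forward output:
    //   same through parts[14], then
    //   parts[15]=maxpoolOutput  parts[16]=out  parts[17]=memBudget
    boolean withMaxPoolOut = (parts.length == 18); // extra input shifts out/memBudget right by one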
http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 7fd766c..e0a6a57 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -519,6 +519,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image image as matrix object
 	 * @param dout			delta matrix, output of previous layer
+	 * @param maxpoolOutput (optional and can be null) output of maxpool forward function
 	 * @param outputBlock output matrix
 	 * @param N				batch size
 	 * @param C				number of channels
@@ -537,12 +538,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
+		final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
+		
 		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -551,19 +554,26 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			if(overhead <= intermediateMemoryBudget) {
 				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
 				Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
-				cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
+				cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 			}
 			else {
 				LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 				LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
+				LibMatrixCuDNNInputRowFetcher maxPoolOutFetcher = isMaxPoolOutputProvided ? new LibMatrixCuDNNInputRowFetcher(gCtx, instName, maxpoolOutput) : null;
 				for(int n = 0; n < N; n++) {
-					cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
+					Pointer x = imgFetcher.getNthRow(n);
+					Pointer dy = doutFetcher.getNthRow(n);
+					Pointer y = isMaxPoolOutputProvided ? maxPoolOutFetcher.getNthRow(n) : null;
+					cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, 
 							dx.withByteOffset(n*CHW*sizeOfDataType), 
 							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				// Deallocate temporary array to hold one element of input
 				imgFetcher.close();
 				doutFetcher.close();
+				if(isMaxPoolOutputProvided)
+					maxPoolOutFetcher.close();
 			}
 		}
 		else {
@@ -572,36 +582,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	}
 	
 	private static void cudnnMaxpoolingBackward(GPUContext gCtx, String instName, 
-			Pointer x, Pointer dy, Pointer dx, 
+			Pointer x, Pointer dy, Pointer y, Pointer dx, 
 			int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
 		}
-		Pointer y = null;
+		
+		boolean isMaxPoolOutputProvided = (y != null);
 
 		try(LibMatrixCuDNNPoolingDescriptors desc = 
 				LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingBackwardDescriptors(gCtx, instName, N, C, H, W, K, R, S, 
 						pad_h, pad_w, stride_h, stride_w, P, Q)) {
 			long t1=0, t2=0, t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			
-			// Calling PoolForward first, y is one of the inputs for poolBackward
-			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-			long numBytes = N*C*P*Q*sizeOfDataType;
-			y = gCtx.allocate(numBytes);
-			
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-			
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			int status;
+			if(!isMaxPoolOutputProvided) {
+				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+				long numBytes = N*C*P*Q*sizeOfDataType;
+				y = gCtx.allocate(numBytes);
+				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+				if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+				status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
+				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+				if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+					throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+				}
 			}
-
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 			status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
@@ -615,7 +622,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		finally {
 			long t4=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
-			if(y != null)
+			if(!isMaxPoolOutputProvided)
 				gCtx.cudaFreeHelper(instName, y);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
 		}

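The test below derives the pooled output dimensions P and Q via
ConvolutionUtils.getP/getQ. A hedged sketch of the standard pooling
output-size formula these helpers are assumed to implement (outDim is a
hypothetical name for illustration):

    // out = floor((in + 2*pad - pool) / stride) + 1
    static long outDim(long in, long pool, long stride, long pad) {
        return (in + 2 * pad - pool) / stride + 1;
    }
    // e.g. H=28, R=2, strideH=2, padH=0  ->  P = (28 + 0 - 2)/2 + 1 = 14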
http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
index aba0cae..c57e997 100644
--- a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -579,5 +579,87 @@ public class NeuralNetworkOpTests extends GPUTests {
 			}
 		}
 	}
+	
+	
+	@Test
+	@Ignore
+	public void testMaxPoolBackwardWithMaxpoolOut() {
+		String scriptStr = "tmp = max_pool(image, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S]); print(sum(tmp)); O = max_pool_backward(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					long W = H;
+					for (long R : Rlst) {
+						long S = R;
+						for (long strideH : strideLst) {
+							long strideW = strideH;
+							for (long padH : padLst) {
+								long padW = padH;
+								for (double sparsity : sparsitylst) {
+
+									// pool is smaller than image + padding
+									if (R > (H + padH) || S > (W + padW))
+										continue;
+
+									// Make sure ops fit in GPU memory and within constraints of cudnn
+									long imageSize = N * C * H * W * 8l;
+									if (imageSize > MAX_OP_SIZE)  // image size
+										continue;
+									long poolSize = R * S * 8l;
+									if (poolSize > MAX_OP_SIZE)  // filter size
+										continue;
+
+									int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+									int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+									long doutSize = N * C * P * Q * 8l;
+									if (doutSize > MAX_OP_SIZE) // dout/output size
+										continue;
+
+									double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+									double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+									double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+									System.out
+									.format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1fMB), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+											N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+											P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+									Matrix image = generateInputMatrix(spark, (int) N,
+											(int) (C * H * W), -127.0, 127, sparsity, seed, true);
+									Matrix dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+											-127.0, 127, sparsity, seed, true);
+									HashMap<String, Object> inputs = new HashMap<>();
+									inputs.put("N", N);
+									inputs.put("C", C);
+									inputs.put("H", H);
+									inputs.put("W", W);
+									inputs.put("R", R);
+									inputs.put("S", S);
+									inputs.put("strideH", strideH);
+									inputs.put("strideW", strideW);
+									inputs.put("padH", padH);
+									inputs.put("padW", padW);
+									inputs.put("image", image);
+									inputs.put("dout", dout);
+									List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+											Arrays.asList("O"));
+									List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+											Arrays.asList("O"));
+									assertHeavyHitterPresent("gpu_maxpooling_backward");
+									assertEqualObjects(outCPU.get(0), outGPU.get(0));
+									clearGPUMemory();
+								}
+							}
+						}
+					}
+
+				}
+			}
+		}
+	}
 
 }