Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/07 19:49:58 UTC

[1/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Repository: systemml
Updated Branches:
  refs/heads/master a0cf8e3be -> 772d9302d


http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
new file mode 100644
index 0000000..bf5f25b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -0,0 +1,1219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
+import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
+import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import jcuda.CudaException;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcudnn.cudnnActivationDescriptor;
+import jcuda.jcudnn.cudnnBatchNormMode;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnConvolutionFwdPreference;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnPoolingDescriptor;
+import jcuda.jcudnn.cudnnStatus;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
+
+/**
+ * This class contains methods that invoke CuDNN operations.
+ */
+public class LibMatrixCuDNN extends LibMatrixCUDA {
+
+	protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+	private static final Log LOG = LogFactory.getLog(LibMatrixCuDNN.class.getName());
+
+	protected static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
+		return gCtx.getCudnnHandle();
+	}
+
+	/**
+	 * Does a 2D convolution followed by a bias_add
+	 *
+	 * @param gCtx     a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image    input image matrix object
+	 * @param bias     bias matrix object
+	 * @param filter   filter matrix object
+	 * @param output   output matrix object
+	 * @param N        number of input images
+	 * @param C        number of channels
+	 * @param H        height of each image
+	 * @param W        width of each image
+	 * @param K        number of output "channels"
+	 * @param R        height of filter
+	 * @param S        width of filter
+	 * @param pad_h    padding height
+	 * @param pad_w    padding width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P        output height
+	 * @param Q        output width
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject output, int N, int C, int H, int W,
+			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, double intermediateMemoryBudget)
+					throws DMLRuntimeException {
+		conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, intermediateMemoryBudget);
+		//cudaDeviceSynchronize;
+		biasAdd(gCtx, instName, output, bias, output);
+	}
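+
+	// Illustrative usage sketch (not part of this commit; gCtx, img, b, f and out
+	// are hypothetical handles already available in this GPU context): invoking
+	// conv2dBiasAdd for a batch of 32 RGB 28x28 images with 16 5x5 filters,
+	// pad 2 and stride 1:
+	//   int P = (28 + 2*2 - 5)/1 + 1;  // = 28, output height
+	//   int Q = (28 + 2*2 - 5)/1 + 1;  // = 28, output width
+	//   LibMatrixCuDNN.conv2dBiasAdd(gCtx, "conv2d_bias_add", img, b, f, out,
+	//       32, 3, 28, 28, 16, 5, 5, 2, 2, 1, 1, P, Q, 0);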
+
+	/**
+	 * Performs a 2D convolution
+	 * 
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image input matrix object
+	 * @param filter filter matrix object
+	 * @param outputBlock output matrix object
+	 * @param N        number of input images
+	 * @param C        number of channels
+	 * @param H        height of each image
+	 * @param W        width of each image
+	 * @param K        number of output "channels"
+	 * @param R        height of filter
+	 * @param S        width of filter
+	 * @param pad_h    padding height
+	 * @param pad_w    padding width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P        output height
+	 * @param Q        output width
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
+
+		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
+		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
+
+		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+			// Filter and output are accounted as dense in the memory estimation for conv2d
+			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
+			overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+
+			Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
+			Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+
+			if(overhead <= intermediateMemoryBudget) {
+				// Perform all-input all-channel conv2d
+				Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
+				cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				InputRowFetcher imgFetcher = new InputRowFetcher(gCtx, instName, image);
+				for(int n = 0; n < N; n++) {
+					// Perform one-input all-channel conv2d
+					cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*Sizeof.DOUBLE), 
+							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				}
+				imgFetcher.close();
+			}
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, K, CRS, N, KPQ);
+		}
+	}
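+
+	// A worked example of the budget check above (numbers are illustrative): a
+	// sparse 64 x (3*224*224) image densified for cuDNN costs roughly
+	// 64*150528*8 bytes (~77 MB). If that overhead exceeds
+	// intermediateMemoryBudget, the row-wise path densifies one ~1.2 MB row per
+	// iteration via InputRowFetcher instead, trading a single large allocation
+	// for N small slices.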
+
+
+	/**
+	 * Throws a user-friendly error describing the size limitation of invoking a cuDNN kernel
+	 *
+	 * @param dim1 input number of rows
+	 * @param dim2 input number of columns
+	 * @param dim3 output number of rows
+	 * @param dim4 output number of columns
+	 * @throws DMLRuntimeException the exception with the appropriate message
+	 */
+	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4) throws DMLRuntimeException {
+		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
+				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "]. Output dimension:  [" + dim3 + "," + dim4 + "].");
+	}
+
+	/**
+	 * Throws a user-friendly error describing the size limitation of invoking a cuDNN kernel
+	 *  
+	 * @param dim1 input1 number of rows
+	 * @param dim2 input1 number of columns
+	 * @param dim3 input2 number of rows
+	 * @param dim4 input2 number of columns
+	 * @param dim5 output number of rows
+	 * @param dim6 output number of columns
+	 * @throws DMLRuntimeException the exception with the appropriate message
+	 */
+	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4, long dim5, long dim6) throws DMLRuntimeException {
+		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
+				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "], [" + dim3 + "," + dim4 + "]. Output dimension: [" + dim5 + "," + dim6 + "]");
+	}
+
+	/**
+	 * Performs 2D convolution.
+	 * Takes up an insignificant amount of intermediate space when CONVOLUTION_PREFERENCE is set to CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+	 * the remaining intermediate space is required by the filter and convolution descriptors, which are metadata structures whose size does not scale with the input.
+	 *
+	 * @param gCtx     a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image    the input matrix (or image) allocated on the GPU
+	 * @param filter   the filter allocated on the GPU
+	 * @param output   the output matrix allocated on the GPU
+	 * @param N        number of input images
+	 * @param C        number of channels
+	 * @param H        height of each image
+	 * @param W        width of each image
+	 * @param K        number of output "channels"
+	 * @param R        height of filter
+	 * @param S        width of filter
+	 * @param pad_h    padding height
+	 * @param pad_w    padding width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P        output height
+	 * @param Q        output width
+	 * @throws DMLRuntimeException if error
+	 */
+	private static void cudnnConv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
+			int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+					throws DMLRuntimeException {
+		LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor filterDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+			long t1 = 0, t2 = 0;
+			// Allocate descriptors
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
+			cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+			filterDesc = allocateFilterDescriptor(K, C, R, S);
+
+			int padding[] = {pad_h, pad_w};
+			int strides[] = {stride_h, stride_w};
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+
+			// Select the best algorithm depending on the data and supported CUDA
+
+			int algo = -1;
+			workSpace = new Pointer();
+
+			if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
+				algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
+				int[] algos = {-1};
+				// TODO: Look into FFT, Winograd, etc.
+				// Also ensure that the GPU has enough memory for the workspace allocation
+				long sizeInBytesArray[] = {0};
+				jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
+						CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
+				cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray);
+				if (sizeInBytesArray[0] != 0)
+					workSpace = gCtx.allocate(sizeInBytesArray[0]);
+				sizeInBytes = sizeInBytesArray[0];
+			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
+				throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
+			} else {
+				throw new DMLRuntimeException("Unsupported preference criteria for convolution");
+			}
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+			int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
+					srcTensorDesc, image,
+					filterDesc, filter,
+					convDesc, algo, workSpace, sizeInBytes, zero(),
+					dstTensorDesc, output);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
+			if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
+			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		} finally {
+			long t3 = 0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+			if (filterDesc != null)
+				cudnnDestroyFilterDescriptor(filterDesc);
+			if (convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+			if (workSpace != null && sizeInBytes != 0)
+				gCtx.cudaFreeHelper(instName, workSpace);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+		}
+	}
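+
+	// Sketch (hypothetical, not part of this commit): opting into the faster,
+	// workspace-backed forward algorithms amounts to flipping the preference
+	// before the call, e.g.
+	//   LibMatrixCuDNN.CONVOLUTION_PREFERENCE =
+	//       cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+	// which makes the branch above query cudnnGetConvolutionForwardAlgorithm and
+	// allocate the workspace size returned by cudnnGetConvolutionForwardWorkspaceSize.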
+
+	/**
+	 * This method computes the backpropagation errors for the filter of the convolution operation
+	 * 
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image input image
+	 * @param dout errors from next layer
+	 * @param outputBlock  output errors
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @param K number of filters
+	 * @param R filter height
+	 * @param S filter width
+	 * @param pad_h pad height
+	 * @param pad_w pad width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P output activation height
+	 * @param Q output activation width
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
+		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
+		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
+
+		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+			Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
+			if(overhead <= intermediateMemoryBudget) {
+				// Perform all-input all-channel conv2dBackwardFilter
+				Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
+				Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
+				cudnnConv2dBackwardFilter(gCtx, instName, imagePointer, doutPointer, dwPointer, 
+						N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				// Perform one-input conv2dBackwardFilter
+				Pointer tempdwPointer = gCtx.allocate(KCRS*Sizeof.DOUBLE);
+				InputRowFetcher imgFetcher = new InputRowFetcher(gCtx, instName, image);
+				InputRowFetcher doutFetcher = new InputRowFetcher(gCtx, instName, dout);
+				for(int n = 0; n < N; n++) {
+					long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+					cudaMemset(tempdwPointer, 0, KCRS*Sizeof.DOUBLE);
+					if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
+					// Perform one-input conv2dBackwardFilter
+					cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, 
+							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+					getCudaKernels(gCtx).launchKernel("inplace_add",
+							ExecutionConfig.getConfigForSimpleMatrixOperations(K, toInt(CRS)),
+							tempdwPointer, dwPointer, K, toInt(CRS));
+
+				}
+
+				// Deallocate the temporary array that held the per-image filter gradient
+				gCtx.cudaFreeHelper(tempdwPointer, true);
+				imgFetcher.close();
+				doutFetcher.close();
+			}
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, KPQ, K, CRS);
+		}
+	}
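+
+	// Note on the row-wise path above: tempdwPointer holds the K x CRS filter
+	// gradient contributed by image n, and the "inplace_add" kernel accumulates
+	// it into dwPointer, i.e. conceptually
+	//   dw += conv2dBackwardFilter(image[n], dout[n])   for n = 0..N-1
+	// which is valid because the filter gradient is a sum over the images of the batch.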
+
+	/**
+	 * This method computes the backpropagation errors for the filter of the convolution operation
+	 * 
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param imagePointer pointer to input image
+	 * @param doutPointer pointer to errors from next layer
+	 * @param dwPointer  output errors
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @param K number of filters
+	 * @param R filter height
+	 * @param S filter width
+	 * @param pad_h pad height
+	 * @param pad_w pad width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P output activation height
+	 * @param Q output activation width
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	private static void cudnnConv2dBackwardFilter(GPUContext gCtx, String instName, Pointer imagePointer, Pointer doutPointer,
+			Pointer dwPointer, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor dwDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+
+			long t1 = 0, t2 = 0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			// Allocate descriptors
+			cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(N, C, H, W);
+			cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+			dwDesc = allocateFilterDescriptor(K, C, R, S);
+
+			// Allocate data
+			int padding[] = {pad_h, pad_w};
+			int strides[] = {stride_h, stride_w};
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+			long sizeInBytesArray[] = {0};
+
+			// TODO: Select the best algorithm depending on the data and supported CUDA
+			int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+
+			workSpace = new Pointer();
+			cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
+					xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+			int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
+					doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
+
+			if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		} finally {
+			long t3=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+
+			if(workSpace != null && sizeInBytes != 0)
+				gCtx.cudaFreeHelper(instName, workSpace);
+			if(dwDesc != null)
+				cudnnDestroyFilterDescriptor(dwDesc);
+
+			if(convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+		}
+	}
+
+	/**
+	 * This method computes the backpropagation errors for the previous layer of the convolution operation
+	 * 
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param filter filter used in conv2d
+	 * @param dout errors from next layer
+	 * @param output  output errors
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @param K number of filters
+	 * @param R filter height
+	 * @param S filter width
+	 * @param pad_h pad height
+	 * @param pad_w pad width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P output activation height
+	 * @param Q output activation width
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
+			MatrixObject output, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
+		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
+		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
+
+		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
+			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
+			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
+			Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
+			Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
+			if(overhead <= intermediateMemoryBudget) {
+				// Perform all-input all-channel conv2dBackwardData
+				Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
+				cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutPointer, dstPointer, 
+						N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				InputRowFetcher doutFetcher = new InputRowFetcher(gCtx, instName, dout);
+				for(int n = 0; n < N; n++) {
+					cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutFetcher.getNthRow(n), dstPointer.withByteOffset(n*CHW*Sizeof.DOUBLE), 
+							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				}
+				doutFetcher.close();
+			}
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, KPQ, K, CRS);
+		}
+	}
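+
+	// Note on the row-wise path above: each image's dx slice (of length CHW) is
+	// computed independently from the corresponding dout row, so the loop can
+	// write directly into dstPointer at byte offset n*CHW*Sizeof.DOUBLE without
+	// any accumulation step.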
+
+	/**
+	 * This method computes the backpropagation errors for the previous layer of the convolution operation
+	 * 
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param w pointer to filter used in conv2d
+	 * @param dy pointer to errors from next layer
+	 * @param dx pointer to output errors
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @param K number of filters
+	 * @param R filter height
+	 * @param S filter width
+	 * @param pad_h pad height
+	 * @param pad_w pad width
+	 * @param stride_h stride height
+	 * @param stride_w stride width
+	 * @param P output activation height
+	 * @param Q output activation width
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	private static void cudnnConv2dBackwardData(GPUContext gCtx, String instName, Pointer w, Pointer dy,
+			Pointer dx, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor wDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+			long t1=0, t2=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			// Allocate descriptors
+			wDesc = allocateFilterDescriptor(K, C, R, S);
+			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(N, K, P, Q);
+			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(N, C, H, W);
+
+			int padding [] = { pad_h, pad_w };
+			int strides [] = { stride_h, stride_w };
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+			long sizeInBytesArray[] = { 0 };
+
+			// TODO: Select the best algorithm depending on the data and supported CUDA
+			int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+			workSpace = new Pointer();
+			cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
+					wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+			int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
+					dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
+
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		}
+		finally {
+			long t3=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+
+			if(workSpace != null && sizeInBytes != 0)
+				gCtx.cudaFreeHelper(instName, workSpace);
+			if(wDesc != null)
+				cudnnDestroyFilterDescriptor(wDesc);
+			if(convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+		}
+	}
+
+	/**
+	 * Performs maxpooling on the GPU via cudnnPoolingForward(...)
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image image as matrix object
+	 * @param outputBlock output matrix
+	 * @param N				batch size
+	 * @param C				number of channels
+	 * @param H				height of image
+	 * @param W				width of image
+	 * @param K				number of filters
+	 * @param R				height of filter
+	 * @param S				width of filter
+	 * @param pad_h			vertical padding
+	 * @param pad_w			horizontal padding
+	 * @param stride_h		vertical stride
+	 * @param stride_w		horizontal stride
+	 * @param P				(H + 2*pad_h - R)/stride_h + 1
+	 * @param Q				(W + 2*pad_w - S)/stride_w + 1
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void maxpooling(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
+		long CHW = C*H*W; long CPQ = C*P*Q;  
+		long NCHW = N*CHW; long NCPQ = N*CPQ; 
+
+		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+			// Image is accounted as dense in the memory estimation for maxpooling
+			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			if(overhead <= intermediateMemoryBudget) {
+				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
+				cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+				cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				InputRowFetcher imgFetcher = new InputRowFetcher(gCtx, instName, image);
+				cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+				for(int n = 0; n < N; n++) {
+					cudnnMaxpooling(gCtx, instName, imgFetcher.getNthRow(n), xDesc, y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				}
+				imgFetcher.close();
+			}
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
+
+	/**
+	 * Performs a slice operation: out = in[(n+1):(n+1), 1:numColumns]
+	 */
+	private static class InputRowFetcher {
+		GPUContext gCtx; String instName; int numColumns; boolean isInputInSparseFormat; 
+		Object inPointer; // can be either CSRPointer or Pointer 
+		Pointer outPointer;
+
+		/**
+		 * Initialize the input fetcher
+		 * 
+		 * @param gCtx current gpu context
+		 * @param instName name of the instruction
+		 * @param image input matrix object.
+		 * @throws DMLRuntimeException if error
+		 */
+		public InputRowFetcher(GPUContext gCtx, String instName, MatrixObject image) throws DMLRuntimeException {
+			this.gCtx = gCtx; this.instName = instName;
+			numColumns = toInt(image.getNumColumns());
+			isInputInSparseFormat = isInSparseFormat(gCtx, image);
+			inPointer = isInputInSparseFormat ? getSparsePointer(gCtx, image, instName) : getDensePointerForCuDNN(gCtx, image, instName);
+			outPointer = gCtx.allocate(numColumns*Sizeof.DOUBLE);
+		}
+		/**
+		 * Copy the nth row and return the dense pointer
+		 * @param n zero-based row index
+		 * @return dense pointer containing the nth row. This row is reused in the next iteration
+		 * @throws DMLRuntimeException if error occurs
+		 */
+		public Pointer getNthRow(int n) throws DMLRuntimeException {
+			if(isInputInSparseFormat) {
+				long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+				cudaMemset(outPointer, 0, numColumns*Sizeof.DOUBLE);
+				if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
+				sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, toInt(numColumns-1));
+			}
+			else {
+				sliceDenseDense(gCtx, instName, (Pointer)inPointer, outPointer, n, n, 0, toInt(numColumns-1), numColumns, numColumns);
+			}
+			return outPointer;
+		}
+		/**
+		 * Deallocates temporary pointer
+		 */
+		public void close() {
+			gCtx.cudaFreeHelper(outPointer, true);
+		}
+	}
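+
+	// Usage sketch for InputRowFetcher (illustrative only): stream the rows of a
+	// possibly-sparse input as dense device pointers, one row at a time:
+	//   InputRowFetcher fetcher = new InputRowFetcher(gCtx, instName, image);
+	//   for(int n = 0; n < N; n++) {
+	//       Pointer row = fetcher.getNthRow(n); // densified row n
+	//       // ... consume row here; the buffer is reused by the next call
+	//   }
+	//   fetcher.close(); // frees the single row buffer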
+
+	private static void cudnnMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
+			Pointer y, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+
+		cudnnPoolingDescriptor poolingDesc = null;
+
+		try {
+			long t1=0,t2=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			// Allocate descriptors
+			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(N, C, P, Q);
+			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		}
+		finally {
+			long t3=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+			if(poolingDesc != null)
+				cudnnDestroyPoolingDescriptor(poolingDesc);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+		}
+	}
+
+	/**
+	 * Performs maxpoolingBackward on the GPU via cudnnPoolingBackward(...).
+	 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for recording {@link Statistics}.
+	 * @param image image as matrix object
+	 * @param dout			delta matrix, output of previous layer
+	 * @param outputBlock output matrix
+	 * @param N				batch size
+	 * @param C				number of channels
+	 * @param H				height of image
+	 * @param W				width of image
+	 * @param K				number of filters
+	 * @param R				height of filter
+	 * @param S				width of filter
+	 * @param pad_h			vertical padding
+	 * @param pad_w			horizontal padding
+	 * @param stride_h		vertical stride
+	 * @param stride_w		horizontal stride
+	 * @param P				(H + 2*pad_h - R)/stride_h + 1
+	 * @param Q				(W + 2*pad_w - S)/stride_w + 1
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
+		long CHW = C*H*W; long CPQ = C*P*Q;  
+		long NCHW = N*CHW; long NCPQ = N*CPQ; 
+
+		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+			// Image and dout are accounted as dense in the memory estimation for maxpoolingBackward
+			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
+			Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			if(overhead <= intermediateMemoryBudget) {
+				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
+				Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
+				cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				InputRowFetcher imgFetcher = new InputRowFetcher(gCtx, instName, image);
+				InputRowFetcher doutFetcher = new InputRowFetcher(gCtx, instName, dout);
+				for(int n = 0; n < N; n++) {
+					cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
+							dx.withByteOffset(n*CHW*Sizeof.DOUBLE), 
+							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				}
+				// Deallocate the temporary row buffers used by the fetchers
+				imgFetcher.close();
+				doutFetcher.close();
+			}
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
+	
+	private static void cudnnMaxpoolingBackward(GPUContext gCtx, String instName, 
+			Pointer x, Pointer dy, Pointer dx, 
+			int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+		Pointer y = null;
+		cudnnPoolingDescriptor poolingDesc = null;
+
+		try {
+			long t1=0, t2=0, t3=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			// Allocate descriptors
+			cudnnTensorDescriptor xDesc = allocateTensorDescriptor(N, C, H, W);
+			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(N, C, P, Q);
+			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(N, C, H, W);
+			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(N, C, P, Q);
+
+			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
+
+			// Calling PoolForward first, y is one of the inputs for poolBackward
+			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
+			long numBytes = N*C*P*Q*Sizeof.DOUBLE;
+			y = gCtx.allocate(numBytes);
+			
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+
+			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+			status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
+
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		}
+		finally {
+			long t4=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
+
+			if(y != null)
+				gCtx.cudaFreeHelper(instName, y);
+			if(poolingDesc != null)
+				cudnnDestroyPoolingDescriptor(poolingDesc);
+
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
+		}
+	}
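+
+	// Note on the method above: because the language level does not yet hand the
+	// pooled output y to the backward call (see the TODO), every backward pass
+	// pays an extra cudnnPoolingForward plus an N*C*P*Q*8-byte temporary for y.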
+
+	private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
+		cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
+		cudnnCreateConvolutionDescriptor(convDesc);
+		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
+		return convDesc;
+	}
+
+	protected static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
+		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+		cudnnCreateFilterDescriptor(filterDesc);
+		cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, R, S);
+		return filterDesc;
+	}
+
+	/**
+	 * Allocates a pooling descriptor, used in poolingForward and poolingBackward
+	 * @param R			pooling window height
+	 * @param S			pooling window width
+	 * @param pad_h		vertical padding
+	 * @param pad_w		horizontal padding
+	 * @param stride_h	pooling vertical stride
+	 * @param stride_w	pooling horizontal stride
+	 * @return cudnn pooling descriptor
+	 */
+	private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
+		cudnnPoolingDescriptor poolingDesc = new cudnnPoolingDescriptor();
+		cudnnCreatePoolingDescriptor(poolingDesc);
+		cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, R, S, pad_h, pad_w, stride_h, stride_w);
+		return poolingDesc;
+	}
+
+	/**
+	 * Convenience method to get tensor descriptor
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @return cudnn tensor descriptor
+	 * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
+	 */
+	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
+		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(tensorDescriptor);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		return tensorDescriptor;
+	}
+
+	/**
+	 * Convenience method to get tensor descriptor from underlying GPUObject
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param mat matrix object
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @return cudnn tensor descriptor
+	 * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
+	 */
+	private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
+		if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
+			throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
+					+ " || " + mat.getNumColumns() + " != " + (C*H*W));
+		}
+		return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
+	}
+
+	/**
+	 * Performs the forward BatchNormalization layer computation for inference
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret normalized input
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, double epsilon) throws DMLRuntimeException {
+		LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumColumns());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image},  new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
+				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+				scaleTensorDesc, scalePtr, biasPtr,
+				runningMeanPtr, runningVarPtr, epsilon));
+	}
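+
+	// For reference, the inference call above computes, per channel c (a
+	// restatement of the cuDNN formula, not new code):
+	//   ret = scale * (image - runningMean) / sqrt(runningVar + epsilon) + bias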
+
+	/**
+	 * Performs the forward BatchNormalization layer computation for training
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret (output) normalized input
+	 * @param retRunningMean (output) running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param retRunningVar (output) running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @param exponentialAverageFactor factor used in the moving average computation
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
+		LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumColumns());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image},  new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		// To allow for copy-on-write
+		Pointer retRunningMeanPtr = getDensePointerForCuDNN(gCtx, retRunningMean, instName);
+		Pointer retRunningVarPtr = getDensePointerForCuDNN(gCtx, retRunningVar, instName);
+		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+
+		// ignoring resultSaveMean and resultSaveVariance as they require state management
+		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
+				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+				scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
+				retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
+	}
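+
+	// The training call above also updates the running statistics in place
+	// (hence the device-to-device copies for copy-on-write), following the cuDNN
+	// moving-average convention:
+	//   retRunningMean = (1 - exponentialAverageFactor) * runningMean
+	//                    + exponentialAverageFactor * batchMean
+	// and likewise for the running variance.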
+
+	private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
+		if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
+			throw new DMLRuntimeException("Incorrect dimensions for scale");
+		}
+		if(bias.getNumRows() != 1 || bias.getNumColumns() != C) {
+			throw new DMLRuntimeException("Incorrect dimensions for bias");
+		}
+		if(runningMean.getNumRows() != 1 || runningMean.getNumColumns() != C) {
+			throw new DMLRuntimeException("Incorrect dimensions for running mean");
+		}
+		if(runningVar.getNumRows() != 1 || runningVar.getNumColumns() != C) {
+			throw new DMLRuntimeException("Incorrect dimensions for running variance");
+		}
+	}
+
+	/**
+	 * Convenient utility for batch normalization that returns an NCHW descriptor
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param CHW channels*height*width
+	 * @param input input matrix objects
+	 * @param output output matrix objects
+	 * @return one of the allocated NCHW descriptors
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
+		cudnnTensorDescriptor ret  = null; // Return any one
+		if(CHW > ((long)Integer.MAX_VALUE)*C) {
+			throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
+		}
+		cudnnTensorDescriptor knownNCHWdescriptor = null;
+		int H = -1; int W = -1;
+		for(int i = 0; i < input.length; i++) {
+			knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
+			if(knownNCHWdescriptor != null) {
+				int [] shape = input[i].getGPUObject(gCtx).getTensorShape();
+				if(shape[0] != N || shape[1] != C) {
+					throw new DMLRuntimeException("Incorrect N and C:" + shape[0]  + " != " + N + " || " + shape[1]  + " != " +  C);
+				}
+				H = shape[2];
+				W = shape[3];
+				break;
+			}
+		}
+		if(knownNCHWdescriptor != null) {
+			// We precisely know N, C, H, W
+			for(int i = 0; i < input.length; i++) {
+				ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
+			}
+			for(int i = 0; i < output.length; i++) {
+				ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
+			}
+		}
+		else {
+			int HW = (int) (CHW / C);
+			H = HW; W = 1; // If not known
+			double potentialH = Math.sqrt(HW);
+			if(potentialH == ((int) potentialH)) {
+				H = (int) potentialH;
+				W = H;
+			}
+			// We are not sure about H and W, hence don't allocate them.
+			ret = new cudnnTensorDescriptor();
+			cudnnCreateTensorDescriptor(ret);
+			cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		}
+		return ret;
+	}
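+
+	// Worked example of the H/W inference above (illustrative): with C = 3 and
+	// CHW = 3*784, HW = 784 and sqrt(784) = 28 exactly, so the fallback
+	// descriptor uses H = W = 28; for a non-square HW such as 300 it degrades to
+	// H = 300, W = 1. Any factorization of HW gives the same result here because
+	// spatial batch normalization reduces over N, H and W per channel.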
+
+	/**
+	 * This method computes the backpropagation errors for image, scale and bias of batch normalization layer
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param dout input errors of shape C, H, W
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param ret (output) backpropagation errors for previous layer
+	 * @param retScale backpropagation error for scale
+	 * @param retBias backpropagation error for bias
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
+			MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
+			double epsilon) throws DMLRuntimeException {
+		LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumColumns());
+		long CHW = image.getNumColumns();
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image, dout},  new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer doutPtr = getDensePointerForCuDNN(gCtx, dout, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer retScalePtr = getDensePointerForCuDNN(gCtx, retScale, instName);
+		Pointer retBiasPtr = getDensePointerForCuDNN(gCtx, retBias, instName);
+
+		// ignoring resultSaveMean and resultSaveVariance as they require state management
+		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), zero(), one(), zero(),
+				nCHWDescriptor,  imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
+				scaleTensorDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer()));
+	}
+
+
+	private static void cudnnReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
+		long t0=0;
+		try {
+			LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
+			cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
+
+			Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName);
+			cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
+			cudnnCreateActivationDescriptor(activationDescriptor);
+			double dummy = -1;
+			cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
+			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+			cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
+					one(), srcTensorDesc, srcData,
+					zero(), dstTensorDesc, dstData);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		}
+		finally {
+			long t1=0;
+			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
+		}
+	}
+
+	/**
+	 * Performs the relu operation on the GPU.
+	 * @param ec currently active {@link ExecutionContext}
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
+	 * @param in input matrix
+	 * @param outputName	name of the output matrix
+	 * @throws DMLRuntimeException	if an error occurs
+	 */
+	public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
+		if (ec.getGPUContext(0) != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+		long N = in.getNumRows();
+		long CHW = in.getNumColumns();
+		MatrixObject output = ec.getMatrixObject(outputName);
+		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocate the dense output matrix
+		long t0=0;
+		cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
+		if(N*CHW >= maxNumDoublesOfCuDNNTensor || srcTensorDesc == null) {
+			LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
+			// Invokes relu(double* A,  double* ret, int rlen, int clen)
+			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+			Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
+			Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+			getCudaKernels(gCtx).launchKernel("relu",
+					ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+					srcData, dstData, toInt(N), toInt(CHW));
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
+		}
+		else {
+			cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), srcTensorDesc);
+		}
+	}
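
Both paths above compute the same element-wise function; a minimal host-side sketch of that semantics over a dense N x CHW row-major matrix (hypothetical array names, not identifiers from this file):

public class ReluSketch {
	// out[i] = max(0, in[i]), which is what both the custom "relu" kernel
	// and the cuDNN CUDNN_ACTIVATION_RELU forward pass produce element-wise
	static void relu(double[] in, double[] out, int N, int CHW) {
		for (int i = 0; i < N * CHW; i++) {
			out[i] = Math.max(0.0, in[i]);
		}
	}
}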
+
+	/**
+	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param image input matrix object
+	 * @param instName name of the invoking instruction, for statistics
+	 * @return jcuda pointer
+	 * @throws DMLRuntimeException if an error occurs during the sparse-to-dense conversion
+	 */
+	protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) throws DMLRuntimeException {
+		long numElems = image.getNumRows()*image.getNumColumns();
+		if(numElems > maxNumDoublesOfCuDNNTensor) {
+			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
+		}
+		return getDensePointer(gCtx, image, instName);
+	}
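
To make the guard above concrete, a hedged sketch of the arithmetic (hypothetical dimensions; the actual limit constant maxNumDoublesOfCuDNNTensor is defined elsewhere in SystemML, presumably near 2^31 given the 2 giga-element message):

public class TensorSizeSketch {
	public static void main(String[] args) {
		long maxElems = 2L * 1024 * 1024 * 1024;  // ~2 giga-elements, per the error message
		long rows = 100_000, cols = 25_000;       // hypothetical mini-batch and feature sizes
		long numElems = rows * cols;              // 2.5e9 elements, ~20 GB as dense doubles
		System.out.println(numElems > maxElems);  // true -> reduce the mini-batch size
	}
}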
+
+	/**
+	 * Convenience method for checking the status of CuDNN kernel.
+	 *
+	 * @param status status returned by CuDNN
+	 * @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
+	 */
+	protected static void checkStatus(int status) throws DMLRuntimeException {
+		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
+			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
+	}
+}


[4/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index f6ba15a..caa3fc7 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -252,6 +252,52 @@ BB3_4:
 	ret;
 }
 
+	// .globl	inplace_add
+.visible .entry inplace_add(
+	.param .u64 inplace_add_param_0,
+	.param .u64 inplace_add_param_1,
+	.param .u32 inplace_add_param_2,
+	.param .u32 inplace_add_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<10>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_param_0];
+	ld.param.u64 	%rd2, [inplace_add_param_1];
+	ld.param.u32 	%r4, [inplace_add_param_2];
+	ld.param.u32 	%r3, [inplace_add_param_3];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r2, %r1, %r3;
+	setp.lt.s32	%p1, %r2, %r4;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB4_2;
+	bra.uni 	BB4_1;
+
+BB4_1:
+	rem.s32 	%r8, %r1, %r3;
+	cvta.to.global.u64 	%rd3, %rd1;
+	mad.lo.s32 	%r9, %r2, %r3, %r8;
+	mul.wide.s32 	%rd4, %r9, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	ld.global.f64 	%fd1, [%rd7];
+	ld.global.f64 	%fd2, [%rd5];
+	add.f64 	%fd3, %fd2, %fd1;
+	st.global.f64 	[%rd7], %fd3;
+
+BB4_2:
+	ret;
+}
+
 	// .globl	bias_add
 .visible .entry bias_add(
 	.param .u64 bias_add_param_0,
@@ -282,10 +328,10 @@ BB3_4:
 	setp.lt.s32	%p1, %r2, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB4_2;
-	bra.uni 	BB4_1;
+	@!%p3 bra 	BB5_2;
+	bra.uni 	BB5_1;
 
-BB4_1:
+BB5_1:
 	rem.s32 	%r9, %r1, %r3;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r10, %r2, %r3, %r9;
@@ -302,7 +348,7 @@ BB4_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB4_2:
+BB5_2:
 	ret;
 }
 
@@ -341,10 +387,10 @@ BB4_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB5_4;
-	bra.uni 	BB5_1;
+	@!%p3 bra 	BB6_4;
+	bra.uni 	BB6_1;
 
-BB5_1:
+BB6_1:
 	cvta.to.global.u64 	%rd6, %rd4;
 	mad.lo.s32 	%r10, %r1, %r3, %r2;
 	cvta.to.global.u64 	%rd7, %rd3;
@@ -353,25 +399,25 @@ BB5_1:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd2, %rd6, %rd8;
 	setp.eq.s32	%p4, %r4, 1;
-	@%p4 bra 	BB5_3;
-	bra.uni 	BB5_2;
+	@%p4 bra 	BB6_3;
+	bra.uni 	BB6_2;
 
-BB5_3:
+BB6_3:
 	mul.wide.s32 	%rd12, %r2, 8;
 	add.s64 	%rd13, %rd1, %rd12;
 	ld.global.f64 	%fd5, [%rd13];
 	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd6;
-	bra.uni 	BB5_4;
+	bra.uni 	BB6_4;
 
-BB5_2:
+BB6_2:
 	mul.wide.s32 	%rd10, %r1, 8;
 	add.s64 	%rd11, %rd1, %rd10;
 	ld.global.f64 	%fd3, [%rd11];
 	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd4;
 
-BB5_4:
+BB6_4:
 	ret;
 }
 
@@ -405,10 +451,10 @@ BB5_4:
 	setp.lt.s32	%p1, %r2, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB6_2;
-	bra.uni 	BB6_1;
+	@!%p3 bra 	BB7_2;
+	bra.uni 	BB7_1;
 
-BB6_1:
+BB7_1:
 	rem.s32 	%r9, %r1, %r3;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r10, %r2, %r3, %r9;
@@ -425,7 +471,7 @@ BB6_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB6_2:
+BB7_2:
 	ret;
 }
 
@@ -467,10 +513,10 @@ BB6_2:
 	setp.lt.s32	%p1, %r8, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB7_6;
-	bra.uni 	BB7_1;
+	@!%p3 bra 	BB8_6;
+	bra.uni 	BB8_1;
 
-BB7_1:
+BB8_1:
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd4, %rd5;
@@ -480,26 +526,26 @@ BB7_1:
 	setp.lt.f64	%p4, %fd8, %fd3;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB7_5;
-	bra.uni 	BB7_2;
+	@%p4 bra 	BB8_5;
+	bra.uni 	BB8_2;
 
-BB7_5:
+BB8_5:
 	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB7_6;
+	bra.uni 	BB8_6;
 
-BB7_2:
+BB8_2:
 	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB7_4;
-	bra.uni 	BB7_3;
+	@%p5 bra 	BB8_4;
+	bra.uni 	BB8_3;
 
-BB7_4:
+BB8_4:
 	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB7_6;
+	bra.uni 	BB8_6;
 
-BB7_3:
+BB8_3:
 	st.global.f64 	[%rd1], %fd6;
 
-BB7_6:
+BB8_6:
 	ret;
 }
 
@@ -515,9 +561,9 @@ BB7_6:
 	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-	.reg .pred 	%p<77>;
-	.reg .b32 	%r<65>;
-	.reg .f64 	%fd<55>;
+	.reg .pred 	%p<73>;
+	.reg .b32 	%r<66>;
+	.reg .f64 	%fd<56>;
 	.reg .b64 	%rd<19>;
 
 
@@ -538,93 +584,93 @@ BB7_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB8_73;
-	bra.uni 	BB8_1;
+	@!%p4 bra 	BB9_77;
+	bra.uni 	BB9_1;
 
-BB8_1:
+BB9_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r63, %r1;
-	@%p5 bra 	BB8_5;
+	mov.u32 	%r64, %r1;
+	@%p5 bra 	BB9_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r64, %r3;
-	@%p6 bra 	BB8_4;
+	mov.u32 	%r65, %r3;
+	@%p6 bra 	BB9_4;
 
-	mov.u32 	%r64, %r2;
+	mov.u32 	%r65, %r2;
 
-BB8_4:
-	mov.u32 	%r58, %r64;
-	mov.u32 	%r4, %r58;
-	mov.u32 	%r63, %r4;
+BB9_4:
+	mov.u32 	%r59, %r65;
+	mov.u32 	%r4, %r59;
+	mov.u32 	%r64, %r4;
 
-BB8_5:
-	mov.u32 	%r5, %r63;
+BB9_5:
+	mov.u32 	%r5, %r64;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r61, %r1;
-	@%p7 bra 	BB8_9;
+	mov.u32 	%r62, %r1;
+	@%p7 bra 	BB9_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r62, %r3;
-	@%p8 bra 	BB8_8;
+	mov.u32 	%r63, %r3;
+	@%p8 bra 	BB9_8;
 
-	mov.u32 	%r62, %r2;
+	mov.u32 	%r63, %r2;
 
-BB8_8:
-	mov.u32 	%r61, %r62;
+BB9_8:
+	mov.u32 	%r62, %r63;
 
-BB8_9:
+BB9_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r61, 8;
+	mul.wide.s32 	%rd9, %r62, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
-	@%p9 bra 	BB8_26;
+	@%p9 bra 	BB9_26;
 
 	setp.gt.s32	%p23, %r13, 3;
-	@%p23 bra 	BB8_18;
+	@%p23 bra 	BB9_18;
 
 	setp.gt.s32	%p30, %r13, 1;
-	@%p30 bra 	BB8_15;
+	@%p30 bra 	BB9_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB8_71;
-	bra.uni 	BB8_13;
+	@%p33 bra 	BB9_75;
+	bra.uni 	BB9_13;
 
-BB8_71:
-	add.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_75:
+	add.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_26:
+BB9_26:
 	setp.gt.s32	%p10, %r13, 13;
-	@%p10 bra 	BB8_35;
+	@%p10 bra 	BB9_35;
 
 	setp.gt.s32	%p17, %r13, 10;
-	@%p17 bra 	BB8_31;
+	@%p17 bra 	BB9_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB8_53;
-	bra.uni 	BB8_29;
+	@%p21 bra 	BB9_55;
+	bra.uni 	BB9_29;
 
-BB8_53:
-	setp.eq.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB8_72;
+BB9_55:
+	setp.eq.f64	%p48, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB9_76;
 
-BB8_18:
+BB9_18:
 	setp.gt.s32	%p24, %r13, 5;
-	@%p24 bra 	BB8_22;
+	@%p24 bra 	BB9_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB8_56;
-	bra.uni 	BB8_20;
+	@%p28 bra 	BB9_58;
+	bra.uni 	BB9_20;
 
-BB8_56:
+BB9_58:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -637,7 +683,7 @@ BB8_56:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p55, %rd1, -9223372036854775808;
+	setp.eq.s64	%p53, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -654,340 +700,342 @@ BB8_56:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd53, [retval0+0];
+	ld.param.f64	%fd54, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p56, %r8, 0;
-	and.pred  	%p1, %p56, %p55;
-	@!%p1 bra 	BB8_58;
-	bra.uni 	BB8_57;
+	setp.lt.s32	%p54, %r8, 0;
+	and.pred  	%p1, %p54, %p53;
+	@!%p1 bra 	BB9_60;
+	bra.uni 	BB9_59;
 
-BB8_57:
+BB9_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd53;
+	mov.b64 	{%temp, %r33}, %fd54;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd53;
+	mov.b64 	{%r35, %temp}, %fd54;
 	}
-	mov.b64 	%fd53, {%r35, %r34};
+	mov.b64 	%fd54, {%r35, %r34};
 
-BB8_58:
-	mov.f64 	%fd52, %fd53;
-	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
-	@%p57 bra 	BB8_61;
-	bra.uni 	BB8_59;
+BB9_60:
+	mov.f64 	%fd53, %fd54;
+	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
+	@%p55 bra 	BB9_63;
+	bra.uni 	BB9_61;
 
-BB8_61:
-	selp.b32	%r36, %r8, 0, %p55;
+BB9_63:
+	selp.b32	%r36, %r8, 0, %p53;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p61, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p61;
+	setp.lt.s32	%p59, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p59;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd52, {%r39, %r38};
-	bra.uni 	BB8_62;
+	mov.b64 	%fd53, {%r39, %r38};
+	bra.uni 	BB9_64;
 
-BB8_35:
+BB9_35:
 	setp.gt.s32	%p11, %r13, 15;
-	@%p11 bra 	BB8_39;
+	@%p11 bra 	BB9_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB8_50;
-	bra.uni 	BB8_37;
+	@%p15 bra 	BB9_52;
+	bra.uni 	BB9_37;
 
-BB8_50:
+BB9_52:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p47, %r27, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-	bra.uni 	BB8_72;
+	setp.eq.s32	%p45, %r27, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB9_76;
 
-BB8_15:
+BB9_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB8_70;
-	bra.uni 	BB8_16;
+	@%p31 bra 	BB9_74;
+	bra.uni 	BB9_16;
 
-BB8_70:
-	mul.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_74:
+	mul.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_31:
+BB9_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB8_52;
+	@%p18 bra 	BB9_54;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB8_51;
-	bra.uni 	BB8_33;
+	@%p19 bra 	BB9_53;
+	bra.uni 	BB9_33;
 
-BB8_51:
-	max.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_53:
+	max.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_22:
+BB9_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB8_55;
+	@%p25 bra 	BB9_57;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB8_54;
-	bra.uni 	BB8_24;
+	@%p26 bra 	BB9_56;
+	bra.uni 	BB9_24;
 
-BB8_54:
-	setp.gt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB8_72;
+BB9_56:
+	setp.gt.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB9_76;
 
-BB8_39:
+BB9_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB8_49;
+	@%p12 bra 	BB9_51;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB8_45;
-	bra.uni 	BB8_41;
+	@%p13 bra 	BB9_46;
+	bra.uni 	BB9_41;
 
-BB8_45:
-	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
-	or.pred  	%p41, %p39, %p40;
-	mov.f64 	%fd54, 0d7FF8000000000000;
-	@%p41 bra 	BB8_72;
+BB9_46:
+	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
+	or.pred  	%p40, %p38, %p39;
+	mov.f64 	%fd55, 0d7FF8000000000000;
+	@%p40 bra 	BB9_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd39, %fd54;
-	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
-	@%p42 bra 	BB8_72;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd39, %fd55;
+	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
+	@%p41 bra 	BB9_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r22, %temp}, %fd54;
+	mov.b64 	{%temp, %r22}, %fd55;
 	}
+	and.b32  	%r23, %r22, 2147483647;
+	setp.ne.s32	%p42, %r23, 2146435072;
+	@%p42 bra 	BB9_50;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd54;
+	mov.b64 	{%r24, %temp}, %fd55;
 	}
-	and.b32  	%r24, %r23, 2147483647;
-	setp.ne.s32	%p43, %r24, 2146435072;
-	setp.ne.s32	%p44, %r22, 0;
-	or.pred  	%p45, %p43, %p44;
-	@!%p45 bra 	BB8_72;
-	bra.uni 	BB8_48;
-
-BB8_48:
-	cvt.rmi.f64.f64	%fd40, %fd54;
+	setp.eq.s32	%p43, %r24, 0;
+	@%p43 bra 	BB9_76;
+
+BB9_50:
+	cvt.rmi.f64.f64	%fd40, %fd55;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd54, %fd1, %fd41;
-	bra.uni 	BB8_72;
+	sub.f64 	%fd55, %fd1, %fd41;
+	bra.uni 	BB9_76;
 
-BB8_13:
+BB9_13:
 	setp.eq.s32	%p34, %r13, 1;
-	@%p34 bra 	BB8_14;
-	bra.uni 	BB8_72;
+	@%p34 bra 	BB9_14;
+	bra.uni 	BB9_76;
 
-BB8_14:
-	sub.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_14:
+	sub.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_29:
+BB9_29:
 	setp.eq.s32	%p22, %r13, 10;
-	@%p22 bra 	BB8_30;
-	bra.uni 	BB8_72;
+	@%p22 bra 	BB9_30;
+	bra.uni 	BB9_76;
 
-BB8_30:
-	setp.neu.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB8_72;
+BB9_30:
+	setp.neu.f64	%p47, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB9_76;
 
-BB8_20:
+BB9_20:
 	setp.eq.s32	%p29, %r13, 5;
-	@%p29 bra 	BB8_21;
-	bra.uni 	BB8_72;
+	@%p29 bra 	BB9_21;
+	bra.uni 	BB9_76;
 
-BB8_21:
-	setp.lt.f64	%p54, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-	bra.uni 	BB8_72;
+BB9_21:
+	setp.lt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB9_76;
 
-BB8_37:
+BB9_37:
 	setp.eq.s32	%p16, %r13, 15;
-	@%p16 bra 	BB8_38;
-	bra.uni 	BB8_72;
+	@%p16 bra 	BB9_38;
+	bra.uni 	BB9_76;
 
-BB8_38:
+BB9_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd54, %fd44, %fd43;
-	bra.uni 	BB8_72;
+	sub.f64 	%fd55, %fd44, %fd43;
+	bra.uni 	BB9_76;
 
-BB8_16:
+BB9_16:
 	setp.eq.s32	%p32, %r13, 3;
-	@%p32 bra 	BB8_17;
-	bra.uni 	BB8_72;
+	@%p32 bra 	BB9_17;
+	bra.uni 	BB9_76;
 
-BB8_17:
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_17:
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_52:
-	min.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB8_72;
+BB9_54:
+	min.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB9_76;
 
-BB8_33:
+BB9_33:
 	setp.eq.s32	%p20, %r13, 13;
-	@%p20 bra 	BB8_34;
-	bra.uni 	BB8_72;
+	@%p20 bra 	BB9_34;
+	bra.uni 	BB9_76;
 
-BB8_34:
+BB9_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p48, %r30, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-	bra.uni 	BB8_72;
+	setp.eq.s32	%p46, %r30, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
+	bra.uni 	BB9_76;
 
-BB8_55:
-	setp.le.f64	%p53, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-	bra.uni 	BB8_72;
+BB9_57:
+	setp.le.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
+	bra.uni 	BB9_76;
 
-BB8_24:
+BB9_24:
 	setp.eq.s32	%p27, %r13, 8;
-	@%p27 bra 	BB8_25;
-	bra.uni 	BB8_72;
+	@%p27 bra 	BB9_25;
+	bra.uni 	BB9_76;
 
-BB8_25:
-	setp.ge.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB8_72;
+BB9_25:
+	setp.ge.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
+	bra.uni 	BB9_76;
 
-BB8_49:
-	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
+BB9_51:
+	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
-	bra.uni 	BB8_72;
+	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
+	bra.uni 	BB9_76;
 
-BB8_41:
+BB9_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB8_72;
+	@%p14 bra 	BB9_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd37, %fd54;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd37, %fd55;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB8_72;
+	@%p35 bra 	BB9_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd54;
+	mov.b64 	{%temp, %r19}, %fd55;
 	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p36, %r20, 2146435072;
+	@%p36 bra 	BB9_45;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd54;
+	mov.b64 	{%r21, %temp}, %fd55;
 	}
-	and.b32  	%r21, %r20, 2147483647;
-	setp.ne.s32	%p36, %r21, 2146435072;
-	setp.ne.s32	%p37, %r19, 0;
-	or.pred  	%p38, %p36, %p37;
-	@!%p38 bra 	BB8_72;
-	bra.uni 	BB8_44;
+	setp.eq.s32	%p37, %r21, 0;
+	@%p37 bra 	BB9_76;
 
-BB8_44:
-	cvt.rmi.f64.f64	%fd54, %fd54;
-	bra.uni 	BB8_72;
+BB9_45:
+	cvt.rmi.f64.f64	%fd55, %fd55;
+	bra.uni 	BB9_76;
 
-BB8_59:
-	setp.gt.s32	%p58, %r8, -1;
-	@%p58 bra 	BB8_62;
+BB9_61:
+	setp.gt.s32	%p56, %r8, -1;
+	@%p56 bra 	BB9_64;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p59, %fd45, %fd2;
-	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
+	setp.neu.f64	%p57, %fd45, %fd2;
+	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
 
-BB8_62:
-	mov.f64 	%fd25, %fd52;
+BB9_64:
+	mov.f64 	%fd25, %fd53;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p62, %r41, 2146435072;
-	mov.f64 	%fd51, %fd25;
-	@%p62 bra 	BB8_69;
+	setp.ne.s32	%p60, %r41, 2146435072;
+	mov.f64 	%fd52, %fd25;
+	@%p60 bra 	BB9_73;
 
-	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd51, %fd26;
-	@%p63 bra 	BB8_69;
+	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
+	mov.f64 	%fd52, %fd26;
+	@%p61 bra 	BB9_73;
 
 	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
-	mov.f64 	%fd50, %fd26;
-	mov.f64 	%fd51, %fd50;
-	@%p64 bra 	BB8_69;
+	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd51, %fd26;
+	mov.f64 	%fd52, %fd51;
+	@%p62 bra 	BB9_73;
+
+	and.b32  	%r42, %r9, 2147483647;
+	setp.ne.s32	%p63, %r42, 2146435072;
+	@%p63 bra 	BB9_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r42, %temp}, %fd2;
+	mov.b64 	{%r43, %temp}, %fd2;
 	}
-	and.b32  	%r43, %r9, 2147483647;
-	setp.eq.s32	%p65, %r43, 2146435072;
-	setp.eq.s32	%p66, %r42, 0;
-	and.pred  	%p67, %p65, %p66;
-	@%p67 bra 	BB8_68;
-	bra.uni 	BB8_66;
-
-BB8_68:
-	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
-	selp.b32	%r51, 2146435072, 0, %p71;
-	xor.b32  	%r52, %r51, 2146435072;
-	setp.lt.s32	%p72, %r9, 0;
-	selp.b32	%r53, %r52, %r51, %p72;
-	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
-	selp.b32	%r54, 1072693248, %r53, %p73;
-	mov.u32 	%r55, 0;
-	mov.b64 	%fd51, {%r55, %r54};
-	bra.uni 	BB8_69;
-
-BB8_66:
+	setp.eq.s32	%p64, %r43, 0;
+	@%p64 bra 	BB9_72;
+
+BB9_69:
+	and.b32  	%r44, %r8, 2147483647;
+	setp.ne.s32	%p65, %r44, 2146435072;
+	mov.f64 	%fd49, %fd25;
+	mov.f64 	%fd52, %fd49;
+	@%p65 bra 	BB9_73;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r44, %temp}, %fd1;
+	mov.b64 	{%r45, %temp}, %fd1;
 	}
-	and.b32  	%r45, %r8, 2147483647;
-	setp.eq.s32	%p68, %r45, 2146435072;
-	setp.eq.s32	%p69, %r44, 0;
-	and.pred  	%p70, %p68, %p69;
-	mov.f64 	%fd51, %fd25;
-	@!%p70 bra 	BB8_69;
-	bra.uni 	BB8_67;
-
-BB8_67:
+	setp.ne.s32	%p66, %r45, 0;
+	mov.f64 	%fd52, %fd25;
+	@%p66 bra 	BB9_73;
+
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	selp.b32	%r48, -1048576, 2146435072, %p1;
-	add.s32 	%r49, %r48, %r47;
-	mov.u32 	%r50, 0;
-	mov.b64 	%fd51, {%r50, %r49};
-
-BB8_69:
-	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
-	or.pred  	%p76, %p75, %p74;
-	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
-
-BB8_72:
+	add.s32 	%r48, %r47, 2146435072;
+	or.b32  	%r49, %r48, -2147483648;
+	selp.b32	%r50, %r49, %r48, %p1;
+	mov.u32 	%r51, 0;
+	mov.b64 	%fd52, {%r51, %r50};
+	bra.uni 	BB9_73;
+
+BB9_72:
+	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
+	selp.b32	%r52, 2146435072, 0, %p67;
+	xor.b32  	%r53, %r52, 2146435072;
+	setp.lt.s32	%p68, %r9, 0;
+	selp.b32	%r54, %r53, %r52, %p68;
+	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
+	selp.b32	%r55, 1072693248, %r54, %p69;
+	mov.u32 	%r56, 0;
+	mov.b64 	%fd52, {%r56, %r55};
+
+BB9_73:
+	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+
+BB9_76:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd54;
+	st.global.f64 	[%rd18], %fd55;
 	bar.sync 	0;
 
-BB8_73:
+BB9_77:
 	ret;
 }
 
@@ -1001,9 +1049,9 @@ BB8_73:
 	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<141>;
-	.reg .b32 	%r<86>;
-	.reg .f64 	%fd<107>;
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<88>;
+	.reg .f64 	%fd<109>;
 	.reg .b64 	%rd<20>;
 
 
@@ -1018,7 +1066,7 @@ BB8_73:
 	mov.u32 	%r11, %tid.x;
 	mad.lo.s32 	%r1, %r9, %r10, %r11;
 	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB9_130;
+	@%p3 bra 	BB10_138;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -1027,86 +1075,86 @@ BB8_73:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB9_66;
+	@%p4 bra 	BB10_70;
 
-	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p5, %r6, 8;
-	@%p5 bra 	BB9_19;
+	@%p5 bra 	BB10_19;
 
 	setp.gt.s32	%p19, %r6, 3;
-	@%p19 bra 	BB9_11;
+	@%p19 bra 	BB10_11;
 
 	setp.gt.s32	%p26, %r6, 1;
-	@%p26 bra 	BB9_8;
+	@%p26 bra 	BB10_8;
 
 	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB9_64;
-	bra.uni 	BB9_6;
+	@%p29 bra 	BB10_68;
+	bra.uni 	BB10_6;
 
-BB9_64:
-	add.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB9_65;
+BB10_68:
+	add.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB10_69;
 
-BB9_66:
-	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p73, %r6, 8;
-	@%p73 bra 	BB9_83;
+BB10_70:
+	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p69, %r6, 8;
+	@%p69 bra 	BB10_87;
 
-	setp.gt.s32	%p87, %r6, 3;
-	@%p87 bra 	BB9_75;
+	setp.gt.s32	%p83, %r6, 3;
+	@%p83 bra 	BB10_79;
 
-	setp.gt.s32	%p94, %r6, 1;
-	@%p94 bra 	BB9_72;
+	setp.gt.s32	%p90, %r6, 1;
+	@%p90 bra 	BB10_76;
 
-	setp.eq.s32	%p97, %r6, 0;
-	@%p97 bra 	BB9_128;
-	bra.uni 	BB9_70;
+	setp.eq.s32	%p93, %r6, 0;
+	@%p93 bra 	BB10_136;
+	bra.uni 	BB10_74;
 
-BB9_128:
-	add.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
+BB10_136:
+	add.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
 
-BB9_19:
+BB10_19:
 	setp.gt.s32	%p6, %r6, 13;
-	@%p6 bra 	BB9_28;
+	@%p6 bra 	BB10_28;
 
 	setp.gt.s32	%p13, %r6, 10;
-	@%p13 bra 	BB9_24;
+	@%p13 bra 	BB10_24;
 
 	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB9_46;
-	bra.uni 	BB9_22;
+	@%p17 bra 	BB10_48;
+	bra.uni 	BB10_22;
 
-BB9_46:
-	setp.eq.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB9_65;
+BB10_48:
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB10_69;
 
-BB9_83:
-	setp.gt.s32	%p74, %r6, 13;
-	@%p74 bra 	BB9_92;
+BB10_87:
+	setp.gt.s32	%p70, %r6, 13;
+	@%p70 bra 	BB10_96;
 
-	setp.gt.s32	%p81, %r6, 10;
-	@%p81 bra 	BB9_88;
+	setp.gt.s32	%p77, %r6, 10;
+	@%p77 bra 	BB10_92;
 
-	setp.eq.s32	%p85, %r6, 9;
-	@%p85 bra 	BB9_110;
-	bra.uni 	BB9_86;
+	setp.eq.s32	%p81, %r6, 9;
+	@%p81 bra 	BB10_116;
+	bra.uni 	BB10_90;
 
-BB9_110:
-	setp.eq.f64	%p114, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-	bra.uni 	BB9_129;
+BB10_116:
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB10_137;
 
-BB9_11:
+BB10_11:
 	setp.gt.s32	%p20, %r6, 5;
-	@%p20 bra 	BB9_15;
+	@%p20 bra 	BB10_15;
 
 	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB9_49;
-	bra.uni 	BB9_13;
+	@%p24 bra 	BB10_51;
+	bra.uni 	BB10_13;
 
-BB9_49:
+BB10_51:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd68;
@@ -1119,7 +1167,7 @@ BB9_49:
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p51, %rd2, -9223372036854775808;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1136,69 +1184,69 @@ BB9_49:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd97, [retval0+0];
+	ld.param.f64	%fd98, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p52, %r2, 0;
-	and.pred  	%p1, %p52, %p51;
-	@!%p1 bra 	BB9_51;
-	bra.uni 	BB9_50;
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;
+	@!%p1 bra 	BB10_53;
+	bra.uni 	BB10_52;
 
-BB9_50:
+BB10_52:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd97;
+	mov.b64 	{%temp, %r26}, %fd98;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd97;
+	mov.b64 	{%r28, %temp}, %fd98;
 	}
-	mov.b64 	%fd97, {%r28, %r27};
+	mov.b64 	%fd98, {%r28, %r27};
 
-BB9_51:
-	mov.f64 	%fd96, %fd97;
-	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
-	@%p53 bra 	BB9_54;
-	bra.uni 	BB9_52;
+BB10_53:
+	mov.f64 	%fd97, %fd98;
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB10_56;
+	bra.uni 	BB10_54;
 
-BB9_54:
-	selp.b32	%r29, %r2, 0, %p51;
+BB10_56:
+	selp.b32	%r29, %r2, 0, %p49;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p57, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p57;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p55;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd96, {%r32, %r31};
-	bra.uni 	BB9_55;
+	mov.b64 	%fd97, {%r32, %r31};
+	bra.uni 	BB10_57;
 
-BB9_28:
+BB10_28:
 	setp.gt.s32	%p7, %r6, 15;
-	@%p7 bra 	BB9_32;
+	@%p7 bra 	BB10_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB9_43;
-	bra.uni 	BB9_30;
+	@%p11 bra 	BB10_45;
+	bra.uni 	BB10_30;
 
-BB9_43:
+BB10_45:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p43, %r20, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-	bra.uni 	BB9_65;
+	setp.eq.s32	%p41, %r20, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB10_69;
 
-BB9_75:
-	setp.gt.s32	%p88, %r6, 5;
-	@%p88 bra 	BB9_79;
+BB10_79:
+	setp.gt.s32	%p84, %r6, 5;
+	@%p84 bra 	BB10_83;
 
-	setp.eq.s32	%p92, %r6, 4;
-	@%p92 bra 	BB9_113;
-	bra.uni 	BB9_77;
+	setp.eq.s32	%p88, %r6, 4;
+	@%p88 bra 	BB10_119;
+	bra.uni 	BB10_81;
 
-BB9_113:
+BB10_119:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1207,11 +1255,11 @@ BB9_113:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r61, %r5, 20, 11;
-	add.s32 	%r62, %r61, -1012;
+	bfe.u32 	%r62, %r5, 20, 11;
+	add.s32 	%r63, %r62, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r62;
-	setp.eq.s64	%p119, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r63;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1228,612 +1276,616 @@ BB9_113:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd105, [retval0+0];
+	ld.param.f64	%fd107, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p120, %r4, 0;
-	and.pred  	%p2, %p120, %p119;
-	@!%p2 bra 	BB9_115;
-	bra.uni 	BB9_114;
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;
+	@!%p2 bra 	BB10_121;
+	bra.uni 	BB10_120;
 
-BB9_114:
+BB10_120:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r63}, %fd105;
+	mov.b64 	{%temp, %r64}, %fd107;
 	}
-	xor.b32  	%r64, %r63, -2147483648;
+	xor.b32  	%r65, %r64, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r65, %temp}, %fd105;
+	mov.b64 	{%r66, %temp}, %fd107;
 	}
-	mov.b64 	%fd105, {%r65, %r64};
-
-BB9_115:
-	mov.f64 	%fd104, %fd105;
-	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
-	@%p121 bra 	BB9_118;
-	bra.uni 	BB9_116;
-
-BB9_118:
-	selp.b32	%r66, %r4, 0, %p119;
-	or.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p125, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p125;
-	mov.u32 	%r69, 0;
-	mov.b64 	%fd104, {%r69, %r68};
-	bra.uni 	BB9_119;
-
-BB9_92:
-	setp.gt.s32	%p75, %r6, 15;
-	@%p75 bra 	BB9_96;
-
-	setp.eq.s32	%p79, %r6, 14;
-	@%p79 bra 	BB9_107;
-	bra.uni 	BB9_94;
-
-BB9_107:
+	mov.b64 	%fd107, {%r66, %r65};
+
+BB10_121:
+	mov.f64 	%fd106, %fd107;
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB10_124;
+	bra.uni 	BB10_122;
+
+BB10_124:
+	selp.b32	%r67, %r4, 0, %p113;
+	or.b32  	%r68, %r67, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r69, %r68, %r67, %p119;
+	mov.u32 	%r70, 0;
+	mov.b64 	%fd106, {%r70, %r69};
+	bra.uni 	BB10_125;
+
+BB10_96:
+	setp.gt.s32	%p71, %r6, 15;
+	@%p71 bra 	BB10_100;
+
+	setp.eq.s32	%p75, %r6, 14;
+	@%p75 bra 	BB10_113;
+	bra.uni 	BB10_98;
+
+BB10_113:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r55, %rd15;
-	cvt.u32.u64	%r56, %rd16;
-	or.b32  	%r57, %r56, %r55;
-	setp.eq.s32	%p111, %r57, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-	bra.uni 	BB9_129;
-
-BB9_8:
+	cvt.u32.u64	%r56, %rd15;
+	cvt.u32.u64	%r57, %rd16;
+	or.b32  	%r58, %r57, %r56;
+	setp.eq.s32	%p105, %r58, 0;
+	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB10_137;
+
+BB10_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB9_63;
-	bra.uni 	BB9_9;
+	@%p27 bra 	BB10_67;
+	bra.uni 	BB10_9;
 
-BB9_63:
-	mul.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB9_65;
+BB10_67:
+	mul.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB10_69;
 
-BB9_24:
+BB10_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB9_45;
+	@%p14 bra 	BB10_47;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB9_44;
-	bra.uni 	BB9_26;
+	@%p15 bra 	BB10_46;
+	bra.uni 	BB10_26;
 
-BB9_44:
-	max.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB9_65;
+BB10_46:
+	max.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB10_69;
 
-BB9_15:
+BB10_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB9_48;
+	@%p21 bra 	BB10_50;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB9_47;
-	bra.uni 	BB9_17;
+	@%p22 bra 	BB10_49;
+	bra.uni 	BB10_17;
 
-BB9_47:
-	setp.lt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB9_65;
+BB10_49:
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB10_69;
 
-BB9_32:
+BB10_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB9_42;
+	@%p8 bra 	BB10_44;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB9_38;
-	bra.uni 	BB9_34;
+	@%p9 bra 	BB10_39;
+	bra.uni 	BB10_34;
 
-BB9_38:
-	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
-	or.pred  	%p37, %p35, %p36;
-	mov.f64 	%fd98, 0d7FF8000000000000;
-	@%p37 bra 	BB9_65;
+BB10_39:
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd99, 0d7FF8000000000000;
+	@%p36 bra 	BB10_69;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd72, %fd98;
-	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
-	@%p38 bra 	BB9_65;
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	abs.f64 	%fd72, %fd99;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
+	@%p37 bra 	BB10_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r15, %temp}, %fd98;
+	mov.b64 	{%temp, %r15}, %fd99;
 	}
+	and.b32  	%r16, %r15, 2147483647;
+	setp.ne.s32	%p38, %r16, 2146435072;
+	@%p38 bra 	BB10_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r16}, %fd98;
+	mov.b64 	{%r17, %temp}, %fd99;
 	}
-	and.b32  	%r17, %r16, 2147483647;
-	setp.ne.s32	%p39, %r17, 2146435072;
-	setp.ne.s32	%p40, %r15, 0;
-	or.pred  	%p41, %p39, %p40;
-	@!%p41 bra 	BB9_65;
-	bra.uni 	BB9_41;
+	setp.eq.s32	%p39, %r17, 0;
+	@%p39 bra 	BB10_69;
 
-BB9_41:
-	cvt.rmi.f64.f64	%fd73, %fd98;
+BB10_43:
+	cvt.rmi.f64.f64	%fd73, %fd99;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd98, %fd68, %fd74;
-	bra.uni 	BB9_65;
-
-BB9_72:
-	setp.eq.s32	%p95, %r6, 2;
-	@%p95 bra 	BB9_127;
-	bra.uni 	BB9_73;
-
-BB9_127:
-	mul.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
-
-BB9_88:
-	setp.eq.s32	%p82, %r6, 11;
-	@%p82 bra 	BB9_109;
-
-	setp.eq.s32	%p83, %r6, 12;
-	@%p83 bra 	BB9_108;
-	bra.uni 	BB9_90;
-
-BB9_108:
-	max.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
-
-BB9_79:
-	setp.eq.s32	%p89, %r6, 6;
-	@%p89 bra 	BB9_112;
-
-	setp.eq.s32	%p90, %r6, 7;
-	@%p90 bra 	BB9_111;
-	bra.uni 	BB9_81;
-
-BB9_111:
-	setp.gt.f64	%p116, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-	bra.uni 	BB9_129;
-
-BB9_96:
-	setp.eq.s32	%p76, %r6, 16;
-	@%p76 bra 	BB9_106;
-
-	setp.eq.s32	%p77, %r6, 17;
-	@%p77 bra 	BB9_102;
-	bra.uni 	BB9_98;
-
-BB9_102:
-	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
-	or.pred  	%p105, %p103, %p104;
-	mov.f64 	%fd106, 0d7FF8000000000000;
-	@%p105 bra 	BB9_129;
-
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd83, %fd106;
-	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
-	@%p106 bra 	BB9_129;
+	sub.f64 	%fd99, %fd68, %fd74;
+	bra.uni 	BB10_69;
+
+BB10_76:
+	setp.eq.s32	%p91, %r6, 2;
+	@%p91 bra 	BB10_135;
+	bra.uni 	BB10_77;
+
+BB10_135:
+	mul.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
+
+BB10_92:
+	setp.eq.s32	%p78, %r6, 11;
+	@%p78 bra 	BB10_115;
+
+	setp.eq.s32	%p79, %r6, 12;
+	@%p79 bra 	BB10_114;
+	bra.uni 	BB10_94;
+
+BB10_114:
+	max.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
+
+BB10_83:
+	setp.eq.s32	%p85, %r6, 6;
+	@%p85 bra 	BB10_118;
+
+	setp.eq.s32	%p86, %r6, 7;
+	@%p86 bra 	BB10_117;
+	bra.uni 	BB10_85;
+
+BB10_117:
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB10_137;
+
+BB10_100:
+	setp.eq.s32	%p72, %r6, 16;
+	@%p72 bra 	BB10_112;
+
+	setp.eq.s32	%p73, %r6, 17;
+	@%p73 bra 	BB10_107;
+	bra.uni 	BB10_102;
+
+BB10_107:
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd108, 0d7FF8000000000000;
+	@%p100 bra 	BB10_137;
+
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	abs.f64 	%fd83, %fd108;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
+	@%p101 bra 	BB10_137;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd106;
+	mov.b64 	{%temp, %r53}, %fd108;
 	}
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p102, %r54, 2146435072;
+	@%p102 bra 	BB10_111;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd106;
+	mov.b64 	{%r55, %temp}, %fd108;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p107, %r54, 2146435072;
-	setp.ne.s32	%p108, %r52, 0;
-	or.pred  	%p109, %p107, %p108;
-	@!%p109 bra 	BB9_129;
-	bra.uni 	BB9_105;
-
-BB9_105:
-	cvt.rmi.f64.f64	%fd84, %fd106;
+	setp.eq.s32	%p103, %r55, 0;
+	@%p103 bra 	BB10_137;
+
+BB10_111:
+	cvt.rmi.f64.f64	%fd84, %fd108;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd106, %fd1, %fd85;
-	bra.uni 	BB9_129;
+	sub.f64 	%fd108, %fd1, %fd85;
+	bra.uni 	BB10_137;
 
-BB9_6:
+BB10_6:
 	setp.eq.s32	%p30, %r6, 1;
-	@%p30 bra 	BB9_7;
-	bra.uni 	BB9_65;
+	@%p30 bra 	BB10_7;
+	bra.uni 	BB10_69;
 
-BB9_7:
-	sub.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB9_65;
+BB10_7:
+	sub.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB10_69;
 
-BB9_22:
+BB10_22:
 	setp.eq.s32	%p18, %r6, 10;
-	@%p18 bra 	BB9_23;
-	bra.uni 	BB9_65;
+	@%p18 bra 	BB10_23;
+	bra.uni 	BB10_69;
 
-BB9_23:
-	setp.neu.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB9_65;
+BB10_23:
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB10_69;
 
-BB9_13:
+BB10_13:
 	setp.eq.s32	%p25, %r6, 5;
-	@%p25 bra 	BB9_14;
-	bra.uni 	BB9_65;
+	@%p25 bra 	BB10_14;
+	bra.uni 	BB10_69;
 
-BB9_14:
-	setp.gt.f64	%p50, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB9_65;
+BB10_14:
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB10_69;
 
-BB9_30:
+BB10_30:
 	setp.eq.s32	%p12, %r6, 15;
-	@%p12 bra 	BB9_31;
-	bra.uni 	BB9_65;
+	@%p12 bra 	BB10_31;
+	bra.uni 	BB10_69;
 
-BB9_31:
+BB10_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd98, %fd77, %fd76;
-	bra.uni 	BB9_65;
+	sub.f64 	%fd99, %fd77, %fd76;
+	bra.uni 	BB10_69;
 
-BB9_9:
+BB10_9:
 	setp.eq.s32	%p28, %r6, 3;
-	@%p28 bra 	BB9_10;
-	bra.uni 	BB9_65;
+	@%p28 bra 	BB10_10;
+	bra.uni 	BB10_69;
 
-BB9_10:
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB9_65;
+BB10_10:
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB10_69;
 
-BB9_45:
-	min.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB9_65;
+BB10_47:
+	min.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB10_69;
 
-BB9_26:
+BB10_26:
 	setp.eq.s32	%p16, %r6, 13;
-	@%p16 bra 	BB9_27;
-	bra.uni 	BB9_65;
+	@%p16 bra 	BB10_27;
+	bra.uni 	BB10_69;
 
-BB9_27:
+BB10_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p44, %r23, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-	bra.uni 	BB9_65;
+	setp.eq.s32	%p42, %r23, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB10_69;
 
-BB9_48:
-	setp.ge.f64	%p49, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB9_65;
+BB10_50:
+	setp.ge.f64	%p47, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB10_69;
 
-BB9_17:
+BB10_17:
 	setp.eq.s32	%p23, %r6, 8;
-	@%p23 bra 	BB9_18;
-	bra.uni 	BB9_65;
+	@%p23 bra 	BB10_18;
+	bra.uni 	BB10_69;
 
-BB9_18:
-	setp.le.f64	%p47, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB9_65;
+BB10_18:
+	setp.le.f64	%p45, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
+	bra.uni 	BB10_69;
 
-BB9_42:
-	setp.neu.f64	%p42, %fd68, 0d0000000000000000;
+BB10_44:
+	setp.neu.f64	%p40, %fd68, 0d0000000000000000;
 	sub.f64 	%fd75, %fd68, %fd1;
-	selp.f64	%fd98, %fd75, 0d0000000000000000, %p42;
-	bra.uni 	BB9_65;
+	selp.f64	%fd99, %fd75, 0d0000000000000000, %p40;
+	bra.uni 	BB10_69;
 
-BB9_34:
+BB10_34:
 	setp.ne.s32	%p10, %r6, 18;
-	@%p10 bra 	BB9_65;
+	@%p10 bra 	BB10_69;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd70, %fd98;
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	abs.f64 	%fd70, %fd99;
 	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
-	@%p31 bra 	BB9_65;
+	@%p31 bra 	BB10_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r12, %temp}, %fd98;
+	mov.b64 	{%temp, %r12}, %fd99;
 	}
+	and.b32  	%r13, %r12, 2147483647;
+	setp.ne.s32	%p32, %r13, 2146435072;
+	@%p32 bra 	BB10_38;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r13}, %fd98;
+	mov.b64 	{%r14, %temp}, %fd99;
 	}
-	and.b32  	%r14, %r13, 2147483647;
-	setp.ne.s32	%p32, %r14, 2146435072;
-	setp.ne.s32	%p33, %r12, 0;
-	or.pred  	%p34, %p32, %p33;
-	@!%p34 bra 	BB9_65;
-	bra.uni 	BB9_37;
-
-BB9_37:
-	cvt.rmi.f64.f64	%fd98, %fd98;
-	bra.uni 	BB9_65;
-
-BB9_70:
-	setp.eq.s32	%p98, %r6, 1;
-	@%p98 bra 	BB9_71;
-	bra.uni 	BB9_129;
-
-BB9_71:
-	sub.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
-
-BB9_86:
-	setp.eq.s32	%p86, %r6, 10;
-	@%p86 bra 	BB9_87;
-	bra.uni 	BB9_129;
-
-BB9_87:
-	setp.neu.f64	%p113, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
-	bra.uni 	BB9_129;
-
-BB9_77:
-	setp.eq.s32	%p93, %r6, 5;
-	@%p93 bra 	BB9_78;
-	bra.uni 	BB9_129;
-
-BB9_78:
-	setp.lt.f64	%p118, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
-	bra.uni 	BB9_129;
-
-BB9_94:
-	setp.eq.s32	%p80, %r6, 15;
-	@%p80 bra 	BB9_95;
-	bra.uni 	BB9_129;
-
-BB9_95:
+	setp.eq.s32	%p33, %r14, 0;
+	@%p33 bra 	BB10_69;
+
+BB10_38:
+	cvt.rmi.f64.f64	%fd99, %fd99;
+	bra.uni 	BB10_69;
+
+BB10_74:
+	setp.eq.s32	%p94, %r6, 1;
+	@%p94 bra 	BB10_75;
+	bra.uni 	BB10_137;
+
+BB10_75:
+	sub.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
+
+BB10_90:
+	setp.eq.s32	%p82, %r6, 10;
+	@%p82 bra 	BB10_91;
+	bra.uni 	BB10_137;
+
+BB10_91:
+	setp.neu.f64	%p107, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
+	bra.uni 	BB10_137;
+
+BB10_81:
+	setp.eq.s32	%p89, %r6, 5;
+	@%p89 bra 	BB10_82;
+	bra.uni 	BB10_137;
+
+BB10_82:
+	setp.lt.f64	%p112, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
+	bra.uni 	BB10_137;
+
+BB10_98:
+	setp.eq.s32	%p76, %r6, 15;
+	@%p76 bra 	BB10_99;
+	bra.uni 	BB10_137;
+
+BB10_99:
 	mul.f64 	%fd87, %fd1, %fd68;
 	mov.f64 	%fd88, 0d3FF0000000000000;
-	sub.f64 	%fd106, %fd88, %fd87;
-	bra.uni 	BB9_129;
+	sub.f64 	%fd108, %fd88, %fd87;
+	bra.uni 	BB10_137;
 
-BB9_73:
-	setp.eq.s32	%p96, %r6, 3;
-	@%p96 bra 	BB9_74;
-	bra.uni 	BB9_129;
+BB10_77:
+	setp.eq.s32	%p92, %r6, 3;
+	@%p92 bra 	BB10_78;
+	bra.uni 	BB10_137;
 
-BB9_74:
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
+BB10_78:
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
 
-BB9_109:
-	min.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB9_129;
+BB10_115:
+	min.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB10_137;
 
-BB9_90:
-	setp.eq.s32	%p84, %r6, 13;
-	@%p84 bra 	BB9_91;
-	bra.uni 	BB9_129;
+BB10_94:
+	setp.eq.s32	%p80, %r6, 13;
+	@%p80 bra 	BB10_95;
+	bra.uni 	BB10_137;
 
-BB9_91:
+BB10_95:
 	cvt.rni.s64.f64	%rd17, %fd1;
 	cvt.rni.s64.f64	%rd18, %fd68;
-	cvt.u32.u64	%r58, %rd17;
-	cvt.u32.u64	%r59, %rd18;
-	and.b32  	%r60, %r59, %r58;
-	setp.eq.s32	%p112, %r60, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
-	bra.uni 	BB9_129;
-
-BB9_112:
-	setp.le.f64	%p117, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
-	bra.uni 	BB9_129;
-
-BB9_81:
-	setp.eq.s32	%p91, %r6, 8;
-	@%p91 bra 	BB9_82;
-	bra.uni 	BB9_129;
-
-BB9_82:
-	setp.ge.f64	%p115, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
-	bra.uni 	BB9_129;
-
-BB9_106:
-	setp.neu.f64	%p110, %fd1, 0d0000000000000000;
+	cvt.u32.u64	%r59, %rd17;
+	cvt.u32.u64	%r60, %rd18;
+	and.b32  	%r61, %r60, %r59;
+	setp.eq.s32	%p106, %r61, 0;
+	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
+	bra.uni 	BB10_137;
+
+BB10_118:
+	setp.le.f64	%p111, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
+	bra.uni 	BB10_137;
+
+BB10_85:
+	setp.eq.s32	%p87, %r6, 8;
+	@%p87 bra 	BB10_86;
+	bra.uni 	BB10_137;
+
+BB10_86:
+	setp.ge.f64	%p109, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
+	bra.uni 	BB10_137;
+
+BB10_112:
+	setp.neu.f64	%p104, %fd1, 0d0000000000000000;
 	sub.f64 	%fd86, %fd1, %fd68;
-	selp.f64	%fd106, %fd86, 0d0000000000000000, %p110;
-	bra.uni 	BB9_129;
+	selp.f64	%fd108, %fd86, 0d0000000000000000, %p104;
+	bra.uni 	BB10_137;
 
-BB9_98:
-	setp.ne.s32	%p78, %r6, 18;
-	@%p78 bra 	BB9_129;
+BB10_102:
+	setp.ne.s32	%p74, %r6, 18;
+	@%p74 bra 	BB10_137;
 
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd81, %fd106;
-	setp.gtu.f64	%p99, %fd81, 0d7FF0000000000000;
-	@%p99 bra 	BB9_129;
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	abs.f64 	%fd81, %fd108;
+	setp.gtu.f64	%p95, %fd81, 0d7FF0000000000000;
+	@%p95 bra 	BB10_137;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r49, %temp}, %fd106;
+	mov.b64 	{%temp, %r50}, %fd108;
 	}
+	and.b32  	%r51, %r50, 2147483647;
+	setp.ne.s32	%p96, %r51, 2146435072;
+	@%p96 bra 	BB10_106;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r50}, %fd106;
+	mov.b64 	{%r52, %temp}, %fd108;
 	}
-	and.b32  	%r51, %r50, 2147483647;
-	setp.ne.s32	%p100, %r51, 2146435072;
-	setp.ne.s32	%p101, %r49, 0;
-	or.pred  	%p102, %p100, %p101;
-	@!%p102 bra 	BB9_129;
-	bra.uni 	BB9_101;
+	setp.eq.s32	%p97, %r52, 0;
+	@%p97 bra 	BB10_137;
 
-BB9_101:
-	cvt.rmi.f64.f64	%fd106, %fd106;
-	bra.uni 	BB9_129;
+BB10_106:
+	cvt.rmi.f64.f64	%fd108, %fd108;
+	bra.uni 	BB10_137;
 
-BB9_52:
-	setp.gt.s32	%p54, %r2, -1;
-	@%p54 bra 	BB9_55;
+BB10_54:
+	setp.gt.s32	%p52, %r2, -1;
+	@%p52 bra 	BB10_57;
 
 	cvt.rzi.f64.f64	%fd78, %fd1;
-	setp.neu.f64	%p55, %fd78, %fd1;
-	selp.f64	%fd96, 0dFFF8000000000000, %fd96, %p55;
+	setp.neu.f64	%p53, %fd78, %fd1;
+	selp.f64	%fd97, 0dFFF8000000000000, %fd97, %p53;
 
-BB9_55:
-	mov.f64 	%fd24, %fd96;
+BB10_57:
+	mov.f64 	%fd24, %fd97;
 	add.f64 	%fd25, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r33}, %fd25;
 	}
 	and.b32  	%r34, %r33, 2146435072;
-	setp.ne.s32	%p58, %r34, 2146435072;
-	mov.f64 	%fd95, %fd24;
-	@%p58 bra 	BB9_62;
+	setp.ne.s32	%p56, %r34, 2146435072;
+	mov.f64 	%fd96, %fd24;
+	@%p56 bra 	BB10_66;
 
-	setp.gtu.f64	%p59, %fd18, 0d7FF0000000000000;
-	mov.f64 	%fd95, %fd25;
-	@%p59 bra 	BB9_62;
+	setp.gtu.f64	%p57, %fd18, 0d7FF0000000000000;
+	mov.f64 	%fd96, %fd25;
+	@%p57 bra 	BB10_66;
 
 	abs.f64 	%fd79, %fd1;
-	setp.gtu.f64	%p60, %fd79, 0d7FF0000000000000;
-	mov.f64 	%fd94, %fd25;
-	mov.f64 	%fd95, %fd94;
-	@%p60 bra 	BB9_62;
+	setp.gtu.f64	%p58, %fd79, 0d7FF0000000000000;
+	mov.f64 	%fd95, %fd25;
+	mov.f64 	%fd96, %fd95;
+	@%p58 bra 	BB10_66;
+
+	and.b32  	%r35, %r3, 2147483647;
+	setp.ne.s32	%p59, %r35, 2146435072;
+	@%p59 bra 	BB10_62;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd1;
+	mov.b64 	{%r36, %temp}, %fd1;
 	}
-	and.b32  	%r36, %r3, 2147483647;
-	setp.eq.s32	%p61, %r36, 2146435072;
-	setp.eq.s32	%p62, %r35, 0;
-	and.pred  	%p63, %p61, %p62;
-	@%p63 bra 	BB9_61;
-	bra.uni 	BB9_59;
+	setp.eq.s32	%p60, %r36, 0;
+	@%p60 bra 	BB10_65;
 
-BB9_61:
-	setp.gt.f64	%p67, %fd18, 0d3FF0000000000000;
-	selp.b32	%r44, 2146435072, 0, %p67;
-	xor.b32  	%r45, %r44, 2146435072;
-	setp.lt.s32	%p68, %r3, 0;
-	selp.b32	%r46, %r45, %r44, %p68;
-	setp.eq.f64	%p69, %fd68, 0dBFF0000000000000;
-	selp.b32	%r47, 1072693248, %r46, %p69;
-	mov.u32 	%r48, 0;
-	mov.b64 	%fd95, {%r48, %r47};
-	bra.uni 	BB9_62;
-
-BB9_116:
-	setp.gt.s32	%p122, %r4, -1;
-	@%p122 bra 	BB9_119;
+BB10_62:
+	and.b32  	%r37, %r2, 2147483647;
+	setp.ne.s32	%p61, %r37, 2146435072;
+	mov.f64 	%fd93, %fd24;
+	mov.f64 	%fd96, %fd93;
+	@%p61 bra 	BB10_66;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r38, %temp}, %fd68;
+	}
+	setp.ne.s32	%p62, %r38, 0;
+	mov.f64 	%fd96, %fd24;
+	@%p62 bra 	BB10_66;
+
+	shr.s32 	%r39, %r3, 31;
+	and.b32  	%r40, %r39, -2146435072;
+	add.s32 	%r41, %r40, 2146435072;
+	or.b32  	%r42, %r41, -2147483648;
+	selp.b32	%r43, %r42, %r41, %p1;
+	mov.u32 	%r44, 0;
+	mov.b64 	%fd96, {%r44, %r43};
+	bra.uni 	BB10_66;
+
+BB10_122:
+	setp.gt.s32	%p116, %r4, -1;
+	@%p116 bra 	BB10_125;
 
 	cvt.rzi.f64.f64	%fd89, %fd68;
-	setp.neu.f64	%p123, %fd89, %fd68;
-	selp.f64	%fd104, 0dFFF8000000000000, %fd104, %p123;
+	setp.neu.f64	%p117, %fd89, %fd68;
+	selp.f64	%fd106, 0dFFF8000000000000, %fd106, %p117;
 
-BB9_119:
-	mov.f64 	%fd57, %fd104;
+BB10_125:
+	mov.f64 	%fd57, %fd106;
 	add.f64 	%fd58, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r70}, %fd58;
+	mov.b64 	{%temp, %r71}, %fd58;
 	}
-	and.b32  	%r71, %r70, 2146435072;
-	setp.ne.s32	%p126, %r71, 2146435072;
-	mov.f64 	%fd103, %fd57;
-	@%p126 bra 	BB9_126;
+	and.b32  	%r72, %r71, 2146435072;
+	setp.ne.s32	%p120, %r72, 2146435072;
+	mov.f64 	%fd105, %fd57;
+	@%p120 bra 	BB10_134;
 
-	setp.gtu.f64	%p127, %fd51, 0d7FF0000000000000;
-	mov.f64 	%fd103, %fd58;
-	@%p127 bra 	BB9_126;
+	setp.gtu.f64	%p121, %fd51, 0d7FF0000000000000;
+	mov.f64 	%fd105, %fd58;
+	@%p121 bra 	BB10_134;
 
 	abs.f64 	%fd90, %fd68;
-	setp.gtu.f64	%p128, %fd90, 0d7FF0000000000000;
-	mov.f64 	%fd102, %fd58;
-	mov.f64 	%fd103, %fd102;
-	@%p128 bra 	BB9_126;
+	setp.gtu.f64	%p122, %fd90, 0d7FF0000000000000;
+	mov.f64 	%fd104, %fd58;
+	mov.f64 	%fd105, %fd104;
+	@%p122 bra 	BB10_134;
 
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r72, %temp}, %fd68;
-	}
 	and.b32  	%r73, %r5, 2147483647;
-	setp.eq.s32	%p129, %r73, 2146435072;
-	setp.eq.s32	%p130, %r72, 0;
-	and.pred  	%p131, %p129, %p130;
-	@%p131 bra 	BB9_125;
-	bra.uni 	BB9_123;
-
-BB9_125:
-	setp.gt.f64	%p135, %fd51, 0d3FF0000000000000;
-	selp.b32	%r81, 2146435072, 0, %p135;
-	xor.b32  	%r82, %r81, 2146435072;
-	setp.lt.s32	%p136, %r5, 0;
-	selp.b32	%r83, %r82, %r81, %p136;
-	setp.eq.f64	%p137, %fd1, 0dBFF0000000000000;
-	selp.b32	%r84, 1072693248, %r83, %p137;
-	mov.u32 	%r85, 0;
-	mov.b64 	%fd103, {%r85, %r84};
-	bra.uni 	BB9_126;
+	setp.ne.s32	%p123, %r73, 2146435072;
+	@%p123 bra 	BB10_130;
 
-BB9_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r37, %temp}, %fd68;
+	mov.b64 	{%r74, %temp}, %fd68;
 	}
-	and.b32  	%r38, %r2, 2147483647;
-	setp.eq.s32	%p64, %r38, 2146435072;
-	setp.eq.s32	%p65, %r37, 0;
-	and.pred  	%p66, %p64, %p65;
-	mov.f64 	%fd95, %fd24;
-	@!%p66 bra 	BB9_62;
-	bra.uni 	BB9_60;
+	setp.eq.s32	%p124, %r74, 0;
+	@%p124 bra 	BB10_133;
 
-BB9_60:
-	shr.s32 	%r39, %r3, 31;
-	and.b32  	%r40, %r39, -2146435072;
-	selp.b32	%r41, -1048576, 2146435072, %p1;
-	add.s32 	%r42, %r41, %r40;
-	mov.u32 	%r43, 0;
-	mov.b64 	%fd95, {%r43, %r42};
-
-BB9_62:
-	setp.eq.f64	%p70, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p71, %fd68, 0d3FF0000000000000;
-	or.pred  	%p72, %p71, %p70;
-	selp.f64	%fd98, 0d3FF0000000000000, %fd95, %p72;
-
-BB9_65:
-	st.global.f64 	[%rd1], %fd98;
-	bra.uni 	BB9_130;
+BB10_130:
+	and.b32  	%r75, %r4, 2147483647;
+	setp.ne.s32	%p125, %r75, 2146435072;
+	mov.f64 	%fd102, %fd57;
+	mov.f64 	%fd105, %fd102;
+	@%p125 bra 	BB10_134;
 
-BB9_123:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r74, %temp}, %fd1;
+	mov.b64 	{%r76, %temp}, %fd1;
 	}
-	and.b32  	%r75, %r4, 2147483647;
-	setp.eq.s32	%p132, %r75, 2146435072;
-	setp.eq.s32	%p133, %r74, 0;
-	and.pred  	%p134, %p132, %p133;
-	mov.f64 	%fd103, %fd57;
-	@!%p134 bra 	BB9_126;
-	bra.uni 	BB9_124;
-
-BB9_124:
-	shr.s32 	%r76, %r5, 31;
-	and.b32  	%r77, %r76, -2146435072;
-	selp.b32	%r78, -1048576, 2146435072, %p2;
-	add.s32 	%r79, %r78, %r77;
-	mov.u32 	%r80, 0;
-	mov.b64 	%fd103, {%r80, %r79};
-
-BB9_126:
-	setp.eq.f64	%p138, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p139, %fd1, 0d3FF0000000000000;
-	or.pred  	%p140, %p139, %p138;
-	selp.f64	%fd106, 0d3FF0000000000000, %fd103, %p140;
-
-BB9_129:
-	st.global.f64 	[%rd1], %fd106;
-
-BB9_130:
+	setp.ne.s32	%p126, %r76, 0;
+	mov.f64 	%fd105, %fd57;
+	@%p126 bra 	BB10_134;
+
+	shr.s32 	%r77, %r5, 31;
+	and.b32  	%r78, %r77, -2146435072;
+	add.s32 	%r79, %r78, 2146435072;
+	or.b32  	%r80, %r79, -2147483648;
+	selp.b32	%r81, %r80, %r79, %p2;
+	mov.u32 	%r82, 0;
+	mov.b64 	%fd105, {%r82, %r81};
+	bra.uni 	BB10_134;
+
+BB10_65:
+	setp.gt.f64	%p63, %fd18, 0d3FF0000000000000;
+	selp.b32	%r45, 2146435072, 0, %p63;
+	xor.b32  	%r46, %r45, 2146435072;
+	setp.lt.s32	%p64, %r3, 0;
+	selp.b32	%r47, %r46, %r45, %p64;
+	setp.eq.f64	%p65, %fd68, 0dBFF0000000000000;
+	selp.b32	%r48, 1072693248, %r47, %p65;
+	mov.u32 	%r49, 0;
+	mov.b64 	%fd96, {%r49, %r48};
+
+BB10_66:
+	setp.eq.f64	%p66, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p67, %fd68, 0d3FF0000000000000;
+	or.pred  	%p68, %p67, %p66;
+	selp.f64	%fd99, 0d3FF0000000000000, %fd96, %p68;
+
+BB10_69:
+	st.global.f64 	[%rd1], %fd99;
+	bra.uni 	BB10_138;
+
+BB10_133:
+	setp.gt.f64	%p127, %fd51, 0d3FF0000000000000;
+	selp.b32	%r83, 2146435072, 0, %p127;
+	xor.b32  	%r84, %r83, 2146435072;
+	setp.lt.s32	%p128, %r5, 0;
+	selp.b32	%r85, %r84, %r83, %p128;
+	setp.eq.f64	%p129, %fd1, 0dBFF0000000000000;
+	selp.b32	%r86, 1072693248, %r85, %p129;
+	mov.u32 	%r87, 0;
+	mov.b64 	%fd105, {%r87, %r86};
+
+BB10_134:
+	setp.eq.f64	%p130, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p131, %fd1, 0d3FF0000000000000;
+	or.pred  	%p132, %p131, %p130;
+	selp.f64	%fd108, 0d3FF0000000000000, %fd105, %p132;
+
+BB10_137:
+	st.global.f64 	[%rd1], %fd108;
+
+BB10_138:
 	bar.sync 	0;
 	ret;
 }
@@ -1859,14 +1911,14 @@ BB9_130:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB10_2;
+	@%p1 bra 	BB11_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB10_2:
+BB11_2:
 	ret;
 }
 
@@ -1906,10 +1958,10 @@ BB10_2:
 	setp.lt.s32	%p1, %r1, %r7;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB11_2;
-	bra.uni 	BB11_1;
+	@!%p3 bra 	BB12_2;
+	bra.uni 	BB12_1;
 
-BB11_1:
+BB12_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r13, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r13, 8;
@@ -1920,14 +1972,14 @@ BB11_1:
 	add.s64 	%rd9, %rd1, %rd8;
 	st.global.f64 	[%rd9], %fd1;
 
-BB11_2:
+BB12_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB11_4;
-	bra.uni 	BB11_3;
+	@!%p6 bra 	BB12_4;
+	bra.uni 	BB12_3;
 
-BB11_3:
+BB12_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	mad.lo.s32 	%r15, %r1, %r6, %r2;
 	mul.wide.s32 	%rd11, %r15, 8;
@@ -1939,7 +1991,7 @@ BB11_3:
 	add.s64 	%rd14, %rd1, %rd13;
 	st.global.f64 	[%rd14], %fd2;
 
-BB11_4:
+BB12_4:
 	ret;
 }
 
@@ -1978,10 +2030,10 @@ BB11_4:
 	setp.lt.s32	%p1, %r1, %r3;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB12_2;
-	bra.uni 	BB12_1;
+	@!%p3 bra 	BB13_2;
+	bra.uni 	BB13_1;
 
-BB12_1:
+BB13_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r12, 8;
@@ -1990,14 +2042,14 @@ BB12_1:
 	add.s64 	%rd8, %rd1, %rd6;
 	st.global.f64 	[%rd8], %fd1;
 
-BB12_2:
+BB13_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB12_4;
-	bra.uni 	BB12_3;
+	@!%p6 bra 	BB13_4;
+	bra.uni 	BB13_3;
 
-BB12_3:
+BB13_3:
 	cvta.to.global.u64 	%rd9, %rd3;
 	mad.lo.s32 	%r13, %r1, %r6, %r2;
 	mul.wide.s32 	%rd10, %r13, 8;
@@ -2009,7 +2061,7 @@ BB12_3:
 	add.s64 	%rd13, %rd1, %rd12;
 	st.global.f64 	[%rd13], %fd2;
 
-BB12_4:
+BB13_4:
 	ret;
 }
 
@@ -2037,9 +2089,9 @@ BB12_4:
 	mov.f64 	%fd76, 0d0000000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB13_4;
+	@%p1 bra 	BB14_4;
 
-BB13_1:
+BB14_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2048,23 +2100,23 @@ BB13_1:
 	add.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB13_3;
+	@%p2 bra 	BB14_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	add.f64 	%fd78, %fd78, %fd31;
 
-BB13_3:
+BB14_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB13_1;
+	@%p3 bra 	BB14_1;
 
-BB13_4:
+BB14_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2072,130 +2124,130 @@ BB13_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB13_8;
+	@%p4 bra 	BB14_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB13_7;
+	@%p5 bra 	BB14_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB13_7:
+BB14_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB13_8:
+BB14_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB13_12;
+	@%p6 bra 	BB14_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB13_11;
+	@%p7 bra 	BB14_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB13_11:
+BB14_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB13_12:
+BB14_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB13_16;
+	@%p8 bra 	BB14_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB13_15;
+	@%p9 bra 	BB14_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB13_15:
+BB14_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB13_16:
+BB14_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB13_20;
+	@%p10 bra 	BB14_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB13_19;
+	@%p11 bra 	BB14_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB13_19:
+BB14_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB13_20:
+BB14_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB13_33;
+	@%p12 bra 	BB14_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB13_23;
+	@%p13 bra 	BB14_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB13_23:
+BB14_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB13_25;
+	@%p14 bra 	BB14_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB13_25:
+BB14_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB13_27;
+	@%p15 bra 	BB14_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB13_27:
+BB14_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB13_29;
+	@%p16 bra 	BB14_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB13_29:
+BB14_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB13_31;
+	@%p17 bra 	BB14_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB13_31:
+BB14_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB13_33;
+	@%p18 bra 	BB14_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	add.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB13_33:
+BB14_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB13_35;
+	@%p19 bra 	BB14_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2203,7 +2255,7 @@ BB13_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB13_35:
+BB14_35:
 	ret;
 }
 
@@ -2227,17 +2279,17 @@ BB13_35:
 	ld.param.u32 	%r4, [reduce_row_sum_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB14_35;
+	@%p1 bra 	BB15_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d0000000000000000;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB14_4;
+	@%p2 bra 	BB15_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB14_3:
+BB15_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2247,9 +2299,9 @@ BB14_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB14_3;
+	@%p3 bra 	BB15_3;
 
-BB14_4:
+BB15_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2259,130 +2311,130 @@ BB14_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB14_8;
+	@%p4 bra 	BB15_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB14_7;
+	@%p5 bra 	BB15_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB14_7:
+BB15_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB14_8:
+BB15_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB14_12;
+	@%p6 bra 	BB15_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB14_11;
+	@%p7 bra 	BB15_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB14_11:
+BB15_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB14_12:
+BB15_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB14_16;
+	@%p8 bra 	BB15_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB14_15;
+	@%p9 bra 	BB15_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB14_15:
+BB15_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB14_16:
+BB15_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB14_20;
+	@%p10 bra 	BB15_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB14_19;
+	@%p11 bra 	BB15_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB14_19:
+BB15_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB14_20:
+BB15_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB14_33;
+	@%p12 bra 	BB15_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB14_23;
+	@%p13 bra 	BB15_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB14_23:
+BB15_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB14_25;
+	@%p14 bra 	BB15_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB14_25:
+BB15_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB14_27;
+	@%p15 bra 	BB15_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB14_27:
+BB15_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB14_29;
+	@%p16 bra 	BB15_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB14_29:
+BB15_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB14_31;
+	@%p17 bra 	BB15_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB14_31:
+BB15_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB14_33;
+	@%p18 bra 	BB15_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB14_33:
+BB15_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB14_35;
+	@%p19 bra 	BB15_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2390,7 +2442,7 @@ BB14_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB14_35:
+BB15_35:
 	ret;
 }
 
@@ -2417,18 +2469,18 @@ BB14_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB15_5;
+	@%p1 bra 	BB16_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB15_4;
+	@%p2 bra 	BB16_4;
 
 	mov.u32 	%r10, %r1;
 
-BB15_3:
+BB16_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2438,15 +2490,15 @@ BB15_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB15_3;
+	@%p3 bra 	BB16_3;
 
-BB15_4:
+BB16_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB15_5:
+BB16_5:
 	ret;
 }
 
@@ -2474,9 +2526,9 @@ BB15_5:
 	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB16_4;
+	@%p1 bra 	BB17_4;
 
-BB16_1:
+BB17_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2485,23 +2537,23 @@ BB16_1:
 	max.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB16_3;
+	@%p2 bra 	BB17_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	max.f64 	%fd78, %fd78, %fd31;
 
-BB16_3:
+BB17_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB16_1;
+	@%p3 bra 	BB17_1;
 
-BB16_4:
+BB17_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2509,130 +2561,130 @@ BB16_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB16_8;
+	@%p4 bra 	BB17_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB16_7;
+	@%p5 bra 	BB17_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB16_7:
+BB17_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB16_8:
+BB17_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB16_12;
+	@%p6 bra 	BB17_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB16_11;
+	@%p7 bra 	BB17_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB16_11:
+BB17_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB16_12:
+BB17_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB16_16;
+	@%p8 bra 	BB17_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB16_15;
+	@%p9 bra 	BB17_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB16_15:
+BB17_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB16_16:
+BB17_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB16_20;
+	@%p10 bra 	BB17_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB16_19;
+	@%p11 bra 	BB17_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB16_19:
+BB17_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB16_20:
+BB17_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB16_33;
+	@%p12 bra 	BB17_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB16_23;
+	@%p13 bra 	BB17_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB16_23:
+BB17_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB16_25;
+	@%p14 bra 	BB17_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB16_25:
+BB17_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB16_27;
+	@%p15 bra 	BB17_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB16_27:
+BB17_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB16_29;
+	@%p16 bra 	BB17_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB16_29:
+BB17_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB16_31;
+	@%p17 bra 	BB17_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB16_31:
+BB17_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB16_33;
+	@%p18 bra 	BB17_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	max.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB16_33:
+BB17_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB16_35;
+	@%p19 bra 	BB17_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2640,7 +2692,7 @@ BB16_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB16_35:
+BB17_35:
 	ret;
 }
 
@@ -2664,17 +2716,17 @@ BB16_35:
 	ld.param.u32 	%r4, [reduce_row_max_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB17_35;
+	@%p1 bra 	BB18_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB17_4;
+	@%p2 bra 	BB18_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB17_3:
+BB18_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2684,9 +2736,9 @@ BB17_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB17_3;
+	@%p3 bra 	BB18_3;
 
-BB17_4:
+BB18_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2696,130 +2748,130 @@ BB17_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB17_8;
+	@%p4 bra 	BB18_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB17_7;
+	@%p5 bra 	BB18_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	max.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB17_7:
+BB18_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB17_8:
+BB18_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB17_12;
+	@%p6 bra 	BB18_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB17_11;
+	@%p7 bra 	BB18_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	max.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB17_11:
+BB18_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB17_12:
+BB18_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB17_16;
+	@%p8 bra 	BB18_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB17_15;
+	@%p9 bra 	BB18_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	max.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB17_15:
+BB18_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB17_16:
+BB18_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB17_20;
+	@%p10 bra 	BB18_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB17_19;
+	@%p11 bra 	BB18_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	max.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB17_19:
+BB18_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB17_20:
+BB18_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB17_33;
+	@%p12 bra 	BB18_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB17_23;
+	@%p13 bra 	BB18_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	max.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB17_23:
+BB18_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB17_25;
+	@%p14 bra 	BB18_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	max.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB17_25:
+BB18_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB17_27;
+	@%p15 bra 	BB18_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	max.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB17_27:
+BB18_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB17_29;
+	@%p16 bra 	BB18_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	max.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB17_29:
+BB18_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB17_31;
+	@%p17 bra 	BB18_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	max.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB17_31:
+BB18_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB17_33;
+	@%p18 bra 	BB18_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	max.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB17_33:
+BB18_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB17_35;
+	@%p19 bra 	BB18_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2827,7 +2879,7 @@ BB17_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB17_35:
+BB18_35:
 	ret;
 }
 
@@ -2854,18 +2906,18 @@ BB17_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB18_5;
+	@%p1 bra 	BB19_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB18_4;
+	@%p2 bra 	BB19_4;
 
 	mov.u32 	%r10, %r1;
 
-BB18_3:
+BB19_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2875,15 +2927,15 @@ BB18_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB18_3;
+	@%p3 bra 	BB19_3;
 
-BB18_4:
+BB19_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB18_5:
+BB19_5:
 	ret;
 }
 
@@ -2911,9 +2963,9 @@ BB18_5:
 	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB19_4;
+	@%p1 bra 	BB20_4;
 
-BB19_1:
+BB20_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2922,23 +2974,23 @@ BB19_1:
 	min.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB19_3;
+	@%p2 bra 	BB20_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	min.f64 	%fd78, %fd78, %fd31;
 
-BB19_3:
+BB20_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB19_1;
+	@%p3 bra 	BB20_1;
 
-BB19_4:
+BB20_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2946,130 +2998,130 @@ BB19_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB19_8;
+	@%p4 bra 	BB20_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB19_7;
+	@%p5 bra 	BB20_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	min.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB19_7:
+BB20_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB19_8:
+BB20_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB19_12;
+	@%p6 bra 	BB20_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB19_11;
+	@%p7 bra 	BB20_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	min.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB19_11:
+BB20_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB19_12:
+BB20_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB19_16;
+	@%p8 bra 	BB20_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB19_15;
+	@%p9 bra 	BB20_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	min.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB19_15:
+BB20_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB19_16:
+BB20_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB19_20;
+	@%p10 bra 	BB20_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB19_19;
+	@%p11 bra 	BB20_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	min.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB19_19:
+BB20_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB19_20:
+BB20_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB19_33;
+	@%p12 bra 	BB20_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB19_23;
+	@%p13 bra 	BB20_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	min.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB19_23:
+BB20_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB19_25;
+	@%p14 bra 	BB20_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	min.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB19_25:
+BB20_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB19_27;
+	@%p15 bra 	BB20_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	min.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB19_27:
+BB20_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB19_29;
+	@%p16 bra 	BB20_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	min.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB19_29:
+BB20_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB19_31;
+	@%p17 bra 	BB20_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	min.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB19_31:
+BB20_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB19_33;
+	@%p18 bra 	BB20_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	min.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB19_33:
+BB20_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB19_35;
+	@%p19 bra 	BB20_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3077,7 +3129,7 @@ BB19_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB19_35:
+BB20_35:
 	ret;
 }
 
@@ -3101,17 +3153,17 @@ BB19_35:
 	ld.param.u32 	%r4, [reduce_row_min_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB20_35;
+	@%p1 bra 	BB21_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB20_4;
+	@%p2 bra 	BB21_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB20_3:
+BB21_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3121,9 +3173,9 @@ BB20_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB20_3;
+	@%p3 bra 	BB21_3;
 
-BB20_4:
+BB21_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3133,130 +3185,130 @@ BB20_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB20_8;
+	@%p4 bra 	BB21_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB20_7;
+	@%p5 bra 	BB21_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	min.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB20_7:
+BB21_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB20_8:
+BB21_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB20_12;
+	@%p6 bra 	BB21_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB20_11;
+	@%p7 bra 	BB21_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	min.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB20_11:
+BB21_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB20_12:
+BB21_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB20_16;
+	@%p8 bra 	BB21_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB20_15;
+	@%p9 bra 	BB21_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	min.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB20_15:
+BB21_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB20_16:
+BB21_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB20_20;
+	@%p10 bra 	BB21_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB20_19;
+	@%p11 bra 	BB21_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	min.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB20_19:
+BB21_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB20_20:
+BB21_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB20_33;
+	@%p12 bra 	BB21_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB20_23;
+	@%p13 bra 	BB21_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	min.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB20_23:
+BB21_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB20_25;
+	@%p14 bra 	BB21_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	min.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB20_25:
+BB21_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB20_27;
+	@%p15 bra 	BB21_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	min.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB20_27:
+BB21_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB20_29;
+	@%p16 bra 	BB21_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	min.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB20_29:
+BB21_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB20_31;
+	@%p17 bra 	BB21_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	min.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB20_31:
+BB21_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB20_33;
+	@%p18 bra 	BB21_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	min.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB20_33:
+BB21_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB20_35;
+	@%p19 bra 	BB21_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -3264,7 +3316,7 @@ BB20_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB20_35:
+BB21_35:
 	ret;
 }
 
@@ -3291,18 +3343,18 @@ BB20_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB21_5;
+	@%p1 bra 	BB22_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB21_4;
+	@%p2 bra 	BB22_4;
 
 	mov.u32 	%r10, %r1;
 
-BB21_3:
+BB22_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3312,15 +3364,15 @@ BB21_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB21_3;
+	@%p3 bra 	BB22_3;
 
-BB21_4:
+BB22_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB21_5:
+BB22_5:
 	ret;
 }
 
@@ -3348,9 +3400,9 @@ BB21_5:
 	mov.f64 	%fd76, 0d3FF0000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB22_4;
+	@%p1 bra 	BB23_4;
 
-BB22_1:
+BB23_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3359,23 +3411,23 @@ BB22_1:
 	mul.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB22_3;
+	@%p2 bra 	BB23_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	mul.f64 	%fd78, %fd78, %fd31;
 
-BB22_3:
+BB23_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB22_1;
+	@%p3 bra 	BB23_1;
 
-BB22_4:
+BB23_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3383,130 +3435,130 @@ BB22_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB22_8;
+	@%p4 bra 	BB23_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB22_7;
+	@%p5 bra 	BB23_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	mul.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB22_7:
+BB23_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB22_8:
+BB23_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB22_12;
+	@%p6 bra 	BB23_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB22_11;
+	@%p7 bra 	BB23_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	mul.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB22_11:
+BB23_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB22_12:
+BB23_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB22_16;
+	@%p8 bra 	BB23_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB22_15;
+	@%p9 bra 	BB23_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	mul.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB22_15:
+BB23_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB22_16:
+BB23_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB22_20;
+	@%p10 bra 	BB23_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB22_19;
+	@%p11 bra 	BB23_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	mul.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB22_19:
+BB23_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB22_20:
+BB23_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB22_33;
+	@%p12 bra 	BB23_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB22_23;
+	@%p13 bra 	BB23_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	mul.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB22_23:
+BB23_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB22_25;
+	@%p14 bra 	BB23_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	mul.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB22_25:
+BB23_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB22_27;
+	@%p15 bra 	BB23_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	mul.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB22_27:
+BB23_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB22_29;
+	@%p16 bra 	BB23_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	mul.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB22_29:
+BB23_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB22_31;
+	@%p17 bra 	BB23_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	mul.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB22_31:
+BB23_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB22_33;
+	@%p18 bra 	BB23_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	mul.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB22_33:
+BB23_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB22_35;
+	@%p19 bra 	BB23_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3514,7 +3566,7 @@ BB22_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB22_35:
+BB23_35:
 	ret;
 }
 
@@ -3538,17 +3590,17 @@ BB22_35:
 	ld.param.u32 	%r4, [reduce_row_mean_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB23_35;
+	@%p1 bra 	BB24_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd74, 0d0000000000000000;
 	mov.f64 	%fd75, %fd74;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB23_4;
+	@%p2 bra 	BB24_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB23_3:
+BB24_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3558,9 +3610,9 @@ BB23_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd74, %fd75;
-	@%p3 bra 	BB23_3;
+	@%p3 bra 	BB24_3;
 
-BB23_4:
+BB24_4:
 	mov.f64 	%fd72, %fd74;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3570,130 +3622,130 @@ BB23_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB23_8;
+	@%p4 bra 	BB24_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd73, %fd72;
-	@%p5 bra 	BB23_7;
+	@%p5 bra 	BB24_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd73, %fd72, %fd29;
 	st.shared.f64 	[%rd8], %fd73;
 
-BB23_7:
+BB24_7:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB23_8:
+BB24_8:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB23_12;
+	@%p6 bra 	BB24_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd71, %fd70;
-	@%p7 bra 	BB23_11;
+	@%p7 bra 	BB24_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd71, %fd70, %fd30;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB23_11:
+BB24_11:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB23_12:
+BB24_12:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB23_16;
+	@%p8 bra 	BB24_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd69, %fd68;
-	@%p9 bra 	BB23_15;
+	@%p9 bra 	BB24_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd69, %fd68, %fd31;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB23_15:
+BB24_15:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB23_16:
+BB24_16:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB23_20;
+	@%p10 bra 	BB24_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd67, %fd66;
-	@%p11 bra 	BB23_19;
+	@%p11 bra 	BB24_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd67, %fd66, %fd32;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB23_19:
+BB24_19:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB23_20:
+BB24_20:
 	mov.f64 	%fd65, %fd66;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB23_33;
+	@%p12 bra 	BB24_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB23_23;
+	@%p13 bra 	BB24_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd65, %fd65, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd65;
 
-BB23_23:
+BB24_23:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB23_25;
+	@%p14 bra 	BB24_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd64, %fd64, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd64;
 
-BB23_25:
+BB24_25:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB23_27;
+	@%p15 bra 	BB24_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd63, %fd63, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB23_27:
+BB24_27:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB23_29;
+	@%p16 bra 	BB24_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd62, %fd62, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB23_29:
+BB24_29:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB23_31;
+	@%p17 bra 	BB24_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd61, %fd61, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB23_31:
+BB24_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB23_33;
+	@%p18 bra 	BB24_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd61, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB23_33:
+BB24_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB23_35;
+	@%p19 bra 	BB24_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvt.u64.u32	%rd39, %r4;
@@ -3704,7 +3756,7 @@ BB23_33:
 	add.s64 	%rd42, %rd40, %rd41;
 	st.global.f64 	[%rd42], %fd42;
 
-BB23_35:
+BB24_35:
 	ret;
 }
 
@@ -3731,18 +3783,18 @@ BB23_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB24_5;
+	@%p1 bra 	BB25_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd10, 0d0000000000000000;
 	mov.f64 	%fd11, %fd10;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB24_4;
+	@%p2 bra 	BB25_4;
 
 	mov.u32 	%r10, %r1;
 
-BB24_3:
+BB25_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3752,9 +3804,9 @@ BB24_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd10, %fd11;
-	@%p3 bra 	BB24_3;
+	@%p3 bra 	BB25_3;
 
-BB24_4:
+BB25_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.u64.u32	%rd7, %r5;
 	cvt.rn.f64.s64	%fd7, %rd7;
@@ -3763,7 +3815,7 @@ BB24_4:
 	add.s64 	%rd9, %rd6, %rd8;
 	st.global.f64 	[%rd9], %fd8;
 
-BB24_5:
+BB25_5:
 	ret;
 }
 
@@ -3789,7 +3841,7 @@ BB24_5:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r6, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB25_5;
+	@%p1 bra 	BB26_5;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -3849,13 +3901,13 @@ BB24_5:
 	mov.b32 	 %f2, %r11;
 	abs.f32 	%f1, %f2;
 	setp.lt.f32	%p2, %f1, 0f4086232B;
-	@%p2 bra 	BB25_4;
+	@%p2 bra 	BB26_4;
 
 	setp.lt.f64	%p3, %fd1, 0d0000000000000000;
 	add.f64 	%fd37, %fd1, 0d7FF0000000000000;
 	selp.f64	%fd40, 0d0000000000000000, %fd37, %p3;
 	setp.geu.f32	%p4, %f1, 0f40874800;
-	@%p4 bra 	BB25_4;
+	@%p4 bra 	BB26_4;
 
 	shr.u32 	%r12, %r2, 31;
 	add.s32 	%r13, %r2, %r12;
@@ -3870,13 +3922,13 @@ BB24_5:
 	mov.b64 	%fd39, {%r20, %r19};
 	mul.f64 	%fd40, %fd38, %fd39;
 
-BB25_4:
+BB26_4:
 	cvta.to.global.u64 	%rd7, %rd

<TRUNCATED>

[5/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Posted by ni...@apache.org.
[SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Design doc: Memory estimation of GPU operators

- Since not all operators are supported on GPU, isGPUEnabled indicates
whether an operation is enabled for GPU. This method does not take into
account any memory estimates.
- To simplify the memory estimation logic, the methods computeOutputMemEstimate
and computeIntermediateMemEstimate should return the maximum of the memory
required by the GPU and CP operators (see the sketch after this list).
- Additionally, these methods are guarded so that when the -gpu flag is not
provided, the additional memory overhead due to GPU is ignored. For example:
sparse-to-dense conversion on GPU.
- (WIP) Every GPU operator should respect the memory returned by
computeIntermediateMemEstimate (and computeOutputMemEstimate - see the next
point).
- (WIP) Every GPU operator should create its output in the same format as the
corresponding CP operator. That is, computeOutputMemEstimate is
consistent across both CP and GPU in terms of the worst case.
- The drawbacks of using the maximum memory (mem = Math.max(mem_gpu, mem_cpu))
are:
a. the GPU operator is not selected when mem_gpu < total memory available on GPU
< mem
b. the CP operator is not selected (i.e. a distributed operator is compiled) when
mem_cpu < driver memory budget < mem
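
A minimal sketch of the estimation rule above, with hypothetical helper
names (the real hooks live in the Hop subclasses, e.g. ConvolutionOp):

  protected long computeOutputMemEstimate(long dim1, long dim2, long nnz) {
    // worst-case CP estimate (dense or sparse, depending on the operator)
    long memCP = estimateCPOutputMem(dim1, dim2, nnz);   // hypothetical helper
    // without the -gpu flag, GPU-specific overheads are ignored entirely
    if (!DMLScript.USE_ACCELERATOR || !isGPUEnabled())
      return memCP;
    // the GPU estimate may add overhead, e.g. sparse-to-dense conversion
    long memGPU = estimateGPUOutputMem(dim1, dim2, nnz); // hypothetical helper
    return Math.max(memCP, memGPU);                      // worst case wins
  }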

Closes #650.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/772d9302
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/772d9302
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/772d9302

Branch: refs/heads/master
Commit: 772d9302dc196b047134ea491542d55113f52a08
Parents: a0cf8e3
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 7 11:49:52 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 7 12:49:52 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   19 +
 src/main/cpp/kernels/SystemML.ptx               | 2757 +++++++++---------
 .../org/apache/sysml/hops/ConvolutionOp.java    |  458 ++-
 src/main/java/org/apache/sysml/hops/Hop.java    |   63 +-
 .../apache/sysml/lops/ConvolutionTransform.java |   11 +-
 .../cp/ConvolutionCPInstruction.java            |   91 +-
 .../gpu/ConvolutionGPUInstruction.java          |   72 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |    3 +-
 .../instructions/gpu/context/GPUContext.java    |    6 +
 .../matrix/data/ConvolutionParameters.java      |   25 +
 .../runtime/matrix/data/LibMatrixCUDA.java      | 1041 +------
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 1219 ++++++++
 12 files changed, 3229 insertions(+), 2536 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index d64d8aa..bb6482d 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -156,6 +156,25 @@ __global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, i
 	}
 }
 
+/**
+ * Performs inplace addition: ret += input
+ *
+ * @param input rhs input array allocated on the GPU
+ * @param ret the input and output array allocated on the GPU
+ * @param rlen the number of rows
+ * @param clen the number of columns
+ */
+extern "C"
+__global__ void inplace_add(double* input,  double* ret, int rlen, int clen) {
+	int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	int ix = tid / clen;
+	int iy = tid % clen;
+	if(ix < rlen && iy < clen) {
+		int index = ix * clen + iy;
+		ret[index] += input[index];
+	}
+}
+
 // Performs the operation corresponding to the DML script:
 // ones = matrix(1, rows=1, cols=Hout*Wout)
 // output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
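
The new inplace_add kernel assigns one thread per element of a row-major
rlen x clen buffer. As a rough sketch, it would be launched from the Java
side like other simple element-wise kernels in LibMatrixCUDA (assuming the
JCudaKernels.launchKernel pattern used for relu_backward, with gCtx and
device pointers input/ret in scope):

  // one thread per cell of the rlen x clen output; ret is updated in place
  getCudaKernels(gCtx).launchKernel("inplace_add",
      ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
      input, ret, rlen, clen);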


[2/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 09ffe9f..a362364 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -21,37 +21,6 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
-import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
-import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
-import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
@@ -116,7 +85,6 @@ import org.apache.sysml.runtime.util.IndexRange;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
-import jcuda.CudaException;
 import jcuda.Pointer;
 import jcuda.Sizeof;
 import jcuda.jcublas.JCublas2;
@@ -125,15 +93,6 @@ import jcuda.jcublas.cublasFillMode;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcublas.cublasOperation;
 import jcuda.jcublas.cublasSideMode;
-import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnBatchNormMode;
-import jcuda.jcudnn.cudnnConvolutionDescriptor;
-import jcuda.jcudnn.cudnnConvolutionFwdPreference;
-import jcuda.jcudnn.cudnnFilterDescriptor;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcudnn.cudnnPoolingDescriptor;
-import jcuda.jcudnn.cudnnStatus;
-import jcuda.jcudnn.cudnnTensorDescriptor;
 import jcuda.jcusolver.JCusolverDn;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseAction;
@@ -155,6 +114,10 @@ public class LibMatrixCUDA {
 	private static int _MAX_THREADS = -1;
 	private static int _MAX_BLOCKS  = -1;
 	private static int _WARP_SIZE 	= -1;
+	
+	// From CuDNN 5.1 documentation:
+	// The total size of a tensor including the potential padding between dimensions is limited to 2 Giga-elements of type datatype.
+	protected static long maxNumDoublesOfCuDNNTensor = 2000000000;
 
 	//********************************************************************/
 	//***************************** UTILS ********************************/
@@ -220,11 +183,7 @@ public class LibMatrixCUDA {
 		return gCtx.getCublasHandle();
 	}
 
-	private static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
-		return gCtx.getCudnnHandle();
-	}
-
-	private static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
+	protected static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getKernels();
 	}
 
@@ -237,17 +196,13 @@ public class LibMatrixCUDA {
 	//***************** DEEP LEARNING Operators **************************/
 	//********************************************************************/
 
-
-
-	private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-
 	private static Pointer _one;
 	private static Pointer _zero;
 	/**
 	 * Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	private static Pointer one() {
+	protected static Pointer one() {
 		if(_one == null) {
 			_one = pointerTo(1.0);
 		}
@@ -257,7 +212,7 @@ public class LibMatrixCUDA {
 	 * Convenience method to get a pointer to value '0.0f' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	private static Pointer zero() {
+	protected static Pointer zero() {
 		if(_zero == null) {
 			_zero = pointerTo(0.0f);
 		}
@@ -265,56 +220,6 @@ public class LibMatrixCUDA {
 	}
 
 	/**
-	 * Convenience method to get tensor descriptor from underlying GPUObject
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param mat matrix object
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param H height
-	 * @param W width
-	 * @return cudnn tensor descriptor
-	 * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
-	 */
-	private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
-		if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
-			throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
-					+ " || " + mat.getNumColumns() + " != " + (C*H*W));
-		}
-		return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
-	}
-
-	/**
-	 * Convenience method to get tensor descriptor
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param H height
-	 * @param W width
-	 * @return cudnn tensor descriptor
-	 * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
-	 */
-	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
-		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
-		cudnnCreateTensorDescriptor(tensorDescriptor);
-		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
-		return tensorDescriptor;
-	}
-
-	/**
-	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
-	 * @param gCtx a valid {@link GPUContext}
-	 * @param image input matrix object
-	 * @param isForCuDNN true if the dense pointer is to be used by a CuDNN kernel
-	 * @return jcuda pointer
-	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
-	 */
-	private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
-		if(isForCuDNN && image.getNumRows()*image.getNumColumns() > numDoublesIn2GB) {
-			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
-		}
-		return getDensePointer(gCtx, image, instName);
-	}
-
-	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 * @param gCtx a valid {@link GPUContext}
 	 * @param input input matrix object
@@ -322,7 +227,7 @@ public class LibMatrixCUDA {
 	 * @return jcuda pointer
 	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
 	 */
-	private static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
+	protected static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
 		if(isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).sparseToDense(instName);
 		}
@@ -337,222 +242,17 @@ public class LibMatrixCUDA {
 	 * @return a sparse matrix pointer
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
+	protected static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
 		if(!isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).denseToSparse();
 		}
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
-
-	/**
-	 * Convenience method for checking the status of CuDNN kernel.
-	 *
-	 * @param status status returned by CuDNN
-	 * @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
-	 */
-	private static void checkStatus(int status) throws DMLRuntimeException {
-		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
-			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
-	}
-
-	/**
-	 * Does a 2D convolution followed by a bias_add
-	 *
-	 * @param gCtx     a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image    input image matrix object
-	 * @param bias     bias matrix object
-	 * @param filter   filter matrix object
-	 * @param output   output matrix object
-	 * @param N        number of input images
-	 * @param C        number of channels
-	 * @param H        height of each image
-	 * @param W        width of each image
-	 * @param K        number of output "channels"
-	 * @param R        height of filter
-	 * @param S        width of filter
-	 * @param pad_h    padding height
-	 * @param pad_w    padding width
-	 * @param stride_h stride height
-	 * @param stride_w string width
-	 * @param P        output height
-	 * @param Q        output width
-	 * @throws DMLRuntimeException if error
-	 */
-	public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject output, int N, int C, int H, int W,
-			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
-					throws DMLRuntimeException {
-		/*
-		int rows = (int) output.getNumRows();
-		int cols = (int) output.getNumColumns();
-		long size  = rows * cols * Sizeof.DOUBLE;
-
-		Pointer imagePointer = getDensePointer(image, instName);
-		Pointer biasPointer = getDensePointer(bias, instName);
-		Pointer outputPointer = getDensePointer(output, instName);
-		Pointer filterPointer = getDensePointer(filter, instName);
-
-		Pointer tmp = allocate(size);
-
-		conv2d(instName, imagePointer, filterPointer, tmp, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		cudaDeviceSynchronize();
-
-		long k1 = bias.getNumColumns();
-		if(k1 != bias.getNumColumns() || bias.getNumColumns() != 1 || cols % k1 != 0) {
-			throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
-		}
-		// biasAdd(instName, output, bias, output);
-		biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1);
-
-		cudaFreeHelper(tmp);
-		 */
-		LOG.trace("GPU : conv2dBiasAdd" + ", GPUContext=" + gCtx);
-		conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		//cudaDeviceSynchronize;
-		biasAdd(gCtx, instName, output, bias, output);
-	}
-
-	public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
-			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
-					throws DMLRuntimeException {
-		Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
-		Pointer filterPointer = getDensePointer(gCtx, filter, true, instName);
-		Pointer dstPointer = getDensePointer(gCtx, outputBlock, true, instName);
-
-		conv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-	}
-
-	/**
-	 * Performs 2D convolution
-	 * Takes up an insignificant amount of intermediate space when CONVOLUTION_PREFERENCE is set to CUDNN_CONVOLUTION_FWD_NO_WORKSPACE
-	 * Intermediate space is required by the filter descriptor and convolution descriptor which are metadata structures and don't scale with the size of the input
-	 *
-	 * @param gCtx     a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image    the input matrix (or image) allocated on the GPU
-	 * @param filter   the filter allocated on the GPU
-	 * @param output   the output matrix allocated on the GPU
-	 * @param N        number of input images
-	 * @param C        number of channels
-	 * @param H        height of each image
-	 * @param W        width of each image
-	 * @param K        number of output "channels"
-	 * @param R        height of filter
-	 * @param S        width of filter
-	 * @param pad_h    padding height
-	 * @param pad_w    padding width
-	 * @param stride_h stride height
-	 * @param stride_w string width
-	 * @param P        output height
-	 * @param Q        output width
-	 * @throws DMLRuntimeException if error
-	 */
-	public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
-			int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
-					throws DMLRuntimeException {
-		LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
-		cudnnFilterDescriptor filterDesc = null;
-		cudnnConvolutionDescriptor convDesc = null;
-		Pointer workSpace = null;
-		long sizeInBytes = 0;
-		try {
-			long t1 = 0, t2 = 0;
-			// Allocate descriptors
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
-			cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
-			filterDesc = allocateFilterDescriptor(K, C, R, S);
-
-			int padding[] = {pad_h, pad_w};
-			int strides[] = {stride_h, stride_w};
-			convDesc = allocateConvolutionDescriptor(padding, strides);
-
-			// Select the best algorithm depending on the data and supported CUDA
-
-			int algo = -1;
-			workSpace = new Pointer();
-
-			if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
-				algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
-				int[] algos = {-1};
-				// TODO: Look into FFt, Winograd, etc
-				// Also ensure that GPU has enough memory to allocate memory
-				long sizeInBytesArray[] = {0};
-				jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
-						CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
-				cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray);
-				if (sizeInBytesArray[0] != 0)
-					workSpace = gCtx.allocate(sizeInBytesArray[0]);
-				sizeInBytes = sizeInBytesArray[0];
-			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
-				throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
-			} else {
-				throw new DMLRuntimeException("Unsupported preference criteria for convolution");
-			}
-			if (GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
-					srcTensorDesc, image,
-					filterDesc, filter,
-					convDesc, algo, workSpace, sizeInBytes, zero(),
-					dstTensorDesc, output);
-			if (GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
-			if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
-			}
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		} finally {
-			long t3 = 0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-			if (filterDesc != null)
-				cudnnDestroyFilterDescriptor(filterDesc);
-			if (convDesc != null)
-				cudnnDestroyConvolutionDescriptor(convDesc);
-			if (workSpace != null && sizeInBytes != 0)
-				gCtx.cudaFreeHelper(instName, workSpace);
-			if (GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-		}
-	}
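
For reference, the output spatial dimensions P and Q that the conv2d methods above expect follow the standard arithmetic for a zero-padded, strided convolution. A minimal sketch of a helper (hypothetical, not part of this patch) that computes them:

    // Hypothetical helper: standard output-size arithmetic for a padded,
    // strided convolution, e.g. P = convOutputDim(H, R, pad_h, stride_h)
    // and Q = convOutputDim(W, S, pad_w, stride_w).
    static int convOutputDim(int inputDim, int filterDim, int pad, int stride) {
        return (inputDim + 2 * pad - filterDim) / stride + 1;
    }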
-
-	private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
-		cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
-		cudnnCreateConvolutionDescriptor(convDesc);
-		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
-		return convDesc;
-	}
-
-	private static Pointer pointerTo(double value) {
+	
+	protected static Pointer pointerTo(double value) {
 		return Pointer.to(new double[] { value });
 	}
-
-	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
-		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
-		cudnnCreateFilterDescriptor(filterDesc);
-		cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, R, S);
-		return filterDesc;
-	}
-
-	/**
-	 * Allocates a pooling descriptor, used in poolingForward and poolingBackward
-	 * @param R			pooling window height
-	 * @param S			pooling window width
-	 * @param pad_h		vertical padding
-	 * @param pad_w		horizontal padding
-	 * @param stride_h	pooling vertical stride
-	 * @param stride_w	pooling horizontal stride
-	 * @return cudnn pooling descriptor
-	 */
-	private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
-		cudnnPoolingDescriptor poolingDesc = new cudnnPoolingDescriptor();
-		cudnnCreatePoolingDescriptor(poolingDesc);
-		cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, R, S, pad_h, pad_w, stride_h, stride_w);
-		return poolingDesc;
-	}
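
Note that filter, convolution, and pooling descriptors wrap native cuDNN resources and are not reclaimed by the garbage collector, so each allocate call above must be paired with the matching destroy call. A minimal sketch of the pattern the callers in this file follow:

    cudnnPoolingDescriptor poolingDesc =
        allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
    try {
        // ... cudnnPoolingForward / cudnnPoolingBackward calls ...
    } finally {
        // always release the native descriptor, even on error paths
        cudnnDestroyPoolingDescriptor(poolingDesc);
    }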
+	
 
 	/**
 	 * This method computes the backpropagation errors for the previous layer of the relu operation
@@ -669,598 +369,7 @@ public class LibMatrixCUDA {
 				image, bias, output, rows, cols, PQ);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 	}
-
-	private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
-		if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
-			throw new DMLRuntimeException("Incorrect dimensions for scale");
-		}
-		if(bias.getNumRows() != 1 || bias.getNumColumns() != C) {
-			throw new DMLRuntimeException("Incorrect dimensions for bias");
-		}
-		if(runningMean.getNumRows() != 1 || runningMean.getNumColumns() != C) {
-			throw new DMLRuntimeException("Incorrect dimensions for running mean");
-		}
-		if(runningVar.getNumRows() != 1 || runningVar.getNumColumns() != C) {
-			throw new DMLRuntimeException("Incorrect dimensions for running variance");
-		}
-	}
-
-	/**
-	 * Performs the forward BatchNormalization layer computation for inference
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName name of the instruction
-	 * @param image input image
-	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
-	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
-	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
-	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
-	 * @param ret normalized input
-	 * @param epsilon epsilon value used in the batch normalization formula
-	 * @throws DMLRuntimeException if error occurs
-	 */
-	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
-			MatrixObject ret, double epsilon) throws DMLRuntimeException {
-		LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-		int N = toInt(image.getNumRows());
-		int C = toInt(scale.getNumColumns());
-		long CHW = image.getNumColumns();
-		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-
-		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
-				new MatrixObject[] {image},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
-		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
-		Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
-		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
-		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
-		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-
-		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
-				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
-				scaleTensorDesc, scalePtr, biasPtr,
-				runningMeanPtr, runningVarPtr, epsilon));
-	}
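
For reference, cudnnBatchNormalizationForwardInference applies the standard per-channel batch-normalization transform using the accumulated statistics:

    // per element x of channel c:
    // y = scale[c] * (x - runningMean[c]) / sqrt(runningVar[c] + epsilon) + bias[c]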
-
-	/**
-	 * Performs the forward BatchNormalization layer computation for training
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName name of the instruction
-	 * @param image input image
-	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
-	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
-	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
-	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
-	 * @param ret (output) normalized input
-	 * @param retRunningMean (output) running mean accumulated during training phase: shape [1, C, 1, 1]
-	 * @param retRunningVar (output) running variance accumulated during training phase: shape [1, C, 1, 1]
-	 * @param epsilon epsilon value used in the batch normalization formula
-	 * @param exponentialAverageFactor factor used in the moving average computation
-	 * @throws DMLRuntimeException if error occurs
-	 */
-	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
-			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
-		LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-		int N = toInt(image.getNumRows());
-		int C = toInt(scale.getNumColumns());
-		long CHW = image.getNumColumns();
-		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-
-		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
-				new MatrixObject[] {image},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
-		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
-		Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
-		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
-		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
-		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-
-		// To allow for copy-on-write
-		Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
-		Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
-		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-
-		// ignoring resultSaveMean and resultSaveVariance as they require state management
-		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
-				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
-				scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
-				retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
-	}
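
The exponentialAverageFactor passed above controls cuDNN's moving-average update of the running statistics; per the cuDNN documentation, for each channel c:

    runningMean[c] = (1 - factor) * runningMean[c] + factor * batchMean[c]
    runningVar[c]  = (1 - factor) * runningVar[c]  + factor * batchVar[c]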
-
-	/**
-	 * Convenience utility for batch normalization that returns an NCHW descriptor
-	 * @param gCtx a valid {@link GPUContext}
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param CHW channels*height*width
-	 * @param input input matrix objects
-	 * @param output output matrix objects
-	 * @return one of the NCHW descriptors
-	 * @throws DMLRuntimeException if error occurs
-	 */
-	private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
-		cudnnTensorDescriptor ret  = null; // Return any one
-		if(CHW > ((long)Integer.MAX_VALUE)*C) {
-			throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
-		}
-		cudnnTensorDescriptor knownNCHWdescriptor = null;
-		int H = -1; int W = -1;
-		for(int i = 0; i < input.length; i++) {
-			knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
-			if(knownNCHWdescriptor != null) {
-				int [] shape = input[i].getGPUObject(gCtx).getTensorShape();
-				if(shape[0] != N || shape[1] != C) {
-					throw new DMLRuntimeException("Incorrect N and C:" + shape[0]  + " != " + N + " || " + shape[1]  + " != " +  C);
-				}
-				H = shape[2];
-				W = shape[3];
-				break;
-			}
-		}
-		if(knownNCHWdescriptor != null) {
-			// We precisely know N, C, H, W
-			for(int i = 0; i < input.length; i++) {
-				ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
-			}
-			for(int i = 0; i < output.length; i++) {
-				ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
-			}
-		}
-		else {
-			int HW = (int) (CHW / C);
-			H = HW; W = 1; // If not known
-			double potentialH = Math.sqrt(HW);
-			if(potentialH == ((int) potentialH)) {
-				H = (int) potentialH;
-				W = H;
-			}
-			// We are not sure about H and W, hence don't allocate them.
-			ret = new cudnnTensorDescriptor();
-			cudnnCreateTensorDescriptor(ret);
-			cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
-		}
-		return ret;
-	}
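
A worked example of the fallback branch above (illustrative numbers only):

    CHW = 3072, C = 3  ->  HW = 1024, sqrt(1024) = 32 is integral  ->  H = 32, W = 32
    CHW =   36, C = 3  ->  HW =   12, sqrt(12) ~ 3.46 not integral ->  H = 12, W =  1

Since spatial batch normalization reduces over N, H, and W per channel, any factorization with H*W == HW yields the same result, which is why guessing H and W is safe here.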
-
-	/**
-	 * This method computes the backpropagation errors for image, scale and bias of batch normalization layer
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName name of the instruction
-	 * @param image input image
-	 * @param dout input errors of shape C, H, W
-	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
-	 * @param ret (output) backpropagation errors for previous layer
-	 * @param retScale backpropagation error for scale
-	 * @param retBias backpropagation error for bias
-	 * @param epsilon epsilon value used in the batch normalization formula
-	 * @throws DMLRuntimeException if error occurs
-	 */
-	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-			MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
-			double epsilon) throws DMLRuntimeException {
-		LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-		int N = toInt(image.getNumRows());
-		int C = toInt(scale.getNumColumns());
-		long CHW = image.getNumColumns();
-
-		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
-				new MatrixObject[] {image, dout},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
-		Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
-		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
-		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
-		Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName);
-		Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName);
-
-		// ignoring resultSaveMean and resultSaveVariance as they require state management
-		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), zero(), one(), zero(),
-				nCHWDescriptor,  imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
-				scaleTensorDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer()));
-	}
-
-
-	/**
-	 * This method computes the backpropagation errors for the filter of the convolution operation
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image input image
-	 * @param dout errors from next layer
-	 * @param outputBlock  output errors
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param H height
-	 * @param W width
-	 * @param K number of filters
-	 * @param R filter height
-	 * @param S filter width
-	 * @param pad_h pad height
-	 * @param pad_w pad width
-	 * @param stride_h stride height
-	 * @param stride_w stride width
-	 * @param P output activation height
-	 * @param Q output activation width
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-		LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
-		cudnnFilterDescriptor dwDesc = null;
-		cudnnConvolutionDescriptor convDesc = null;
-
-		Pointer workSpace = null;
-		long sizeInBytes = 0;
-		try {
-
-			long t1 = 0, t2 = 0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			// Allocate descriptors
-			cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-			cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
-			dwDesc = allocateFilterDescriptor(K, C, R, S);
-
-			// Allocate data
-			Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
-			Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
-			Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
-			int padding[] = {pad_h, pad_w};
-			int strides[] = {stride_h, stride_w};
-			convDesc = allocateConvolutionDescriptor(padding, strides);
-			long sizeInBytesArray[] = {0};
-
-			// TODO: Select the best algorithm depending on the data and supported CUDA
-			int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-
-			workSpace = new Pointer();
-			cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
-					xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
-			if (GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
-					doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
-			if (GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
-
-			if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
-			}
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in conv2dBackwardFilter in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		} finally {
-			long t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-
-			if(workSpace != null && sizeInBytes != 0)
-				gCtx.cudaFreeHelper(instName, workSpace);
-			if(dwDesc != null)
-				cudnnDestroyFilterDescriptor(dwDesc);
-
-			if(convDesc != null)
-				cudnnDestroyConvolutionDescriptor(convDesc);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-		}
-	}
-
-	private static long numDoublesIn2GB = 268435456;
-
-	/**
-	 * This method computes the backpropagation errors for the previous layer of the convolution operation
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param filter filter used in conv2d
-	 * @param dout errors from next layer
-	 * @param output  output errors
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param H height
-	 * @param W width
-	 * @param K number of filters
-	 * @param R filter height
-	 * @param S filter width
-	 * @param pad_h pad height
-	 * @param pad_w pad width
-	 * @param stride_h stride height
-	 * @param stride_w stride width
-	 * @param P output activation height
-	 * @param Q output activation width
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
-			MatrixObject output, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-		LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
-		cudnnFilterDescriptor wDesc = null;
-		cudnnConvolutionDescriptor convDesc = null;
-
-		Pointer workSpace = null;
-		long sizeInBytes = 0;
-		try {
-			long t1=0, t2=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			// Allocate descriptors
-			wDesc = allocateFilterDescriptor(K, C, R, S);
-			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
-			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
-
-			// Allocate data
-			Pointer w = getDensePointer(gCtx, filter, true, instName);
-			Pointer dy = getDensePointer(gCtx, dout, true, instName);
-			Pointer dx = getDensePointer(gCtx, output, true, instName);
-
-			int padding [] = { pad_h, pad_w };
-			int strides [] = { stride_h, stride_w };
-			convDesc = allocateConvolutionDescriptor(padding, strides);
-			long sizeInBytesArray[] = { 0 };
-
-			// TODO: Select the best algorithm depending on the data and supported CUDA
-			int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-			workSpace = new Pointer();
-			cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
-					wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
-					dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
-			}
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in conv2dBackwardData in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		}
-		finally {
-			long t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-
-			if(workSpace != null && sizeInBytes != 0)
-				gCtx.cudaFreeHelper(instName, workSpace);
-			if(wDesc != null)
-				cudnnDestroyFilterDescriptor(wDesc);
-			if(convDesc != null)
-				cudnnDestroyConvolutionDescriptor(convDesc);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-		}
-	}
-
-	/**
-	 * Performs maxpooling on the GPU using cudnnPoolingForward(...)
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image image as matrix object
-	 * @param outputBlock output matrix
-	 * @param N				batch size
-	 * @param C				number of channels
-	 * @param H				height of image
-	 * @param W				width of image
-	 * @param K				number of filters
-	 * @param R				height of filter
-	 * @param S				width of filter
-	 * @param pad_h			vertical padding
-	 * @param pad_w			horizontal padding
-	 * @param stride_h		vertical stride
-	 * @param stride_w		horizontal stride
-	 * @param P				(H + 2*pad_h - R)/stride_h + 1
-	 * @param Q				(W + 2*pad_w - S)/stride_w + 1
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void maxpooling(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-		Pointer x = getDensePointer(gCtx, image, true, instName);
-		cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-		performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-	}
-
-	public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-		LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
-		Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
-		cudnnPoolingDescriptor poolingDesc = null;
-
-		try {
-			long t1=0,t2=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			// Allocate descriptors
-			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, P, Q);
-			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
-			}
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in performMaxpooling in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		}
-		finally {
-			long t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-			if(poolingDesc != null)
-				cudnnDestroyPoolingDescriptor(poolingDesc);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-		}
-	}
-
-	/**
-	 * Performs maxpoolingBackward on the GPU using cudnnPoolingBackward(...)
-	 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image image as matrix object
-	 * @param dout			delta matrix, output of previous layer
-	 * @param outputBlock output matrix
-	 * @param N				batch size
-	 * @param C				number of channels
-	 * @param H				height of image
-	 * @param W				width of image
-	 * @param K				number of filters
-	 * @param R				height of filter
-	 * @param S				width of filter
-	 * @param pad_h			vertical padding
-	 * @param pad_w			horizontal padding
-	 * @param stride_h		vertical stride
-	 * @param stride_w		horizontal stride
-	 * @param P				(H + 2*pad_h - R)/stride_h + 1
-	 * @param Q				(W + 2*pad_w - S)/stride_w + 1
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-		LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
-		Pointer y = null;
-		cudnnPoolingDescriptor poolingDesc = null;
-
-		try {
-			long t1=0, t2=0, t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			// Allocate descriptors
-			cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
-			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
-			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
-
-			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
-
-			// Calling PoolForward first, y is one of the inputs for poolBackward
-			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-			long numBytes = N*C*P*Q*Sizeof.DOUBLE;
-			y = gCtx.allocate(numBytes);
-
-			// Allocate data
-			Pointer x = getDensePointer(gCtx, image, true, instName);
-			Pointer dx = getDensePointer(gCtx, outputBlock, true, instName);
-			Pointer dy = getDensePointer(gCtx, dout, true, instName);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
-			}
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-			status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not execute cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
-			}
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in maxpoolingBackward in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		}
-		finally {
-			long t4=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
-
-			if(y != null)
-				gCtx.cudaFreeHelper(instName, y);
-			if(poolingDesc != null)
-				cudnnDestroyPoolingDescriptor(poolingDesc);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
-		}
-	}
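
The recomputed y buffer above is a real cost. For example (hypothetical sizes), with N = 64, C = 64, P = Q = 112:

    long numBytes = 64L * 64 * 112 * 112 * Sizeof.DOUBLE; // N*C*P*Q doubles
    // = 411,041,792 bytes (~392 MB) of temporary device memory

which is why the TODO above suggests making y available from the language level instead of recomputing it via cudnnPoolingForward.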
-
-	private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
-		long t0=0;
-		try {
-			LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
-			cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
-
-			Pointer srcData = getDensePointer(gCtx, in, true, instName);
-			cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
-			cudnnCreateActivationDescriptor(activationDescriptor);
-			double dummy = -1;
-			cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
-			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
-					one(), srcTensorDesc, srcData,
-					zero(), dstTensorDesc, dstData);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
-		} catch (CudaException e) {
-			throw new DMLRuntimeException("Error in performCuDNNReLU in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
-		}
-		finally {
-			long t1=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
-		}
-	}
-
-
-	/**
-	 * Performs the relu operation on the GPU.
-	 * @param ec currently active {@link ExecutionContext}
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param in input matrix
-	 * @param outputName	name of the output matrix
-	 * @throws DMLRuntimeException	if an error occurs
-	 */
-	public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
-		if (ec.getGPUContext(0) != gCtx)
-			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
-		long N = in.getNumRows();
-		long CHW = in.getNumColumns();
-		MatrixObject output = ec.getMatrixObject(outputName);
-		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
-		long t0=0;
-		cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
-		if(N*CHW >= numDoublesIn2GB ||  srcTensorDesc == null) {
-			LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
-			// Invokes relu(double* A,  double* ret, int rlen, int clen)
-			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			Pointer dstData = getDensePointer(gCtx, output, instName);
-			Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
-			getCudaKernels(gCtx).launchKernel("relu",
-					ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
-					srcData, dstData, toInt(N), toInt(CHW));
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
-		}
-		else {
-			performCuDNNReLU(gCtx, instName, in, getDensePointer(gCtx, output, true, instName), srcTensorDesc);
-		}
-	}
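
The numDoublesIn2GB threshold used above equals 2^28 doubles, i.e. 2^28 * 8 bytes = 2 GiB. Inputs at or beyond that size, or without a cached tensor descriptor, bypass cuDNN in favor of the custom kernel; a sketch of the dispatch condition:

    // 268435456 = 2^28 doubles; at 8 bytes per double this is 2^31 bytes = 2 GiB
    boolean useCustomKernel = (N * CHW >= numDoublesIn2GB) || (srcTensorDesc == null);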
-
-
+	
 
 	//********************************************************************/
 	//************* End of DEEP LEARNING Operators ***********************/
@@ -2814,28 +1923,6 @@ public class LibMatrixCUDA {
 		deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), (int)src.getNumColumns());
 	}
 
-	@SuppressWarnings("unused")
-	private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal,  double tolerance,
-			double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
-		if (ec.getGPUContext(0) != gCtx)
-			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
-		Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse kernel
-		MatrixObject out = ec.getMatrixObject(outputName);
-		int rlen = toInt(out.getNumRows());
-		int clen = toInt(out.getNumColumns());
-		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlen, clen);	// Allocated the dense output matrix
-		Pointer ret = getDensePointer(gCtx, out, instName);
-
-		// out.getMatrixCharacteristics().setNonZeros(rlen*clen);
-		// compareAndSet(double* A,  double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal)
-		long t0=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		getCudaKernels(gCtx).launchKernel("compare_and_set",
-				ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
-				A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0);
-	}
-
 	/**
 	 * Fills an array on the GPU with a given scalar value
 	 * @param ec					currently active instance of the {@link ExecutionContext}
@@ -3075,7 +2162,7 @@ public class LibMatrixCUDA {
 	//******************* End of Re-org Functions ************************/
 	//********************************************************************/
 
-	private static int toInt(long num) throws DMLRuntimeException {
+	protected static int toInt(long num) throws DMLRuntimeException {
 		if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
 			throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
 		}
@@ -3115,21 +2202,13 @@ public class LibMatrixCUDA {
 					+ in1.getNumColumns() + "]");
 		}
 
-		int len1 = toInt(in1.getNumColumns());
-		int len2 = toInt(ec.getMatrixObject(outputName).getNumColumns());
+		
 		if(isInSparseFormat(gCtx, in1)) {
 			// Input in1 is in sparse format and output is in dense format
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
 			CSRPointer inPointer = getSparsePointer(gCtx, in1, instName);
 			Pointer outPointer = getDensePointer(gCtx, out, instName);
-			int size = ru - rl + 1;
-			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			// Performs a slice operation where the input matrix is sparse and the output matrix is dense.
-			// This function avoids unnecessary sparse to dense conversion of the input matrix.
-			// We can generalize this later to output sparse matrix.
-			getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size),
-					inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0);
+			sliceSparseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu);
 		}
 		else {
 			// Input in1 is in dense format (see inPointer)
@@ -3137,18 +2216,64 @@ public class LibMatrixCUDA {
 
 			Pointer inPointer = getDensePointer(gCtx, in1, instName);
 			Pointer outPointer = getDensePointer(gCtx, out, instName);
-			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			if (len1 == len2) {
-				cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1
-						* Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-			} else {
-				for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) {
-					cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE),
-							inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-				}
+			int len1 = toInt(in1.getNumColumns());
+			int len2 = toInt(ec.getMatrixObject(outputName).getNumColumns());
+			sliceDenseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1, len2);
+		}
+	}
+	
+	/**
+	 * Performs a slice operation on a dense input and writes the result in dense format
+	 * 
+	 * @param gCtx gpu context
+	 * @param instName instruction name
+	 * @param inPointer dense input pointer
+	 * @param outPointer dense output pointer (does not need to be zeroed out)
+	 * @param rl row lower
+	 * @param ru row upper
+	 * @param cl column lower
+	 * @param cu column upper
+	 * @param len1 input number of columns
+	 * @param len2 output number of columns
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, 
+			int rl, int ru, int cl, int cu, int len1, int len2) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		if (len1 == len2) {
+			cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1
+					* Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		} else {
+			for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) {
+				cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE),
+						inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
 			}
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
 		}
+		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
+	}
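
A usage sketch of the two copy strategies above, with hypothetical sizes (a 100 x 50 dense input, rows 10..19 selected):

    // full-width slice (len1 == len2 == 50): a single 10*50*8 = 4,000-byte memcpy
    sliceDenseDense(gCtx, instName, inPointer, outPointer, 10, 19, 0, 49, 50, 50);
    // column-restricted slice (len2 == 10): ten separate 10*8 = 80-byte row copies
    sliceDenseDense(gCtx, instName, inPointer, outPointer, 10, 19, 5, 14, 50, 10);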
+	
+	/**
+	 * Performs a slice operation on a sparse input and writes the result in dense format
+	 * 
+	 * @param gCtx gpu context
+	 * @param instName instruction name
+	 * @param inPointer sparse CSR input pointer
+	 * @param outPointer dense output pointer (expected to be zeroed out)
+	 * @param rl row lower
+	 * @param ru row upper
+	 * @param cl column lower
+	 * @param cu column upper
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
+		int size = ru - rl + 1;
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		// Performs a slice operation where the input matrix is sparse and the output matrix is dense.
+		// This function avoids unnecessary sparse to dense conversion of the input matrix.
+		// We can generalize this later to output sparse matrix.
+		getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size),
+				inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu);
+		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0);
 	}
 
 	public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
@@ -3650,26 +2775,6 @@ public class LibMatrixCUDA {
 	//********************************************************************/
 
 	/**
-	 * Convenience method for debugging matrices on the GPU.
-	 * @param in		Pointer to a double array (matrix) on the GPU
-	 * @param rlen	row length
-	 * @param clen	column length
-	 */
-	@SuppressWarnings("unused")
-	private static void debugPrintMatrix(Pointer in, int rlen, int clen){
-		double[] data = new double[rlen * clen];
-		cudaMemcpy(Pointer.to(data), in, rlen*clen*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-		int k=0;
-		for (int i=0; i<rlen; ++i){
-			for (int j=0; j<clen; ++j){
-				System.out.print(data[k]);
-				k++;
-			}
-			System.out.println();
-		}
-	}
-
-	/**
 	 * Helper method to get the output block (allocated on the GPU)
 	 * Also records performance information into {@link Statistics}
 	 * @param ec		active {@link ExecutionContext}
@@ -3680,7 +2785,7 @@ public class LibMatrixCUDA {
 	 * @return	the matrix object
 	 * @throws DMLRuntimeException	if an error occurs
 	 */
-	private static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) throws DMLRuntimeException {
+	protected static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) throws DMLRuntimeException {
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);


[3/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d as well as fix memory estimation of convolution operations

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 2b9335c..59ac29e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -191,7 +191,7 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 //		// TODO: Inserting reblock requires knowing columns apriori
 //		ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
 //		setReblockedOutputDimension(et, transform1);
-		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k);
+		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, -1, -1 ));
 		setOutputDimensions(transform1);
 		
 		setLineNumbers(transform1);
@@ -223,13 +223,171 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity);
 	}
 	
+	// ---------------------------------------------------------------
+	// Utility methods to guard the computation of memory estimates in the presence of unknowns
+	private static class IntermediateDimensions {
+		int dim1; int dim2; double sp;
+		public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str, double sp) {
+			dim1 = (int) h.getDim(dim1Str);
+			dim2 = (int) h.getDim(dim2Str);
+			this.sp = sp;
+		}
+		public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str) {
+			dim1 = (int) h.getDim(dim1Str);
+			dim2 = (int) h.getDim(dim2Str);
+			sp = 1;
+		}
+		public IntermediateDimensions(ConvolutionOp h, int dim1, String dim2Str) {
+			this.dim1 = dim1;
+			dim2 = (int) h.getDim(dim2Str);
+			sp = 1;
+		}
+		
+		/**
+		 * Add two computed memory estimates
+		 * 
+		 * @param val1 memory estimate 1
+		 * @param val2 memory estimate 2
+		 * @return sum of memory estimates
+		 */
+		static double guardedAdd(double val1, double val2) {
+			if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+			double ret = val1 + val2;
+			if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+			else return ret;
+		}
+		
+		/**
+		 * Compute memory estimates for given intermediate matrices 
+		 * 
+		 * @param intermediates list of intermediates
+		 * @param numWorkers number of workers
+		 * @return memory estimate
+		 */
+		public static double addEstimateSizes(ArrayList<IntermediateDimensions> intermediates, int numWorkers) {
+			double memBudget = 0; 
+			for(int i = 0; i < intermediates.size(); i++) {
+				memBudget = guardedAdd(memBudget, OptimizerUtils.estimateSizeExactSparsity(
+						intermediates.get(i).dim1, intermediates.get(i).dim2, intermediates.get(i).sp)*numWorkers);
+			}
+			return memBudget;
+		}
+		
+		/**
+		 * Compute max of two computed memory estimates
+		 * @param val1 memory estimate 1
+		 * @param val2 memory estimate 2
+		 * @return max of memory estimates
+		 */
+		public static double guardedMax(double val1, double val2) {
+			if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+			double ret = Math.max(val1, val2);
+			if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+			else return ret;
+		}
+	}
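
The guarded helpers treat a negative operand as an unknown dimension and saturate at OptimizerUtils.DEFAULT_SIZE, so a single unknown turns the whole estimate into the worst case instead of corrupting the sum. For example:

    guardedAdd(1e6, 2e6);                        // -> 3e6
    guardedAdd(-1, 2e6);                         // -> OptimizerUtils.DEFAULT_SIZE (unknown input)
    guardedAdd(OptimizerUtils.DEFAULT_SIZE, 1);  // -> OptimizerUtils.DEFAULT_SIZE (saturated)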
+	
+	/**
+	 * Helper utility to compute intermediate memory estimate
+	 * 
+	 * @param gpuIntermediates intermediates for GPU
+	 * @param cpIntermediates intermediates for CP
+	 * @return memory estimates
+	 */
+	private double computeIntermediateMemEstimateHelper(
+			ArrayList<IntermediateDimensions> gpuIntermediates,
+			ArrayList<IntermediateDimensions> cpIntermediates) {
+		// Since CP operators use row-level parallelism by default
+		int numWorkers = (int) Math.min(OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), Math.max(getDim("N"), 1));
+		if(DMLScript.USE_ACCELERATOR) {
+			// Account for potential sparse-to-dense conversion
+			double gpuMemBudget = IntermediateDimensions.addEstimateSizes(gpuIntermediates, 1);
+			double cpMemoryBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+			if(cpMemoryBudget > gpuMemBudget) {
+				double oneThreadCPMemBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, 1);
+				if(oneThreadCPMemBudget <= gpuMemBudget) {
+					// Why limit CP? To give the compiler more opportunity to generate GPU operators
+					cpMemoryBudget = oneThreadCPMemBudget;
+				}
+			}
+			// Finally, use the maximum of CP and GPU memory budget
+			return IntermediateDimensions.guardedMax(cpMemoryBudget, gpuMemBudget);
+		}
+		else {
+			// When -gpu flag is not provided, the memory estimates for CP are not affected.
+			return IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+		}
+	}
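
A worked example of the budgeting above (hypothetical numbers): with gpuMemBudget = 1 GB, a 16-worker CP estimate of 4 GB, and a single-worker CP estimate of 0.25 GB, the single-worker estimate is adopted because it fits under the GPU budget, and the helper returns max(0.25 GB, 1 GB) = 1 GB. The operator is therefore not priced at the 4 GB multi-threaded CP cost, which would otherwise prevent the GPU operator from being compiled.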
+	
 	@Override
-	protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz )
+	protected double computeIntermediateMemEstimate( long ignoreDim1, long ignoreDim2, long ignoreNnz )
 	{	
-		//default: no intermediate memory requirements
-		return 0;
+		ArrayList<IntermediateDimensions> gpuIntermediates = new ArrayList<IntermediateDimensions>();
+		ArrayList<IntermediateDimensions> cpIntermediates = new ArrayList<IntermediateDimensions>();
+		if(getOp() == ConvOp.DIRECT_CONV2D) {
+			// Assumption: to compile a GPU conv2d operator, the following must fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. input in any format
+			// 3. at least one input row in dense format
+			// 4. filter in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 input row and the filter
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+			
+			// im2col operation preserves the worst-case sparsity of the input.
+			cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+			// Assumption: to compile a GPU conv2d_backward_data operator, the following must fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. dout in any format
+			// 3. at least one dout row in dense format
+			// 4. filter in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 dout row and the filter
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+			gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+			
+			// There are 2 intermediates: rotate180 and input to col2im for conv2d_backward_data
+			// rotate180 preserves the "exact" sparsity of the dout matrix
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+			// Note: worst-case sparsity for the input of col2im (of size NPQ x CRS where N is determined by degree of parallelism)
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "CRS"));
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+			// Assumption: to compile a GPU conv2d_backward_filter operator, the following must fit on the GPU:
+			// 1. output in dense format (i.e. computeOutputMemEstimate) 
+			// 2. dout in any format
+			// 3. at least one dout and input row in dense format
+			
+			// Account for potential sparse-to-dense conversion of at least 1 input + dout row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+			
+			// There are 2 intermediates: im2col and rotate180 for conv2d_backward_filter
+			// rotate180 preserves the "exact" sparsity of the dout matrix
+			cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+			// im2col operation preserves the worst-case sparsity of the input.
+			cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+		}
+		else if(getOp() == ConvOp.MAX_POOLING) {
+			// Account for potential sparse-to-dense conversion of at least 1 input row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+		}
+		else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+			// Account for potential sparse-to-dense conversion of at least 1 input + dout row
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+			gpuIntermediates.add(new IntermediateDimensions(this, 1, "CPQ"));
+		}
+		
+		if(gpuIntermediates.size() > 0 || cpIntermediates.size() > 0)
+			return computeIntermediateMemEstimateHelper(gpuIntermediates, cpIntermediates);
+		else
+			return 0;
 	}
 	
+	
 	@Override
 	protected long[] inferOutputCharacteristics( MemoTable memo )
 	{
@@ -243,65 +401,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			ret[2] = -1;
 			return (ret[0]>0 && ret[1]>0) ? ret : null;
 		}
-	
-		ConvolutionParameters params;
-		try {
-			params = parseInput();
-		} catch (DMLRuntimeException e) {
-			throw new RuntimeException(e);
-		}
 		
-		switch(op) 
-		{
-			case MAX_POOLING: {
-				// input
-				long N = getInput().get(0)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.C, params.P, params.Q);
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D: {
-				// input, filter
-				long N = getInput().get(0)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.K, params.P, params.Q);
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D_BACKWARD_FILTER: {
-				// input, dout	
-				ret[0] = params.K;
-				ret[1] = getExtractedVal(params.C, params.R, params.S);
-				ret[2] = -1;
-				break;
-			}
-			case MAX_POOLING_BACKWARD: {
-				// input, dout
-				ret[0] = getInput().get(0)._dim1;
-				ret[1] = getInput().get(0)._dim2;
-				ret[2] = -1;
-				break;
-			}
-			case DIRECT_CONV2D_BACKWARD_DATA: {
-				// filter, dout
-				long N = getInput().get(1)._dim1;
-				ret[0] = N;
-				ret[1] = getExtractedVal(params.C, params.H, params.W);
-				ret[2] = -1;
-				break;
-			}
-			default:
-				throw new RuntimeException("Unsupported op:" + op.name());
-		}
-		
-		if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) {
-			LOG.debug("Unknown dimensions for ConvolutionOp in inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] + 
-					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
-					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
-					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-					" pad=[" + params.pad_h + " " + params.pad_w + "]");
-		}
+		refreshSizeInformation();
+		ret[0] = _dim1; ret[1] = _dim2; ret[2] = _nnz;
 		
 		//safe return (create entry only if at least dims known)
 		return (ret[0]>0 && ret[1]>0) ? ret : null;
@@ -347,50 +449,44 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return _etype;
 	}
 	
+	// Caching the parameters speeds up dynamic recompilation by avoiding unnecessary calls to computeSizeInformation
+	private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
 	// stride1, stride2, padding1, padding2  
 	// input_shape1, input_shape2, input_shape3, input_shape4, 
 	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
 	ConvolutionParameters parseInput() throws DMLRuntimeException {
-		ConvolutionParameters params = null;
 		if(op == ConvOp.MAX_POOLING_BACKWARD 
 				|| op == ConvOp.DIRECT_CONV2D 
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
-			params = new ConvolutionParameters(
-					computeSizeInformation(getInput().get(6)),
-					computeSizeInformation(getInput().get(7)), 
-					computeSizeInformation(getInput().get(8)), 
-					computeSizeInformation(getInput().get(9)), 
-					computeSizeInformation(getInput().get(10)), 
-					computeSizeInformation(getInput().get(12)), 
-					computeSizeInformation(getInput().get(13)), 
-					computeSizeInformation(getInput().get(2)), 
-					computeSizeInformation(getInput().get(3)), 
-					computeSizeInformation(getInput().get(4)), 
-					computeSizeInformation(getInput().get(5)), _maxNumThreads);
+			_cachedParams.setIfUnknown(
+					getInput().get(6),
+					getInput().get(7), 
+					getInput().get(8), 
+					getInput().get(9), 
+					getInput().get(10), 
+					getInput().get(12), 
+					getInput().get(13), 
+					getInput().get(2), 
+					getInput().get(3), 
+					getInput().get(4), 
+					getInput().get(5), _maxNumThreads);
 		}
 		else {
-			params = new ConvolutionParameters(
-					computeSizeInformation(getInput().get(5)),
-					computeSizeInformation(getInput().get(6)), 
-					computeSizeInformation(getInput().get(7)), 
-					computeSizeInformation(getInput().get(8)), 
-					computeSizeInformation(getInput().get(9)), 
-					computeSizeInformation(getInput().get(11)), 
-					computeSizeInformation(getInput().get(12)), 
-					computeSizeInformation(getInput().get(1)), 
-					computeSizeInformation(getInput().get(2)), 
-					computeSizeInformation(getInput().get(3)), 
-					computeSizeInformation(getInput().get(4)), _maxNumThreads);
-		}
-		return params;
-	}
-
-	public static long getExtractedVal(long val1, long val2, long val3) {
-		if(val1 == -1 || val2 == -1 || val3 == -1) {
-			return -1;
+			_cachedParams.setIfUnknown(
+					getInput().get(5),
+					getInput().get(6), 
+					getInput().get(7), 
+					getInput().get(8), 
+					getInput().get(9), 
+					getInput().get(11), 
+					getInput().get(12), 
+					getInput().get(1), 
+					getInput().get(2), 
+					getInput().get(3), 
+					getInput().get(4), _maxNumThreads);
 		}
-		return val1*val2*val3;
+		return _cachedParams;
 	}
 	
 	@Override
@@ -400,72 +496,50 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			Hop input1 = getInput().get(0);
 			setDim1(input1.getDim1());
 			setDim2(input1.getDim2());
+			_nnz = -1; // cannot infer stats
 			return;
 		}
 		
-		ConvolutionParameters params;
-		try {
-			params = parseInput();
-		} catch (DMLRuntimeException e) {
-			throw new RuntimeException(e);
-		}
-		
 		switch(op) 
 		{
 			case MAX_POOLING:
 			{	
-				// input
-				long N = getInput().get(0)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.C, params.P, params.Q);
+				_dim1 = getDim("N");
+				_dim2 = getDim("CPQ");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case MAX_POOLING_BACKWARD:
 			{
-				// input, dout
-				_dim1 = getInput().get(0)._dim1;
-				_dim2 = getInput().get(0)._dim2;
+				_dim1 = getDim("N");
+				_dim2 = getDim("CHW");
 				_nnz = -1;
 				break;
 			}
 			case DIRECT_CONV2D:
 			{
-				// input, filter
-				long N = getInput().get(0)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.K, params.P, params.Q);
+				_dim1 = getDim("N");
+				_dim2 = getDim("KPQ");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case DIRECT_CONV2D_BACKWARD_DATA:
 			{
-				// filter, dout
-				long N = getInput().get(1)._dim1;
-				_dim1 = N;
-				_dim2 = getExtractedVal(params.C, params.H, params.W);
+				_dim1 = getDim("N");
+				_dim2 = getDim("CHW");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			case DIRECT_CONV2D_BACKWARD_FILTER:
 			{
-				// input, dout	
-				_dim1 = params.K;
-				_dim2 = getExtractedVal(params.C, params.R, params.S);
+				_dim1 = getDim("K");
+				_dim2 = getDim("CRS");
 				_nnz = -1; // cannot infer stats
 				break;
 			}
 			default:
 				throw new RuntimeException("The sizes are not refreshed for " + op.name());
 		}
-		
-		if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) {
-			LOG.debug("Unknown dimensions for ConvolutionOp in refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 + 
-					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
-					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
-					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-					" pad=[" + params.pad_h + " " + params.pad_w + "]");
-		}
 	}
 	
 	@Override
@@ -511,4 +585,132 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	public int getMaxNumThreads() {
 		return _maxNumThreads;
 	}
+	
+	
+	// ------------------------------------------------------------------------------------------------------
+	// Utility methods to get the dimensions while accounting for unknown dimensions
+	
+	/**
+	 * Convenience method to get the dimensions required by ConvolutionOp.
+	 * 
+	 * @param dimString can be K, CRS, N, CHW, KPQ, PQ, or CPQ
+	 * @return either -1 (if unknown) or the value associated with dimString
+	 */
+	private long getDim(String dimString) {
+		if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
+			throw new RuntimeException("getDim method should not be invoked for bias_add and bias_multiply");
+		}
+		ConvolutionParameters params;
+		try {
+			params = parseInput();
+		} catch (DMLRuntimeException e) {
+			throw new RuntimeException(e);
+		}
+		Hop filter = null; 	// shape: K x CRS 
+		Hop input = null; 	// shape: N x CHW
+		Hop dout = null;	// shape: N x KPQ
+		Hop dout1 = null;	// shape: N x CPQ
+		
+		if(getOp() == ConvOp.DIRECT_CONV2D) {
+			input  = getInput().get(0);
+			filter = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+			filter = getInput().get(0);
+			dout  = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+			input = getInput().get(0);
+			dout  = getInput().get(1);
+		}
+		else if(getOp() == ConvOp.MAX_POOLING) {
+			input = getInput().get(0);
+		}
+		else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+			input = getInput().get(0);
+			dout1  = getInput().get(1);
+		}
+		
+		long ret = -1;
+		if(dimString.equals("K") && filter != null) {
+			ret = getNonNegative(ret, getNonNegative(params.K, filter._dim1));
+		}
+		else if(dimString.equals("CRS") && filter != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), filter._dim2));
+		}
+		else if(dimString.equals("N") && input != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, input._dim1));
+		}
+		else if(dimString.equals("CHW") && input != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2));
+		}
+		else if(dimString.equals("N") && dout != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, dout._dim1));
+		}
+		else if(dimString.equals("KPQ") && dout != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2));
+		}
+		else if(dimString.equals("N") && dout1 != null) {
+			ret = getNonNegative(ret, getNonNegative(params.N, dout1._dim1));
+		}
+		else if(dimString.equals("CPQ") && dout1 != null) {
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2));
+		}
+		else if(dimString.equals("K")) {
+			ret = getNonNegative(ret, params.K >= 0 ? params.K : -1);
+		}
+		else if(dimString.equals("CRS")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.R, params.S));
+		}
+		else if(dimString.equals("N")) {
+			ret = getNonNegative(ret, params.N >= 0 ? params.N : -1);
+		}
+		else if(dimString.equals("CHW")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.H, params.W));
+		}
+		else if(dimString.equals("KPQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.K, params.P, params.Q));
+		}
+		else if(dimString.equals("PQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.P, params.Q));
+		}
+		else if(dimString.equals("CPQ")) {
+			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.P, params.Q));
+		}
+		else {
+			throw new RuntimeException("Unsupported dimension:" + dimString + " for operator " + getOp().name());
+		}
+		
+		if(LOG.isDebugEnabled() && ret < 0) {
+			LOG.debug("Unknown dimension " + dimString + " for ConvolutionOp:" + op.name() + 
+					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
+					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
+					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
+					" pad=[" + params.pad_h + " " + params.pad_w + "]");
+		}
+		return ret;
+	}
+	
+	private long nonNegativeMultiply(long val1, long val2, long val3) {
+		if(val1 >= 0 && val2 >= 0 && val3 >= 0) {
+			return val1 * val2 * val3;
+		}
+		else return -1;
+	}
+	private long nonNegativeMultiply(long val1, long val2) {
+		if(val1 >= 0 && val2 >= 0) {
+			return val1 * val2;
+		}
+		else return -1;
+	}
+	private long getNonNegative(long val1, long val2) {
+		if(val1 >= 0 && val2 >= 0) {
+			if(val1 == val2) return val1;
+			else throw new RuntimeException("Incorrect dimensions in Convolution Hop: " + val1 + " != " + val2);
+		}
+		else if(val1 >= 0) return val1;
+		else if(val2 >= 0) return val2;
+		else return -1;
+	}
+	// ------------------------------------------------------------------------------------------------------
 }
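
For context, the dimension reconciliation that getDim and its helpers above implement can be summarized in a small standalone sketch. The DimUtil class and its main driver are illustrative only and not part of this commit; only the merge semantics (-1 encodes unknown, conflicting known values throw) mirror the code above.

public class DimUtil {
	// Merge two estimates of the same dimension; -1 encodes "unknown".
	static long merge(long a, long b) {
		if(a >= 0 && b >= 0) {
			if(a == b) return a;
			throw new RuntimeException("Incorrect dimensions: " + a + " != " + b);
		}
		return (a >= 0) ? a : b; // -1 if both are unknown
	}
	// Product that propagates unknowns instead of producing a garbage value.
	static long mul(long... vals) {
		long prod = 1;
		for(long v : vals) {
			if(v < 0) return -1;
			prod *= v;
		}
		return prod;
	}
	public static void main(String[] args) {
		long C = 3, H = -1, W = 224;        // H unknown at compile time
		System.out.println(mul(C, H, W));   // -1: CHW cannot be inferred yet
		System.out.println(merge(-1, 224)); // 224: the known source wins
	}
}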

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index eeaa5f1..b454771 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -708,31 +708,8 @@ public abstract class Hop implements ParseInfo
 		_validCPSizeEstimate = (wstats!=null) ? OptimizerUtils.isValidCPMatrixSize(
 			wstats[0], wstats[1], OptimizerUtils.getSparsity(wstats[0], wstats[1], wstats[2])) : false;
 	}
-
 	
 	/**
-	 * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
-	 * applicable. 
-	 * 
-	 * @param dim1 dimension 1
-	 * @param dim2 dimension 2
-	 * @param nnz number of non-zeros
-	 * @return memory estimate
-	 */
-	protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
-
-	/**
-	 * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
-	 * applicable.
-	 * 
-	 * @param dim1 dimension 1
-	 * @param dim2 dimension 2
-	 * @param nnz number of non-zeros
-	 * @return memory estimate
-	 */
-	protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
-
-	/**
 	 * Computes the output matrix characteristics (rows, cols, nnz) based on worst-case output
 	 * and/or input estimates. Should return null if dimensions are unknown.
 	 * 
@@ -849,6 +826,21 @@ public abstract class Hop implements ParseInfo
 	
 	public abstract String getOpString();
 
+	// ========================================================================================
+	// Design doc: Memory estimation of GPU
+	// 1. Since not all operators are supported on GPU, isGPUEnabled indicates whether an operation 
+	// is enabled for GPU. This method does not take into account any memory estimates.
+	// 2. To simplify the memory estimation logic, the methods computeOutputMemEstimate and computeIntermediateMemEstimate
+	// should return the maximum of the memory required by the GPU and CP operators. 
+	// 3. Additionally, these methods are guarded so that when the -gpu flag is not provided, additional memory overheads due to GPU
+	// (for example, sparse-to-dense conversion on GPU) are ignored. 
+	// 4. (WIP) Every GPU operator should respect the memory returned by computeIntermediateMemEstimate (and computeOutputMemEstimate - see the point below).
+	// 5. (WIP) Every GPU operator should create its output in the same format as the corresponding CP operator. That is, the estimates returned by computeOutputMemEstimate
+	// are consistent across both CP and GPU in terms of the worst case.
+	// 6. The drawbacks of using the maximum memory (mem = Math.max(mem_cpu, mem_gpu)) are:
+	// - The GPU operator is not selected when mem_gpu < total memory available on GPU < mem
+	// - The CP operator is not selected (i.e. a distributed operator is compiled) when mem_cpu < driver memory budget < mem
+	
 	/**
 	 * In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()), 
 	 * the exectype is determined by checking this method as well as memory budget of this Hop. 
@@ -861,6 +853,31 @@ public abstract class Hop implements ParseInfo
 	 */
 	public abstract boolean isGPUEnabled();
 	
+	/**
+	 * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
+	 * applicable. 
+	 * 
+	 * @param dim1 dimension 1
+	 * @param dim2 dimension 2
+	 * @param nnz number of non-zeros
+	 * @return memory estimate
+	 */
+	protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
+
+	/**
+	 * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
+	 * applicable.
+	 * 
+	 * @param dim1 dimension 1
+	 * @param dim2 dimension 2
+	 * @param nnz number of non-zeros
+	 * @return memory estimate
+	 */
+	protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
+	
+	// ========================================================================================
+
+	
 	protected boolean isVector() {
 		return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) );
 	}
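
To make point 6 of the design doc above concrete, here is a minimal sketch of how a single worst-case estimate drives operator selection. All names and budgets are illustrative; the real selection logic lives in the memory-based optimizer, not in this snippet.

public class MemEstimateSketch {
	static String chooseExecType(double memCpu, double memGpu, double gpuBudget,
			double driverBudget, boolean gpuEnabled) {
		double mem = Math.max(memCpu, memGpu); // single conservative estimate (point 2)
		if(gpuEnabled && mem <= gpuBudget) return "GPU";
		if(mem <= driverBudget) return "CP";
		return "SPARK"; // distributed fallback
	}
	public static void main(String[] args) {
		// Drawback case: mem_gpu (2 GB) fits the 4 GB GPU, but the combined
		// worst-case estimate (6 GB) does not, so the GPU operator is skipped.
		System.out.println(chooseExecType(6e9, 2e9, 4e9, 8e9, true)); // prints CP
	}
}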

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 8784956..121112b 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -37,6 +37,7 @@ public class ConvolutionTransform extends Lop
 	
 	private OperationTypes operation = null;
 	private int numThreads = -1;
+	private double intermediateMemBudget = 0;
 	
 	/**
 	 * Constructor when we have one input.
@@ -47,12 +48,14 @@ public class ConvolutionTransform extends Lop
 	 * @param vt value type
 	 * @param et execution type
 	 * @param k number of threads
+	 * @param intermediateMemBudget intermediate memory budget
 	 */
-	public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) 
+	public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k, double intermediateMemBudget) 
 	{
 		super(Lop.Type.Transform, dt, vt);		
 		init(input, op, dt, vt, et);
 		numThreads = k;
+		this.intermediateMemBudget = intermediateMemBudget;
 	}
 	
 	public ConvolutionTransform(Lop input1, Lop input2, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) 
@@ -165,6 +168,9 @@ public class ConvolutionTransform extends Lop
 				sb.append( OPERAND_DELIMITOR );
 				sb.append( numThreads );
 			}
+			
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( intermediateMemBudget );
 			return sb.toString();
 		}
 		else {
@@ -210,6 +216,9 @@ public class ConvolutionTransform extends Lop
 			sb.append( OPERAND_DELIMITOR );
 			sb.append( numThreads );
 		}
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( intermediateMemBudget );
 	}
 
 }
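
The two hunks above append the intermediate memory budget as the final operand of the generated instruction string, which is why the field counts in the instruction parsers further below grow by one. A rough standalone sketch of that round trip (DELIM is a stand-in for Lop.OPERAND_DELIMITOR, and the operand layout is simplified):

public class BudgetOperandSketch {
	static final String DELIM = "\u00b0"; // stand-in separator, not the real constant

	static String emit(String opcode, String in, String out, int k, double budget) {
		return String.join(DELIM, opcode, in, out,
				Integer.toString(k), Double.toString(budget));
	}
	static double parseBudget(String inst) {
		String[] parts = inst.split(DELIM);
		return Double.parseDouble(parts[parts.length - 1]); // budget rides last
	}
	public static void main(String[] args) {
		String inst = emit("bias_add", "X", "Y", 4, 1.5E9);
		System.out.println(parseBudget(inst)); // 1.5E9
	}
}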

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 629b688..e91029e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -22,6 +22,10 @@ package org.apache.sysml.runtime.instructions.cp;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
@@ -41,24 +45,25 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 	private ArrayList<CPOperand> _filter_shape;
 	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
 	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-	private int _numThreads = -1;
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
-			int numThreads) throws DMLRuntimeException {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
-		if (!(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply"))) {
-			throw new DMLRuntimeException(
-					"Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found "
-							+ opcode);
+	private int _numThreads = -1;
+	private double _intermediateMemoryBudget = 0;
+	private static final Log LOG = LogFactory.getLog(ConvolutionCPInstruction.class.getName());
+	private static boolean warnedUnderUtilization = false;
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
+		if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
+			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
 		}
 		_in2 = in2;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr,
 			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_stride = stride;
@@ -66,12 +71,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
-			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
 		_in2 = in2;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
 		_stride = stride;
@@ -79,12 +87,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
-			String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape, int numThreads) {
-		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
 		_in2 = in2;
 		_in3 = in3;
 		_cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -93,6 +104,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
 		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	public static ConvolutionCPInstruction parseInstruction(String str)
@@ -101,7 +113,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 		String opcode = parts[0];
 		if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
-			InstructionUtils.checkNumFields(parts, 15);
+			InstructionUtils.checkNumFields(parts, 16);
 			// stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -127,13 +139,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[15]);
 
 			return new ConvolutionCPInstruction(in, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[16]));
 		} 
 		else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
 				|| opcode.equalsIgnoreCase("conv2d")
 				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
 				|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
-			InstructionUtils.checkNumFields(parts, 16);
+			InstructionUtils.checkNumFields(parts, 17);
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -160,10 +172,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[16]);
 
 			return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[17]));
 		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-			InstructionUtils.checkNumFields(parts, 17);
+			InstructionUtils.checkNumFields(parts, 18);
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -191,15 +203,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[17]);
 
 			return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride,
-					padding, input_shape, filter_shape, k);
+					padding, input_shape, filter_shape, k, Double.parseDouble(parts[18]));
 		}
 		else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
-			InstructionUtils.checkNumFields(parts, 4);
+			InstructionUtils.checkNumFields(parts, 5);
 			CPOperand in = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[3]);
 			int k = Integer.parseInt(parts[4]);
-			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k);
+			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5]));
 		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str);
@@ -363,6 +375,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d")) {
+			resetNumThreads(params, C*R*S, P*Q, ((double)matBlock.getNonZeros()) / (((double)matBlock.getNumRows())*matBlock.getNumColumns()));
 			MatrixBlock filter = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
 			if(filter.isEmpty() || matBlock.isEmpty()) {
 				outputBlock = new MatrixBlock(N, K*P*Q, true);
@@ -377,6 +390,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
+			resetNumThreads(params, C*R*S, P*Q, ((double)matBlock.getNonZeros()) / (((double)matBlock.getNumRows())*matBlock.getNumColumns()));
 			MatrixBlock filter = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
 			MatrixBlock bias = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
 			if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) {
@@ -446,6 +460,27 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
 	}
 	
+	/**
+	 * Reset the number of threads to respect the intermediate CP memory budget
+	 * 
+	 * @param params convolution parameters
+	 * @param numRows number of rows of the intermediate matrix used per thread
+	 * @param numCols number of columns of the intermediate matrix used per thread
+	 * @param sparsity sparsity of the intermediate matrix used per thread
+	 */
+	private void resetNumThreads(ConvolutionParameters params, int numRows, int numCols, double sparsity) {
+		if(DMLScript.USE_ACCELERATOR) {
+			double memBudget1Thread = OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity);
+			int limitedDegreeOfParallelism = (int) Math.floor(_intermediateMemoryBudget / memBudget1Thread);
+			if(params.numThreads > limitedDegreeOfParallelism) {
+				params.numThreads = limitedDegreeOfParallelism;
+				if(!warnedUnderUtilization)
+					LOG.warn("Potential CPU under-utilization: the number of threads was reduced to respect the intermediate memory budget. To avoid this, consider reducing the mini-batch size or forcing GPU execution.");
+				warnedUnderUtilization = true;
+			}
+		}
+	}
+	
 	private MatrixBlock getDenseOutputBlock(int numRows, int numCols) throws DMLRuntimeException {
 		MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false);
 		outputBlock.allocateDenseBlock();
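
As a back-of-envelope version of resetNumThreads above: each worker thread materializes a (C*R*S) x (P*Q) im2col intermediate, so the degree of parallelism is capped at budget / perThreadBytes. The numbers below are illustrative, and the 8-bytes-per-value size is a rough stand-in for OptimizerUtils.estimateSizeExactSparsity.

public class ThreadCapSketch {
	public static void main(String[] args) {
		long CRS = 256 * 3 * 3, PQ = 56 * 56; // per-thread im2col intermediate is CRS x PQ
		double sparsity = 1.0;                // dense worst case
		double perThreadBytes = CRS * PQ * sparsity * 8; // ~57.8 MB per thread
		double budget = 64 * 1024 * 1024;     // hypothetical 64 MB intermediate budget
		int cappedThreads = (int) Math.floor(budget / perThreadBytes);
		System.out.println("degree of parallelism capped at " + cappedThreads); // 1
	}
}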

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 5b37576..b25f787 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.GPUStatistics;
@@ -40,9 +41,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 	private ArrayList<CPOperand> _filter_shape;
 	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
 	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr)
-			throws DMLRuntimeException {
+	private double _intermediateMemoryBudget = 0;
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
 		if (!(opcode.equals("bias_add") || opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) {
 			throw new DMLRuntimeException(
@@ -53,18 +54,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_input2 = in2;
 		_gputype = GPUINSTRUCTION_TYPE.Convolution;
 		_output = out;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
-			String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape) {
-		this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape);
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) 
+	{
+		this(in1, in2, out, opcode, istr, stride, padding,  input_shape, filter_shape, intermediateMemoryBudget);
 		_input3 = in3;
 	}
-
-	private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr,
-			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-			ArrayList<CPOperand> filter_shape) {
+	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) 
+	{
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
 		_gputype = GPUINSTRUCTION_TYPE.Convolution;
 
@@ -75,6 +81,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_padding = padding;
 		_input_shape = input_shape;
 		_filter_shape = filter_shape;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 
 	public static ConvolutionGPUInstruction parseInstruction(String str)
@@ -87,7 +94,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			 || opcode.equalsIgnoreCase("conv2d_backward_filter")
 			 || opcode.equalsIgnoreCase("conv2d_backward_data")
 			 || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
-			InstructionUtils.checkNumFields(parts, 15);
+			InstructionUtils.checkNumFields(parts, 16);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[15]);
@@ -110,10 +117,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[14]));
 
 			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-			InstructionUtils.checkNumFields(parts, 16);
+			InstructionUtils.checkNumFields(parts, 17);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand in3 = new CPOperand(parts[3]);
@@ -137,10 +144,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[15]));
 
 			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[17]));
 		}
 		else if (opcode.equalsIgnoreCase("maxpooling")) {
-			InstructionUtils.checkNumFields(parts, 14);
+			InstructionUtils.checkNumFields(parts, 15);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
 		
@@ -162,14 +169,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			filter_shape.add(new CPOperand(parts[13]));
 
 			return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride,
-					padding, input_shape, filter_shape);
+					padding, input_shape, filter_shape, Double.parseDouble(parts[15]));
 		}
 		else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") || opcode.equalsIgnoreCase("bias_multiply")  ) {
-			InstructionUtils.checkNumFields(parts, 3);
+			InstructionUtils.checkNumFields(parts, 4);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
 			CPOperand out = new CPOperand(parts[3]);
-			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str);
+			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4]));
 		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);	
@@ -251,8 +258,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
 			
-			LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -266,8 +273,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
 			
-			LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
-						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -281,8 +288,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S);
 			
-			LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 			// TODO: For now always copy the device data to host
 			// ec.gpuCtx.copyDeviceToHost(outputBlock);
 		}
@@ -298,8 +305,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -311,8 +318,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
 			
 			if(instOpcode.equalsIgnoreCase("maxpooling"))
-				LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				LibMatrixCuDNN.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -326,8 +333,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else {
 			throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
@@ -345,6 +352,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
 	}
 
+
 	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) 
 		throws DMLRuntimeException 
 	{
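
The P and Q forwarded to the LibMatrixCuDNN calls above are the standard convolution output sizes (computed via ConvolutionUtils.getP/getQ, as in ConvolutionParameters.setIfUnknown later in this commit). A sketch of the formula under the usual cross-correlation convention, with illustrative numbers:

public class OutputShapeSketch {
	static long outDim(long in, long filter, long stride, long pad) {
		return (in + 2 * pad - filter) / stride + 1;
	}
	public static void main(String[] args) {
		// 224x224 image, 3x3 filter, stride 1, pad 1 -> same-size output
		System.out.println(outDim(224, 3, 1, 1)); // 224
	}
}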

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index af27dc6..5096566 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -24,6 +24,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
 
@@ -44,7 +45,7 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 
 		switch(opcode) {
 			case "sel+":
-				LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "exp":
 				LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "sqrt":

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index c6b82c4..197daaf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -49,6 +49,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -151,6 +152,11 @@ public class GPUContext {
 		LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
 				+ this);
 
+		if(GPUContextPool.initialGPUMemBudget() > OptimizerUtils.getLocalMemBudget()) {
+			LOG.warn("Potential under-utilization: GPU memory (" + GPUContextPool.initialGPUMemBudget() 
+					+ ") > driver memory budget (" + OptimizerUtils.getLocalMemBudget() + "). "
+					+ "Consider increasing the driver memory budget.");
+		}
 	}
 
 	private void initializeCudaLibraryHandles() throws DMLRuntimeException {
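
The warning added above flags a configuration mismatch: because operator placement is gated by the driver memory budget (see the Hop design doc earlier in this commit), GPU memory beyond that budget cannot be exploited by compiled GPU operators. A sketch with illustrative numbers only:

public class GpuBudgetSketch {
	public static void main(String[] args) {
		double gpuBudget = 12e9, driverBudget = 4e9;
		double memEstimate = 6e9; // worst-case Math.max(mem_cpu, mem_gpu) estimate
		// The estimate exceeds the driver budget, so a distributed operator is
		// compiled even though the 12 GB GPU could easily hold 6 GB.
		String exec = (memEstimate <= driverBudget) ? "CP or GPU" : "SPARK";
		System.out.println(exec + "; GPU memory above " + driverBudget + " goes unused");
	}
}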

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 78b6e3b..6d06ee5 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 
 import java.io.Serializable;
 
+import org.apache.sysml.hops.Hop;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
@@ -34,7 +35,9 @@ public class ConvolutionParameters implements Serializable {
 	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
 	public int P; public int Q; public int numThreads;
 	
+	// Optional variables used by ConvolutionCPInstruction
 	public boolean enableNative = false;
+	
 	public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
 	
 	public MatrixBlock bias;
@@ -62,6 +65,28 @@ public class ConvolutionParameters implements Serializable {
 				"], pad=[" + pad_h + "," + pad_w + "])";  
 	}
 	
+	public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+			Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+		if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
+		if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
+		if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
+		if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
+		if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
+		if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
+		if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
+		if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
+		if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
+		if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
+		if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
+		if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
+			this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
+		}
+		if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
+			this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
+		}
+		this.numThreads = numThreads;
+	}
+	
 	public ConvolutionParameters(long N, long C, long H, long W,
 			long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
 		this.N = convertToInt(N);