You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/05/31 19:27:43 UTC

[1/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory Manager

Repository: systemml
Updated Branches:
  refs/heads/master af9cc8a90 -> 4d3216678


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 82a76b6..cdb69f9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -211,7 +211,7 @@ public class LibMatrixCUDA {
 		return gCtx.getCublasHandle();
 	}
 
-	protected static JCudaKernels getCudaKernels(GPUContext gCtx) {
+	public static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getKernels();
 	}
 	
@@ -244,7 +244,7 @@ public class LibMatrixCUDA {
 	 */
 	public static Pointer one() {
 		if(_one == null || oldDataTypeSize != sizeOfDataType) {
-			_one = dataTypePointerTo(1.0);
+			_one = _dataTypePointerTo(1.0);
 			oldDataTypeSize = sizeOfDataType;
 		}
 		return _one;
@@ -255,7 +255,7 @@ public class LibMatrixCUDA {
 	 */
 	public static Pointer zero() {
 		if(_zero == null  || oldDataTypeSize != sizeOfDataType) {
-			_zero = dataTypePointerTo(0.0);
+			_zero = _dataTypePointerTo(0.0);
 			oldDataTypeSize = sizeOfDataType;
 		}
 		return _zero;
@@ -268,11 +268,11 @@ public class LibMatrixCUDA {
 	 * @param instName  the invoking instruction's name for record {@link Statistics}.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) {
+	public static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
 		if(isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).sparseToDense(instName);
 		}
-		return input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		return input.getGPUObject(gCtx).getDensePointer();
 	}
 
 	/**
@@ -289,7 +289,7 @@ public class LibMatrixCUDA {
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
 	
-	protected static Pointer dataTypePointerTo(double value) {
+	private static Pointer _dataTypePointerTo(double value) {
 		if(sizeOfDataType == Sizeof.DOUBLE) {
 			return Pointer.to(new double[] { value });
 		}
@@ -301,6 +301,18 @@ public class LibMatrixCUDA {
 		}
 	}
 	
+	protected static Pointer dataTypePointerTo(double value) {
+		if(value == 1) {
+			return one();
+		}
+		else if(value == 0) {
+			return zero();
+		}
+		else {
+			return _dataTypePointerTo(value);
+		}
+	}
+	
 
 	/**
 	 * This method computes the backpropagation errors for previous layer of relu operation
@@ -355,8 +367,7 @@ public class LibMatrixCUDA {
 		Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
 		reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
 		reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
-		gCtx.cudaFreeHelper(tmp);
-
+		gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
 	}
 
 	/**
@@ -387,9 +398,9 @@ public class LibMatrixCUDA {
 		if(bias.getNumColumns() != 1 || cols % K != 0) {
 			throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
 		}
-		Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-		Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-		Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		Pointer imagePointer = input.getGPUObject(gCtx).getDensePointer();
+		Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer();
+		Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer();
 		long t1 = 0;
 		if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("bias_multiply",
@@ -729,7 +740,7 @@ public class LibMatrixCUDA {
 			default:
 				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
 			}
-			gCtx.cudaFreeHelper(instName, tmp);
+			gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
 			break;
 		}
 		case OP_MEAN:{
@@ -842,7 +853,7 @@ public class LibMatrixCUDA {
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
 				matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
 
-				gCtx.cudaFreeHelper(instName, tmpRow);
+				gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE);
 
 				break;
 			}
@@ -860,15 +871,15 @@ public class LibMatrixCUDA {
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
 				matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
 
-				gCtx.cudaFreeHelper(instName, tmpCol);
+				gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE);
 
 				break;
 			}
 			default:
 				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
 			}
-			gCtx.cudaFreeHelper(instName, tmp);
-			gCtx.cudaFreeHelper(instName, tmp2);
+			gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
+			gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE);
 			break;
 		}
 		case OP_MAXINDEX : {
@@ -923,7 +934,7 @@ public class LibMatrixCUDA {
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
+		Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType); 
 
 		long t1=0,t2=0;
 
@@ -944,7 +955,7 @@ public class LibMatrixCUDA {
 		}
 		double[] result = {-1f};
 		cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false);
-		gCtx.cudaFreeHelper(instName, tempOut);
+		gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE);
 		return result[0];
 	}
 
@@ -1699,6 +1710,36 @@ public class LibMatrixCUDA {
 			if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
 		}
 	}
+	
+	/**
+	 * Computes C = t(A)
+	 * @param ec execution context
+	 * @param gCtx gpu context
+	 * @param instName name of the instruction
+	 * @param A pointer to the input matrix
+	 * @param C pointer to the output matrix
+	 * @param numRowsA number of rows of the input matrix
+	 * @param numColsA number of columns of the input matrix
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void denseTranspose(ExecutionContext ec, GPUContext gCtx, String instName, 
+			Pointer A, Pointer C, long numRowsA, long numColsA) throws DMLRuntimeException {
+		if (ec.getGPUContext(0) != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : dense transpose" + ", GPUContext=" + gCtx);
+		}
+		long t0=0;
+		// Dense-Dense dgeam
+		int lda = toInt(numColsA);
+		int ldb = lda;
+		int m = toInt(numRowsA);
+		int n = lda;
+		int ldc = m;
+		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
+		cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc);
+		if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
+	}
 
 
 	//********************************************************************/
@@ -2389,7 +2430,7 @@ public class LibMatrixCUDA {
 		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 		ATobj.denseRowMajorToColumnMajor();
 		if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
-		Pointer A = ATobj.getJcudaDenseMatrixPtr();
+		Pointer A = ATobj.getDensePointer();
 
 		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 		GPUObject bTobj = (GPUObject) bobj.clone();
@@ -2399,7 +2440,7 @@ public class LibMatrixCUDA {
 		if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
 
 
-		Pointer b = bTobj.getJcudaDenseMatrixPtr();
+		Pointer b = bTobj.getDensePointer();
 
 		// The following set of operations is done following the example in the cusolver documentation
 		// http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
@@ -2447,12 +2488,12 @@ public class LibMatrixCUDA {
 		// TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
 		// There is an avoidable copy happening here
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
 
-		gCtx.cudaFreeHelper(instName, work);
-		gCtx.cudaFreeHelper(instName, tau);
-		ATobj.clearData();
-		bTobj.clearData();
+		gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE);
+		ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
+		bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
 
 		//debugPrintMatrix(b, n, 1);
     }
@@ -2501,6 +2542,39 @@ public class LibMatrixCUDA {
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0);
 		return mb.getKey();
 	}
-
-
-}
+	
+//	// Small 1-int pointers to avoid unnecessary allocation/deallocation
+//	private static Pointer _TMP_NNZ_ROW_PTR = null;
+//	private static Pointer _TMP_NNZ_PTR = null;
+	/**
+	 * Utility to compute number of non-zeroes on the GPU
+	 * 
+	 * @param gCtx the associated GPUContext
+	 * @param densePtr device pointer to the dense matrix
+	 * @param length length of the dense pointer
+	 * @return the number of non-zeroes
+	 */
+	public static synchronized int computeNNZ(GPUContext gCtx, Pointer densePtr, int length) {
+		return (int) reduceAll(gCtx, null, "compute_nnz", densePtr, length);
+		// The cusparsennz-based implementation below is disabled because it is extremely slow
+//		cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+//		cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
+//		if(_TMP_NNZ_ROW_PTR == null) {
+//			// As these are 4-byte pointers, using cudaMalloc directly so as not to include them in memory information.
+//			_TMP_NNZ_ROW_PTR = new Pointer();
+//			cudaMalloc(_TMP_NNZ_ROW_PTR, jcuda.Sizeof.INT);
+//			_TMP_NNZ_PTR = new Pointer();
+//			cudaMalloc(_TMP_NNZ_PTR, jcuda.Sizeof.INT);
+//			// _TMP_NNZ_ROW_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//			// _TMP_NNZ_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//		}
+//		// Output is in dense vector format, convert it to CSR
+//		LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, 1, length, matDescr, densePtr, 1,
+//				_TMP_NNZ_ROW_PTR, _TMP_NNZ_PTR);
+//		int[] nnzC = { -1 };
+//		cudaMemcpy(Pointer.to(nnzC), _TMP_NNZ_PTR, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost);
+//		return nnzC[0];
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 26a4d2e..2bfb8f2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -215,7 +215,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 				Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
 				LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
-				gCtx.cudaFreeHelper(instName, im2colPointer);
+				gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE);
 				
 				// Perform reorg_knpq a reorg operation of matmultOutputPointer matrix with dimensions [K, NPQ]
 				// and return a matrix dstPointer with dimensions [N, KPQ]
@@ -224,7 +224,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 						matmultOutputPointer, dstPointer, NKPQ, NPQ, KPQ, P*Q);
 				if (DMLScript.FINEGRAINED_STATISTICS)
 					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_REORG_KNPQ_KERNEL, System.nanoTime() - t1);
-				gCtx.cudaFreeHelper(instName, matmultOutputPointer);
+				gCtx.cudaFreeHelper(instName, matmultOutputPointer, DMLScript.EAGER_CUDA_FREE);
 			}
 			else {
 				// Filter and output are accounted as dense in the memory estimation for conv2d
@@ -444,7 +444,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
 						}
 						// Deallocate temporary array to hold one element of input
-						gCtx.cudaFreeHelper(tempdwPointer, true);
+						gCtx.cudaFreeHelper(instName, tempdwPointer, true);
 					}
 				}
 			}
@@ -772,7 +772,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			long t4=0;
 			if (DMLScript.FINEGRAINED_STATISTICS) t4 = System.nanoTime();
 			if(!isMaxPoolOutputProvided)
-				gCtx.cudaFreeHelper(instName, y);
+				gCtx.cudaFreeHelper(instName, y, DMLScript.EAGER_CUDA_FREE);
 			if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
 		}
 	}
@@ -818,17 +818,15 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		long N = in.getNumRows();
 		long CHW = in.getNumColumns();
-		MatrixObject output = ec.getMatrixObject(outputName);
-		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
+		Pointer dstData = getDenseOutputPointer(ec, gCtx, instName, outputName, in.getNumRows(), in.getNumColumns());
 		long t0=0;
 		if(N*CHW >= maxNumElementsOfCuDNNTensor) {
 			if(LOG.isTraceEnabled()) {
 				LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
 			}
 			// Invokes relu(double* A,  double* ret, int rlen, int clen)
-			if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
-			Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
 			Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+			if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 			getCudaKernels(gCtx).launchKernel("relu",
 					ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
 					srcData, dstData, toInt(N), toInt(CHW));
@@ -838,11 +836,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 			cudnnCreateTensorDescriptor(tensorDescriptor);
 			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
-			cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
+			cudnnReLU(gCtx, instName, in, dstData, tensorDescriptor);
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 		}
 	}
-
+	
+	private static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
+			long numRows, long numCols) throws DMLRuntimeException {
+		MatrixObject output = ec.getMatrixObject(outputName);
+		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
+		return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+	}
+	
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 * 
@@ -858,6 +863,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		}
 		return getDensePointer(gCtx, image, instName);
 	}
+	
+	/**
+	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+	 * 
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param image input matrix object
+	 * @param instName name of the instruction
+	 * @param numRows expected number of rows
+	 * @param numCols expected number of columns 
+	 * @return jcuda pointer
+	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
+	 */
+	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+		long numElems = image.getNumRows()*image.getNumColumns();
+		if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
+			throw new DMLRuntimeException("Expected input of size:[" +  numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "]."); 
+		}
+		else if(numElems > maxNumElementsOfCuDNNTensor) {
+			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
+		}
+		Pointer ptr = getDensePointer(gCtx, image, instName);
+		long sizeOfPtr = gCtx.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+		if(sizeOfPtr != numElems*sizeOfDataType) {
+			throw new DMLRuntimeException("Incorrect pointer: expected size:" +  (numElems*sizeOfDataType) + ", but found " + sizeOfPtr);
+		}
+		return ptr;
+	}
 
 	/**
 	 * Convenience method for checking the status of CuDNN kernel.
@@ -868,4 +900,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index a50dbc3..432e79e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -97,7 +97,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 			cudnnDestroyConvolutionDescriptor(convDesc);
 		if(sizeInBytes != 0) {
 			try {
-				gCtx.cudaFreeHelper(instName, workSpace);
+				gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
 			} catch (DMLRuntimeException e) {
 				throw new RuntimeException(e);
 			}
@@ -276,4 +276,4 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
 		return convDesc;
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index f52da30..f3ce70d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -76,7 +76,7 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java
 	@Override
 	public void close() {
 		try {
-			gCtx.cudaFreeHelper(outPointer, true);
+			gCtx.cudaFreeHelper(null, outPointer, true);
 		} catch (DMLRuntimeException e) {
 			throw new RuntimeException(e);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index f476dfe..60b2352 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -285,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 					toInt(outRLen), C, toInt(outCLen));
 			if (!DMLScript.EAGER_CUDA_FREE)
 				JCuda.cudaDeviceSynchronize();
-			gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+			gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
 			if (DMLScript.FINEGRAINED_STATISTICS)
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
 						- t0);
@@ -466,4 +466,4 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	private static int reverseCusparseOp(int trans) {
 		return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index 9fec026..5fd642e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -315,7 +315,7 @@ public class LibMatrixNative
 		return ret2;
 	}
 	
-	private static void fromFloatBuffer(FloatBuffer buff, double[] output) {
+	public static void fromFloatBuffer(FloatBuffer buff, double[] output) {
 		Arrays.parallelSetAll(output, i -> (double)buff.get(i) );
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index d5edf48..3bd101c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -22,6 +22,11 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.util.stream.IntStream;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -163,7 +168,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 	
 	@Override
 	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) {
-		long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != null? System.nanoTime() : 0;
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		// We invoke transfer matrix from device to host in two cases:
 		// 1. During eviction of unlocked matrices
 		// 2. During acquireHostRead
@@ -177,40 +182,46 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 			Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
 			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
 			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-			gCtx.cudaFreeHelper(deviceDoubleData);
+			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
 		}
 		else {
 			LOG.debug("Potential OOM: Allocated additional space on host in deviceToHost");
-			float [] floatData = new float[dest.length];
+			FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*dest.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
 			cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
-			for(int i = 0; i < dest.length; i++) {
-				dest[i] = floatData[i];
-			}
+			LibMatrixNative.fromFloatBuffer(floatData, dest);
+		}
+		if(DMLScript.STATISTICS) {
+			long totalTime = System.nanoTime() - t0;
+			GPUStatistics.cudaFloat2DoubleTime.add(totalTime);
+			GPUStatistics.cudaFloat2DoubleCount.add(1);
+			if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, totalTime);
 		}
-		if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
-			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
 	}
 
 	@Override
 	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) {
 		LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
 		// TODO: Perform conversion on GPU using double2float and float2double kernels
-		long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != null? System.nanoTime() : 0;
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		if(PERFORM_CONVERSION_ON_DEVICE) {
 			Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
 			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
 			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
-			gCtx.cudaFreeHelper(deviceDoubleData);
+			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
 		}
 		else {
-			float [] floatData = new float[src.length];
-			for(int i = 0; i < src.length; i++) {
-				floatData[i] = (float) src[i];
-			}
+			FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*src.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
+			IntStream.range(0, src.length).parallel().forEach(i -> floatData.put(i, (float)src[i]));
 			cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
 		}
 		
-		if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
-			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+		if(DMLScript.STATISTICS) {
+			long totalTime = System.nanoTime() - t0;
+			GPUStatistics.cudaDouble2FloatTime.add(totalTime);
+			GPUStatistics.cudaDouble2FloatCount.add(1);
+			if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime);
+		}
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index d12f4dd..f7bee4f 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -52,6 +52,10 @@ public class GPUStatistics {
 	public static LongAdder cudaToDevTime = new LongAdder();             // time spent in copying data from host (CPU) to device (GPU) memory
 	public static LongAdder cudaFromDevTime = new LongAdder();           // time spent in copying data from device to host
 	public static LongAdder cudaEvictTime = new LongAdder();           	 // time spent in eviction
+	public static LongAdder cudaEvictMallocTime = new LongAdder();      // time spent in memory allocation during eviction
+	public static LongAdder cudaFloat2DoubleTime = new LongAdder(); 	// time spent in single-precision device-to-host transfers (float to double), e.g. during eviction or host reads
+	public static LongAdder cudaDouble2FloatTime = new LongAdder(); 	// time spent in single-precision host-to-device transfers (double to float)
+	public static LongAdder cudaEvictMemcpyTime = new LongAdder(); 		// time spent in cudaMemcpy kernel during eviction
 	public static LongAdder cudaForcedClearLazyFreedEvictTime = new LongAdder(); // time spent in forced lazy eviction
 	public static LongAdder cudaForcedClearUnpinnedEvictTime = new LongAdder(); // time spent in forced unpinned eviction
 	public static LongAdder cudaAllocCount = new LongAdder();
@@ -60,6 +64,9 @@ public class GPUStatistics {
 	public static LongAdder cudaToDevCount = new LongAdder();
 	public static LongAdder cudaFromDevCount = new LongAdder();
 	public static LongAdder cudaEvictionCount = new LongAdder();
+	public static LongAdder cudaFloat2DoubleCount = new LongAdder();
+	public static LongAdder cudaDouble2FloatCount = new LongAdder();
+	public static LongAdder cudaEvictionMallocCount = new LongAdder();
 
 	// Per instruction miscellaneous timers.
 	// Used to record events in a CP Heavy Hitter instruction and
@@ -88,6 +95,11 @@ public class GPUStatistics {
 		cudaToDevTime.reset();
 		cudaFromDevTime.reset();
 		cudaEvictTime.reset();
+		cudaEvictMallocTime.reset();
+		cudaFloat2DoubleTime.reset();
+		cudaDouble2FloatTime.reset();
+		cudaFloat2DoubleCount.reset();
+		cudaDouble2FloatCount.reset();
 		cudaForcedClearLazyFreedEvictTime.reset();
 		cudaForcedClearUnpinnedEvictTime.reset();
 		cudaAllocCount.reset();
@@ -95,6 +107,7 @@ public class GPUStatistics {
 		cudaToDevCount.reset();
 		cudaFromDevCount.reset();
 		cudaEvictionCount.reset();
+		cudaEvictionMallocCount.reset();
 		resetMiscTimers();
 	}
 
@@ -193,21 +206,27 @@ public class GPUStatistics {
 		sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
 				+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
 		sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
-		sb.append("GPU mem tx time  (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+		sb.append("GPU mem tx time  (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
 				+ String.format("%.3f", cudaAllocTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaDeAllocTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaEvictTime.longValue()*1e-9) + " sec.\n");
-		sb.append("GPU mem tx count (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+				+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ ")/"
+				+ String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + ")/"
+				+ String.format("%.3f", cudaEvictTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaEvictMallocTime.longValue()*1e-9) + ") sec.\n");
+		sb.append("GPU mem tx count (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
 				+ cudaAllocCount.longValue() + "/"
 				+ cudaDeAllocCount.longValue() + "/"
 				+ cudaMemSet0Count.longValue() + "/"
 				+ cudaSparseConversionCount.longValue() + "/"
-				+ cudaToDevCount.longValue() + "/"
-				+ cudaFromDevCount.longValue() + "/"
-				+ cudaEvictionCount.longValue() + ".\n");
+				+ cudaToDevCount.longValue() + "("
+				+ cudaDouble2FloatCount.longValue() + ")/"
+				+ cudaFromDevCount.longValue() + "("
+				+ cudaFloat2DoubleCount.longValue() + ")/"
+				+ cudaEvictionCount.longValue() + "("
+				+ cudaEvictionMallocCount.longValue() + ").\n");
 		sb.append("GPU conversion time  (sparseConv/sp2dense/dense2sp):\t"
 				+ String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + "/"
@@ -221,4 +240,4 @@ public class GPUStatistics {
 	}
 
 
-}
+}
\ No newline at end of file


[4/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory Manager

Posted by ni...@apache.org.
[SYSTEMML-445] Refactored GPU Memory Manager

- Several bugfixes found during recent experiments with ResNet200.
- Added align_memory eviction policy.
- Added GPU usage documentation.
- Refactored the GPU Memory Manager into distinct components.

Closes #774.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4d321667
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4d321667
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4d321667

Branch: refs/heads/master
Commit: 4d3216678f252f731ac7d7db62111dde6ca063f5
Parents: af9cc8a
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu May 31 12:25:06 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu May 31 12:26:44 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   14 +-
 docs/gpu.md                                     |   94 +
 src/main/cpp/kernels/SystemML.cu                |   99 +-
 src/main/cpp/kernels/SystemML.ptx               | 8587 +++++++++---------
 .../java/org/apache/sysml/api/DMLScript.java    |    4 +-
 .../apache/sysml/api/ScriptExecutorUtils.java   |    3 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |   10 +-
 .../controlprogram/caching/CacheableData.java   |    4 +-
 .../instructions/gpu/context/CSRPointer.java    |   21 +-
 .../instructions/gpu/context/GPUContext.java    |   58 +-
 .../context/GPULazyCudaFreeMemoryManager.java   |  171 +
 .../gpu/context/GPUMatrixMemoryManager.java     |  184 +
 .../gpu/context/GPUMemoryManager.java           |  654 +-
 .../instructions/gpu/context/GPUObject.java     |  322 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  130 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   54 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |    4 +-
 .../data/LibMatrixCuDNNInputRowFetcher.java     |    2 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java |    4 +-
 .../runtime/matrix/data/LibMatrixNative.java    |    2 +-
 .../SinglePrecisionCudaSupportFunctions.java    |   45 +-
 .../org/apache/sysml/utils/GPUStatistics.java   |   37 +-
 22 files changed, 5646 insertions(+), 4857 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index a9c73c8..05d6a1a 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -85,17 +85,23 @@
     <sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs>
     
     <!-- whether to synchronize GPUs after every GPU instruction -->
-    <sysml.gpu.sync.postProcess>true</sysml.gpu.sync.postProcess>
+    <sysml.gpu.sync.postProcess>false</sysml.gpu.sync.postProcess>
     
     <!-- whether to perform eager CUDA free on rmvar instruction -->
     <sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
+    
+    <!-- Developer flag used to debug GPU memory leaks. This has a huge performance overhead and should only be turned on for debugging purposes. -->
+    <sysml.gpu.print.memoryInfo>false</sysml.gpu.print.memoryInfo>
    
     <!-- the floating point precision. supported values are double, single -->
     <sysml.floating.point.precision>double</sysml.floating.point.precision>
     
-    <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict -->
-    <sysml.gpu.eviction.policy>lru</sysml.gpu.eviction.policy>
+    <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict, align_memory -->
+    <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
     
    <!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
-</root>
+   
+   <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
+   <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+</root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/docs/gpu.md
----------------------------------------------------------------------
diff --git a/docs/gpu.md b/docs/gpu.md
new file mode 100644
index 0000000..e9d7bca
--- /dev/null
+++ b/docs/gpu.md
@@ -0,0 +1,94 @@
+---
+layout: global
+title: Using SystemML with GPU
+description: Using SystemML with GPU
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+# User Guide
+
+To use SystemML on GPUs, please ensure that [CUDA 9](https://developer.nvidia.com/cuda-90-download-archive) and
+[CuDNN 7](https://developer.nvidia.com/cudnn) are installed on your system.
+
+## Python users
+
+Please install SystemML using pip:
+- For released version: `pip install systemml`
+- For bleeding edge version: `pip install https://sparktc.ibmcloud.com/repo/latest/systemml-1.2.0-SNAPSHOT-python.tar.gz`
+
+Then you can use the `setGPU(True)` method of [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) and 
+[MLLearn](http://apache.github.io/systemml/beginners-guide-python.html#invoke-systemmls-algorithms) APIs to enable the GPU usage.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True)
+```
+To skip memory-checking and force all GPU-enabled operations on the GPU, please use the `setForceGPU(True)` method after `setGPU(True)` method.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True).setForceGPU(True)
+```
+
+## Command-line users
+
+To enable the GPU backend via command-line, please provide `systemml-1.*-extra.jar` in the classpath and `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu
+``` 
+
+To skip memory-checking and force all GPU-enabled operations on the GPU, please provide `force` option to the `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu force
+``` 
+
+## Scala users
+
+To enable the GPU backend, please provide `systemml-1.*-extra.jar` in the classpath and call
+the `setGPU(true)` method of the [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) API.
+
+```
+spark-shell --jars systemml-1.*-extra.jar,SystemML.jar
+``` 
+
+# Troubleshooting guide
+
+- If you have an older gcc (< 5.0) and you get the `libstdc++.so.6: version CXXABI_1.3.8 not found` error, please upgrade to gcc v5+.
+On CentOS 5, you may have to compile gcc from source:
+
+```
+sudo yum install libmpc-devel mpfr-devel gmp-devel zlib-devel*
+curl ftp://ftp.gnu.org/pub/gnu/gcc/gcc-5.3.0/gcc-5.3.0.tar.bz2 -O
+tar xvfj gcc-5.3.0.tar.bz2
+cd gcc-5.3.0
+./configure --with-system-zlib --disable-multilib --enable-languages=c,c++
+num_cores=`grep -c ^processor /proc/cpuinfo`
+make -j $num_cores
+sudo make install
+```
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 29ae820..55ebeaf 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
 /**********************************
 When updating a kernel or adding a new one,
 please compile the ptx file and commit it:
-nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu
+nvcc -w -ptx -arch=sm_30 --std c++11 SystemML.cu
 ***********************************/
 
 #include <cfloat>
@@ -1961,3 +1961,100 @@ extern "C" __global__ void matrix_sigmoid_f(float *A, float *C,
                                          unsigned int size) {
   matrix_sigmoid(A, C, size);
 }
+
+// Counts the non-zero entries of g_idata: each thread block writes its partial count to g_odata[blockIdx.x]. We can later fold it in our reduce method.
+template <typename T>
+__device__ void compute_nnz(
+    T *g_idata,  ///< input data stored in device memory (of size n)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size n)
+    unsigned int n)  ///< size of the input and temporary/output arrays
+{
+  // shared memory is declared as raw bytes and viewed as T (instead of: extern __shared__ T sdata[];)
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // perform first level of reduction:
+  // each thread counts the non-zeros it reads from global memory; the count is written to shared memory below
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+  unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+  T v = 0;
+
+  // we reduce multiple elements per thread.  The number is determined by the
+  // number of active thread blocks (via gridDim).  More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread
+  while (i < n) {
+    v += g_idata[i] != 0 ? 1 : 0;
+    // ensure we don't read out of bounds
+    if (i + blockDim.x < n) v += g_idata[i + blockDim.x] != 0 ? 1 : 0;
+    i += gridSize;
+  }
+
+  // each thread puts its local count into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem, halving the number of active threads at each step
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = v + sdata[tid + 512];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = v + sdata[tid + 256];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = v + sdata[tid + 128];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = v + sdata[tid + 64];
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = v + smem[tid + 32];
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = v + smem[tid + 16];
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = v + smem[tid + 8];
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = v + smem[tid + 4];
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = v + smem[tid + 2];
+    }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = v + smem[tid + 1];
+    }
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+
+extern "C" __global__ void compute_nnz_d(double *g_idata, double *g_odata, unsigned int n) {
+	compute_nnz(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void compute_nnz_f(float *g_idata, float *g_odata, unsigned int n) {
+	compute_nnz(g_idata, g_odata, n);
+}


[2/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory Manager

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index 07e766a..c737e92 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -112,6 +112,7 @@ public class DMLScript
 		LFU, 				// Evict the least frequently used GPUObject. 
 		MIN_EVICT,
 		MRU, 				// http://www.vldb.org/conf/1985/P127.PDF
+		ALIGN_MEMORY
 		// TODO:
 		// ARC, // https://dbs.uni-leipzig.de/file/ARC.pdf
 		// LOOP_AWARE 		// different policies for operations in for/while/parfor loop vs out-side the loop
@@ -172,7 +173,8 @@ public class DMLScript
 	public static ExplainType       EXPLAIN             = DMLOptions.defaultOptions.explainType; // explain type
 	public static String            DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script
 	public static String            FLOATING_POINT_PRECISION = "double"; 							// data type to use internally
-	public static EvictionPolicy	GPU_EVICTION_POLICY = EvictionPolicy.LRU;						// currently employed GPU eviction policy
+	public static EvictionPolicy	GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY;				// currently employed GPU eviction policy
+	public static boolean 			PRINT_GPU_MEMORY_INFO = false;									// whether to print GPU memory-related information
 
 	/**
 	 * Global variable indicating the script type (DML or PYDML). Can be used

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index d587f48..2d913b6 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -27,6 +27,7 @@ import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.Program;
+import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
@@ -70,7 +71,9 @@ public class ScriptExecutorUtils {
 		// Whether extra statistics useful for developers and others interested
 		// in digging into performance problems are recorded and displayed
 		DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
+		DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
 		DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
+		CacheableData.CACHING_BUFFER_SIZE = dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE);
 		DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
 		DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);		
 		NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim());

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index fb0237f..7279c57 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -81,6 +81,7 @@ public class DMLConfig
 	public static final String CODEGEN_OPTIMIZER    = "sysml.codegen.optimizer"; //see SpoofCompiler.PlanSelector
 	public static final String CODEGEN_PLANCACHE    = "sysml.codegen.plancache"; //boolean
 	public static final String CODEGEN_LITERALS     = "sysml.codegen.literals"; //1..heuristic, 2..always
+	public static final String CACHING_BUFFER_SIZE 	= "sysml.caching.bufferSize"; //double: default:0.15
 	
 	public static final String EXTRA_FINEGRAINED_STATS = "sysml.stats.finegrained"; //boolean
 	public static final String STATS_MAX_WRAP_LEN   = "sysml.stats.maxWrapLength"; //int
@@ -92,7 +93,8 @@ public class DMLConfig
 	// to handle the tradeoff on calling cudaMemGetInfo too often.
 	public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor";
 	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
-
+	public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
+	
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
 	public static final String PREFIX_MAPREDUCE = "mapreduce";
@@ -136,7 +138,9 @@ public class DMLConfig
 		_defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
 		_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
 		_defaultVals.put(AVAILABLE_GPUS,         "-1");
-		_defaultVals.put(GPU_EVICTION_POLICY,    "lru");
+		_defaultVals.put(GPU_EVICTION_POLICY,    "align_memory");
+		_defaultVals.put(SYNCHRONIZE_GPU,        "false" );
+		_defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
 		_defaultVals.put(SYNCHRONIZE_GPU,        "true" );
 		_defaultVals.put(EAGER_CUDA_FREE,        "false" );
 		_defaultVals.put(FLOATING_POINT_PRECISION,        	 "double" );
@@ -420,7 +424,7 @@ public class DMLConfig
 				CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
 				COMPRESSED_LINALG, 
 				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
-				EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
+				EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
 				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY
 		}; 
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 73f6c08..0bb2a67 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -77,7 +77,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	// global constant configuration parameters
 	public static final long    CACHING_THRESHOLD = (long)Math.max(4*1024, //obj not s.t. caching
 		1e-5 * InfrastructureAnalyzer.getLocalMaxMemory());       //if below threshold [in bytes]
-	public static final double  CACHING_BUFFER_SIZE = 0.15; 
+	public static double  CACHING_BUFFER_SIZE = 0.15; 
 	public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO; 
 	public static final boolean CACHING_BUFFER_PAGECACHE = false; 
 	public static final boolean CACHING_WRITE_CACHE_ON_READ = false;	
@@ -607,7 +607,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 		if( _gpuObjects != null )
 			for (GPUObject gObj : _gpuObjects.values())
 				if (gObj != null)
-					gObj.clearData();
+					gObj.clearData(null, DMLScript.EAGER_CUDA_FREE);
 
 		// change object state EMPTY
 		setDirty(false);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index bdc5b21..7eb4033 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -416,10 +416,6 @@ public class CSRPointer {
 		return getGPUContext().allocate(size);
 	}
 
-	private void cudaFreeHelper(Pointer toFree, boolean eager) {
-		getGPUContext().cudaFreeHelper(toFree, eager);
-	}
-
 	private GPUContext getGPUContext() {
 		return gpuContext;
 	}
@@ -494,17 +490,20 @@ public class CSRPointer {
 	 */
 	public void deallocate(boolean eager) {
 		if (nnz > 0) {
-			cudaFreeHelper(val, eager);
-			cudaFreeHelper(rowPtr, eager);
-			cudaFreeHelper(colInd, eager);
-			val = null;
-			rowPtr = null;
-			colInd = null;
+			if (val != null)
+				getGPUContext().cudaFreeHelper(null, val, eager);
+			if (rowPtr != null)
+				getGPUContext().cudaFreeHelper(null, rowPtr, eager);
+			if (colInd != null)
+				getGPUContext().cudaFreeHelper(null, colInd, eager);
 		}
+		val = null;
+		rowPtr = null;
+		colInd = null;
 	}
 
 	@Override
 	public String toString() {
 		return "CSRPointer{" + "nnz=" + nnz + '}';
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 7f8f106..2d3918c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -37,6 +37,7 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.utils.GPUStatistics;
+
 import jcuda.Pointer;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcudnn.cudnnHandle;
@@ -63,6 +64,7 @@ public class GPUContext {
 	 * active device assigned to this GPUContext instance
 	 */
 	private final int deviceNum;
+	
 	/**
 	 * cudnnHandle for Deep Neural Network operations on the GPU
 	 */
@@ -130,9 +132,11 @@ public class GPUContext {
 		}
 	}
 
-	private void initializeCudaLibraryHandles() {
-		deleteCudaLibraryHandles();
-
+	private void initializeCudaLibraryHandles() throws DMLRuntimeException {
+		// We don't need to explicitly delete the handles if we are planning to create them again. 
+		// This has a huge performance impact on scripts that have a large number of layers (i.e. FunctionCallCP), for example ResNet.
+		// If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts. 
+		// deleteCudaLibraryHandles();
 		if (cudnnHandle == null) {
 			cudnnHandle = new cudnnHandle();
 			cudnnCreate(cudnnHandle);
@@ -150,11 +154,6 @@ public class GPUContext {
 			cusparseHandle = new cusparseHandle();
 			cusparseCreate(cusparseHandle);
 		}
-
-		if (cusolverDnHandle == null) {
-			cusolverDnHandle = new cusolverDnHandle();
-			cusolverDnCreate(cusolverDnHandle);
-		}
 		
 		if (kernels == null) {
 			kernels = new JCudaKernels();
@@ -204,36 +203,6 @@ public class GPUContext {
 		return memoryManager.malloc(instructionName, size);
 	}
 
-
-	/**
-	 * Does lazy cudaFree calls.
-	 *
-	 * @param toFree {@link Pointer} instance to be freed
-	 */
-	public void cudaFreeHelper(final Pointer toFree) {
-		cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
-	}
-
-	/**
-	 * Does lazy/eager cudaFree calls.
-	 *
-	 * @param toFree {@link Pointer} instance to be freed
-	 * @param eager  true if to be done eagerly
-	 */
-	public void cudaFreeHelper(final Pointer toFree, boolean eager) {
-		cudaFreeHelper(null, toFree, eager);
-	}
-
-	/**
-	 * Does lazy cudaFree calls.
-	 *
-	 * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
-	 * @param toFree          {@link Pointer} instance to be freed
-	 */
-	public void cudaFreeHelper(String instructionName, final Pointer toFree) {
-		cudaFreeHelper(instructionName, toFree, DMLScript.EAGER_CUDA_FREE);
-	}
-
 	/**
 	 * Does cudaFree calls, lazily.
 	 *
@@ -290,7 +259,7 @@ public class GPUContext {
 	 */
 	public GPUObject createGPUObject(MatrixObject mo) {
 		GPUObject ret = new GPUObject(this, mo);
-		getMemoryManager().addGPUObject(ret);
+		getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
 		return ret;
 	}
 
@@ -376,6 +345,15 @@ public class GPUContext {
 	 * @return cusolverDnHandle for current thread
 	 */
 	public cusolverDnHandle getCusolverDnHandle() {
+		if (cusolverDnHandle == null) {
+			synchronized(this) {
+				if (cusolverDnHandle == null) {
+					// The cusolverDnHandle is rarely used and occupies unnecessary memory, so it is only created on first request. NOTE(review): the null check is repeated under the lock; confirm the field is declared volatile.
+					cusolverDnHandle = new cusolverDnHandle();
+					cusolverDnCreate(cusolverDnHandle);
+				}
+			}
+		}
 		return cusolverDnHandle;
 	}
 
@@ -442,4 +420,4 @@ public class GPUContext {
 	public String toString() {
 		return "GPUContext{" + "deviceNum=" + deviceNum + '}';
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
new file mode 100644
index 0000000..830f7df
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.stream.Collectors;
+import java.util.Optional;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+
+import jcuda.Pointer;
+
+public class GPULazyCudaFreeMemoryManager {
+	protected static final Log LOG = LogFactory.getLog(GPULazyCudaFreeMemoryManager.class.getName());
+	GPUMemoryManager gpuManager;
+	public GPULazyCudaFreeMemoryManager(GPUMemoryManager gpuManager) {
+		this.gpuManager = gpuManager;
+	}
+
+	/**
+	 * Map of free blocks allocated on the GPU: maps size of block in bytes -> set of pointers of that size
+	 */
+	private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>();
+	
+	/**
+	 * Hand out any cached (rmvar-ed) pointer whose size matches exactly, removing it from the cache
+	 * (applicable if eager cudaFree is set to false)
+	 * 
+	 * @param opcode opcode (not used by this method)
+	 * @param size size in bytes
+	 * @return a cached pointer of exactly the given size, or null if there is none
+	 */
+	public Pointer getRmvarPointer(String opcode, long size) {
+		if (!rmvarGPUPointers.containsKey(size)) {
+			return null;
+		}
+		if(LOG.isTraceEnabled())
+			LOG.trace("Getting rmvar-ed pointers for size:" + size);
+		// remove from rmvarGPUPointers as the pointer is reused rather than cudaFree-d
+		Pointer reused = remove(rmvarGPUPointers, size);
+		return reused;
+	}
+	
+	public Set<Pointer> getAllPointers() { // union of every per-size free list, collected into a new set
+		return rmvarGPUPointers.values().stream().flatMap(Set::stream).collect(Collectors.toSet());
+	}
+	
+	public void clearAll() {
+		// Snapshot every cached pointer and empty the cache before freeing anything:
+		// guardedCudaFree is documented as unsafe to call while these collections are iterated.
+		Set<Pointer> snapshot = new HashSet<Pointer>();
+		rmvarGPUPointers.values().forEach(snapshot::addAll);
+		rmvarGPUPointers.clear();
+		for(Pointer cached : snapshot) {
+			gpuManager.guardedCudaFree(cached);
+		}
+	}
+	
+	public Pointer getRmvarPointerMinSize(String opcode, long minSize) throws DMLRuntimeException {
+		Optional<Long> bestFit = rmvarGPUPointers.entrySet().stream()
+				.filter(entry -> entry.getValue().size() > 0 && entry.getKey() >= minSize)
+				.map(entry -> entry.getKey()).min(Long::compare); // smallest cached size that is >= minSize
+		if(!bestFit.isPresent())
+			return null;
+		long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ?  System.nanoTime() : 0;
+		Pointer reused = remove(rmvarGPUPointers, bestFit.get()); // handed out for reuse, so no cudaFree here
+		gpuManager.addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0);
+		return reused;
+	}
+	
+	
+	/**
+	 * Remove an arbitrary pointer (the first one the set iterator yields) of the given size
+	 * 
+	 * @param hm hashmap of size, pointers; must hold a non-empty set for the given size
+	 * @param size size in bytes
+	 * @return the pointer that was removed
+	 */
+	private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) {
+		Pointer picked = hm.get(size).iterator().next();
+		remove(hm, size, picked);
+		return picked;
+	}
+	
+	/**
+	 * Remove a specific pointer in the given hashmap and drop the size entry once its set is empty
+	 * @param hm hashmap of size, pointers; must hold an entry for the given size
+	 * @param size size in bytes
+	 * @param ptr pointer to be removed
+	 */
+	private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer ptr) {
+		Set<Pointer> bucket = hm.get(size);
+		bucket.remove(ptr);
+		if (bucket.isEmpty())
+			hm.remove(size);
+	}
+	
+	/**
+	 * Return the total memory in bytes held by the pointers cached in this memory manager,
+	 * i.e. every block size multiplied by the number of cached pointers of that size
+	 * @return number of bytes
+	 */
+	public long getTotalMemoryAllocated() {
+		long rmvarMemoryAllocated = 0;
+		for(long numBytes : rmvarGPUPointers.keySet())
+			rmvarMemoryAllocated += numBytes * rmvarGPUPointers.get(numBytes).size(); // size x count, not size alone
+		return rmvarMemoryAllocated;
+	}
+	
+	/**
+	 * Get total number of rmvared pointers, counted across all block sizes
+	 * 
+	 * @return number of pointers
+	 */
+	public int getNumPointers() {
+		return rmvarGPUPointers.values().stream().mapToInt(Set::size).sum();
+	}
+	
+	/**
+	 * Add a pointer to the rmvar-ed list
+	 * 
+	 * @param size size of the pointer
+	 * @param toFree pointer
+	 * @throws RuntimeException if the pointer is already cached under the given size
+	 */
+	public void add(long size, Pointer toFree) {
+		// Fetch the free list for this size, creating and registering it on first use
+		Set<Pointer> freeList = rmvarGPUPointers.computeIfAbsent(size, k -> new HashSet<Pointer>());
+		// Set.add reports false when the pointer is already in the free list
+		if (!freeList.add(toFree)) {
+			throw new RuntimeException("GPU : Internal state corrupted, double free");
+		}
+	}
+	
+	/**
+	 * Remove a specific pointer if present in the internal hashmap; otherwise do nothing
+	 * @param size size in bytes
+	 * @param ptr pointer to be removed
+	 */
+	public void removeIfPresent(long size, Pointer ptr) {
+		Set<Pointer> bucket = rmvarGPUPointers.get(size);
+		if(bucket == null || !bucket.contains(ptr))
+			return;
+		bucket.remove(ptr);
+		if (bucket.isEmpty())
+			rmvarGPUPointers.remove(size);
+	}
+	
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
new file mode 100644
index 0000000..066905b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import jcuda.Pointer;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+
+public class GPUMatrixMemoryManager {
+	protected static final Log LOG = LogFactory.getLog(GPUMatrixMemoryManager.class.getName());
+	GPUMemoryManager gpuManager;
+	public GPUMatrixMemoryManager(GPUMemoryManager gpuManager) {
+		this.gpuManager = gpuManager;
+	}
+	
+	/**
+	 * Adds the GPU object to the set of GPU objects tracked by this memory manager
+	 * 
+	 * @param gpuObj the handle to the GPU object
+	 */
+	void addGPUObject(GPUObject gpuObj) {
+		gpuObjects.add(gpuObj);
+	}
+	
+	/**
+	 * Returns worst-case contiguous memory size: the size of the dense pointer if it is set,
+	 * otherwise the largest of the rowPtr, colInd and val allocations of a sparse matrix
+	 * with nnz > 0, and 0 if neither applies.
+	 * 
+	 * @param gpuObj gpu object
+	 * @return memory size in bytes
+	 */
+	long getWorstCaseContiguousMemorySize(GPUObject gpuObj) {
+		if(!gpuObj.isDensePointerNull())
+			return gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes();
+		CSRPointer csr = gpuObj.getJcudaSparseMatrixPtr();
+		if(csr == null || csr.nnz <= 0)
+			return 0;
+		long largest = 0;
+		if(csr.rowPtr != null)
+			largest = Math.max(largest, gpuManager.allPointers.get(csr.rowPtr).getSizeInBytes());
+		if(csr.colInd != null)
+			largest = Math.max(largest, gpuManager.allPointers.get(csr.colInd).getSizeInBytes());
+		if(csr.val != null)
+			largest = Math.max(largest, gpuManager.allPointers.get(csr.val).getSizeInBytes());
+		return largest;
+	}
+	
+	/**
+	 * Get all Pointers in a GPUObject: the dense pointer if it is set and, for a sparse
+	 * matrix, every one of the rowPtr, colInd and val pointers that is non-null
+	 * @param gObj gpu object 
+	 * @return set of pointers
+	 */
+	Set<Pointer> getPointers(GPUObject gObj) {
+		Set<Pointer> ret = new HashSet<>();
+		CSRPointer sparsePtr = gObj.getSparseMatrixCudaPointer();
+		if(!gObj.isDensePointerNull() && sparsePtr != null) {
+			LOG.warn("Matrix allocated in both dense and sparse format");
+		}
+		if(!gObj.isDensePointerNull()) {
+			ret.add(gObj.getDensePointer());
+		}
+		if(sparsePtr != null) {
+			// Independent checks: a non-null rowPtr must not hide colInd and val from the result
+			if(sparsePtr.rowPtr != null)
+				ret.add(sparsePtr.rowPtr);
+			if(sparsePtr.colInd != null)
+				ret.add(sparsePtr.colInd);
+			if(sparsePtr.val != null)
+				ret.add(sparsePtr.val);
+		}
+		return ret;
+	}
+	
+	/**
+	 * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
+	 * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
+	 * If a {@link GPUObject} has more than one lock on it, it cannot be freed
+	 * If it has zero locks on it, it can be freed, but it is preferable to keep it around
+	 * so that an extraneous host to dev transfer can be avoided
+	 */
+	HashSet<GPUObject> gpuObjects = new HashSet<>();
+	
+	/**
+	 * Get the tracked GPUObjects whose locked and dirty states both equal the given flags
+	 * @param locked return locked GPU objects if true, unlocked ones if false
+	 * @param dirty return dirty GPU objects if true, non-dirty ones if false
+	 * @return set of GPU Objects
+	 */
+	Set<GPUObject> getGPUObjects(boolean locked, boolean dirty) {
+		return gpuObjects.stream().filter(gObj -> locked == gObj.isLocked()).filter(gObj -> dirty == gObj.isDirty()).collect(Collectors.toSet());
+	}
+	
+	/**
+	 * Return the pointers of every tracked GPUObject, regardless of its locked/dirty state
+	 * @return all pointers in this section
+	 */
+	Set<Pointer> getPointers() {
+		return gpuObjects.stream().map(gObj -> getPointers(gObj)).flatMap(Set::stream).collect(Collectors.toSet());
+	}
+	
+	/**
+	 * Get the pointers of the tracked GPUObjects whose locked and dirty states equal the given flags
+	 * @param locked return pointers of locked objects if true, of unlocked ones if false
+	 * @param dirty return pointers of dirty objects if true, of non-dirty ones if false
+	 * @return set of pointers
+	 */
+	Set<Pointer> getPointers(boolean locked, boolean dirty) {
+		return getGPUObjects(locked, dirty).stream().flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet());
+	}
+	
+	/**
+	 * Clear the memory of one gpu object that matches the provided parameters and stop tracking it
+	 * 
+	 * @param locked is locked
+	 * @param dirty is dirty
+	 * @param minSize minimum worst-case contiguous memory size in bytes
+	 * @param comparator among several matches, the greatest one under this comparator is cleared
+	 * @param opcode instruction code
+	 * @return true if a gpu object satisfies the above condition else false
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	boolean clear(boolean locked, boolean dirty, long minSize, Comparator<GPUObject> comparator, String opcode) throws DMLRuntimeException {
+		Optional<GPUObject> candidate = getGPUObjects(locked, dirty).stream()
+				.filter(gObj -> getWorstCaseContiguousMemorySize(gObj) >= minSize)
+				.max(comparator);
+		if(!candidate.isPresent())
+			return false;
+		GPUObject victim = candidate.get();
+		if(victim.dirty)
+			victim.copyFromDeviceToHost(opcode, true, true); // Perform eviction if dirty
+		else
+			victim.clearData(opcode, true);
+		gpuObjects.remove(victim);
+		return true;
+	}
+	
+	/**
+	 * Clear all unlocked gpu objects (dirty ones are copied from device to host) and stop tracking them
+	 * 
+	 * @param opcode instruction code
+	 * @throws DMLRuntimeException if error
+	 */
+	void clearAllUnlocked(String opcode) throws DMLRuntimeException {
+		Set<GPUObject> unlocked = gpuObjects.stream()
+				.filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toSet());
+		if(unlocked.isEmpty())
+			return;
+		if(LOG.isWarnEnabled())
+			LOG.warn("Clearing all unlocked matrices (count=" + unlocked.size() + ").");
+		for(GPUObject gObj : unlocked) {
+			if(gObj.dirty)
+				gObj.copyFromDeviceToHost(opcode, true, true);
+			else
+				gObj.clearData(opcode, true);
+		}
+		gpuObjects.removeAll(unlocked);
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e2012a7..411606d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -23,13 +23,14 @@ import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemGetInfo;
 import static jcuda.runtime.JCuda.cudaMemset;
 
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.LongAdder;
+import java.util.stream.Collectors;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -42,7 +43,6 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-
 /**
  * - All cudaFree and cudaMalloc in SystemML should go through this class to avoid OOM or incorrect results.
  * - This class can be refactored in future to accept a chunk of memory ahead of time rather than while execution. This will only thow memory-related errors during startup.  
@@ -50,67 +50,87 @@ import jcuda.Pointer;
 public class GPUMemoryManager {
 	protected static final Log LOG = LogFactory.getLog(GPUMemoryManager.class.getName());
 	
-	// If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory.
-	// This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time.
-	private static final double WARN_UTILIZATION_FACTOR = 0.7;
+	/*****************************************************************************************/
+	// GPU Memory is divided into three major sections:
+	// 1. Matrix Memory: Memory allocated to matrices in SystemML and addressable by GPUObjects.
+	// This memory section is divided into three minor sections:
+	// 1.1 Locked Matrix Memory
+	// 1.2 UnLocked + Non-Dirty Matrix Memory
+	// 1.3 UnLocked + Dirty Matrix Memory
+	// To get the GPUObjects/Pointers in this section, please use getGPUObjects and getPointers methods of GPUMatrixMemoryManager.
+	// To clear GPUObjects/Pointers in this section, please use clear and clearAllUnlocked methods of GPUMatrixMemoryManager.
+	// Both these methods allow to get/clear unlocked/locked and dirty/non-dirty objects of a certain size.
+	GPUMatrixMemoryManager matrixMemoryManager;
+	public GPUMatrixMemoryManager getGPUMatrixMemoryManager() {
+		return matrixMemoryManager;
+	}
 	
-	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
-	public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig()
-			.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+	// 2. Rmvar-ed pointers: If sysml.gpu.eager.cudaFree is set to false,
+	// then this manager caches pointers of the GPUObject on which rmvar instruction has been executed for future reuse.
+	// We observe 2-3x improvement with this approach and hence recommend to set this flag to false.
+	GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager;
+	public GPULazyCudaFreeMemoryManager getGPULazyCudaFreeMemoryManager() {
+		return lazyCudaFreeMemoryManager;
+	}
 	
-	/**
-	 * Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU
-	 */
-	private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>();
+	// 3. Non-matrix locked pointers: Other pointers (required for execution of an instruction that are not memory). For example: workspace
+	// These pointers are not explicitly tracked by a memory manager but one can get them by using getNonMatrixLockedPointers
+	private Set<Pointer> getNonMatrixLockedPointers() {
+		Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
+		managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
+		return nonIn(allPointers.keySet(), managedPointers);
+	}
 	
-	/**
-	 * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
-	 * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
-	 * If a {@link GPUObject} has more than one lock on it, it cannot be freed
-	 * If it has zero locks on it, it can be freed, but it is preferrable to keep it around
-	 * so that an extraneous host to dev transfer can be avoided
-	 */
-	private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
 	
 	/**
-	 * To record size of allocated blocks
+	 * To record size of all allocated pointers allocated by above memory managers
 	 */
-	private HashMap<Pointer, Long> allocatedGPUPointers = new HashMap<>();
-	
-	/**
-	 * Adds the GPU object to the memory manager
-	 * 
-	 * @param gpuObj the handle to the GPU object
-	 */
-	public void addGPUObject(GPUObject gpuObj) {
-		allocatedGPUObjects.add(gpuObj);
-	}
-	
-	/**
-	 * Removes the GPU object from the memory manager
-	 * 
-	 * @param gpuObj the handle to the GPU object
-	 */
-	public void removeGPUObject(GPUObject gpuObj) {
-		if(LOG.isDebugEnabled())
-			LOG.debug("Removing the GPU object: " + gpuObj);
-		allocatedGPUObjects.removeIf(a -> a.equals(gpuObj));
-	}
+	HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();
 	
+	/*****************************************************************************************/
 	
+
 	/**
 	 * Get size of allocated GPU Pointer
 	 * @param ptr pointer to get size of
 	 * @return either the size or -1 if no such pointer exists
 	 */
 	public long getSizeAllocatedGPUPointer(Pointer ptr) {
-		if(allocatedGPUPointers.containsKey(ptr)) {
-			return allocatedGPUPointers.get(ptr);
+		if(allPointers.containsKey(ptr)) {
+			return allPointers.get(ptr).getSizeInBytes();
 		}
 		return -1;
 	}
 	
+	/**
+	 * Utility to debug memory leaks: records the size of a pointer and, if
+	 * DMLScript.PRINT_GPU_MEMORY_INFO is set, the stack trace of the thread that
+	 * constructed this record
+	 */
+	static class PointerInfo {
+		private final long sizeInBytes;
+		private final StackTraceElement[] stackTraceElements; // null unless PRINT_GPU_MEMORY_INFO was set
+		public PointerInfo(long sizeInBytes) {
+			this.stackTraceElements = DMLScript.PRINT_GPU_MEMORY_INFO ? Thread.currentThread().getStackTrace() : null;
+			this.sizeInBytes = sizeInBytes;
+		}
+		public long getSizeInBytes() {
+			return sizeInBytes;
+		}
+	}
+	
+	// If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory.
+	// This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time.
+	private static final double WARN_UTILIZATION_FACTOR = 0.7;
+	
+	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
+	public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig()
+			.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+	
+	
 	public GPUMemoryManager(GPUContext gpuCtx) {
+		matrixMemoryManager = new GPUMatrixMemoryManager(this);
+		lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this);
 		long free[] = { 0 };
 		long total[] = { 0 };
 		cudaMemGetInfo(free, total);
@@ -135,19 +155,48 @@ public class GPUMemoryManager {
 	 * @param size size in bytes
 	 * @return allocated pointer
 	 */
-	private Pointer cudaMallocWarnIfFails(Pointer A, long size) {
+	private Pointer cudaMallocNoWarn(Pointer A, long size) {
 		try {
 			cudaMalloc(A, size);
-			allocatedGPUPointers.put(A, size);
+			allPointers.put(A, new PointerInfo(size));
 			return A;
 		} catch(jcuda.CudaException e) {
-			LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " + size + " is available. "
-					+ "This usually happens if there are external programs trying to grab on to memory in parallel.");
 			return null;
 		}
 	}
 	
 	/**
+	 * Pretty printing utility to debug OOM error
+	 * @param stackTrace stack trace, or null if none was recorded
+	 * @param index call depth
+	 * @return "->" followed by class.method(file:line) of the frame at index, or just "->" if there is no such frame
+	 */
+	private String getCallerInfo(StackTraceElement [] stackTrace, int index) {
+		// PointerInfo leaves its stack trace null unless PRINT_GPU_MEMORY_INFO is set
+		if(stackTrace == null || stackTrace.length <= index)
+			return "->";
+		StackTraceElement frame = stackTrace[index];
+		return "->" + frame.getClassName() + "." + frame.getMethodName() + "(" + frame.getFileName() + ":" + frame.getLineNumber() + ")";
+	}
+	
+	/**
+	 * Pretty printing utility to print bytes: values below 1024 are printed as "N bytes",
+	 * larger ones with three decimals in the largest binary unit (KB ... EB) that fits
+	 * 
+	 * @param numBytes number of bytes
+	 * @return a human-readable display value
+	 */
+	private String byteCountToDisplaySize(long numBytes) {
+		// return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding
+		if (numBytes < 1024) {
+			return numBytes + " bytes";
+		}
+		// floor(log2(numBytes)) / 10 is the exact power of 1024; a long reaches at most 1024^6, hence the 'E'
+		int exp = (63 - Long.numberOfLeadingZeros(numBytes)) / 10;
+		return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTPE".charAt(exp-1));
+	}
+	
+	/**
 	 * Allocate pointer of the given size in bytes.
 	 * 
 	 * @param opcode instruction name
@@ -156,20 +205,26 @@ public class GPUMemoryManager {
 	 */
 	public Pointer malloc(String opcode, long size) {
 		if(size < 0) {
-			throw new DMLRuntimeException("Cannot allocate memory of size " + size);
+			throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
 		}
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+		long mallocStart = 0;
 		// Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory
-		Pointer A = getRmvarPointer(opcode, size);
+		Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);
+		if(A != null)
+			addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0);
 		
+		Pointer tmpA = (A == null) ? new Pointer() : null;
 		// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
 		if(A == null && size <= getAvailableMemory()) {
-			A = cudaMallocWarnIfFails(new Pointer(), size);
+			mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+			A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+			addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
 			if(LOG.isTraceEnabled()) {
 				if(A == null)
-					LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + size);
+					LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + byteCountToDisplaySize(size));
 				else
-					LOG.trace("Allocated a new pointer in the GPU memory:" + size);
+					LOG.trace("Allocated a new pointer in the GPU memory:" + byteCountToDisplaySize(size));
 			}
 		}
 		
@@ -178,21 +233,19 @@ public class GPUMemoryManager {
 		
 		// Step 3: Try reusing non-exact match entry of rmvarGPUPointers
 		if(A == null) { 
-			// Find minimum key that is greater than size
-			long key = Long.MAX_VALUE;
-			for(Long k : rmvarGPUPointers.keySet()) {
-				key = k > size ? Math.min(key, k) : key;
-			}
-			if(key != Long.MAX_VALUE) {
-				A = getRmvarPointer(opcode, key);
-				// To avoid potential for holes in the GPU memory
+			A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
+			if(A != null) {
 				guardedCudaFree(A);
-				A = cudaMallocWarnIfFails(new Pointer(), size);
-				if(LOG.isTraceEnabled()) {
+				mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+				A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+				addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+				if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
 					if(A == null)
-						LOG.trace("Couldnot reuse non-exact match of rmvarGPUPointers:" + size);
-					else
-						LOG.trace("Reuses a non-exact match from rmvarGPUPointers:" + size);
+						LOG.info("Couldnot reuse non-exact match of rmvarGPUPointers:" + byteCountToDisplaySize(size));
+					else {
+						LOG.info("Reuses a non-exact match from rmvarGPUPointers:" + byteCountToDisplaySize(size));
+						LOG.info("GPU Memory info after reusing a non-exact match from rmvarGPUPointers:" + toString());
+					}
 				}
 			}
 		}
@@ -205,20 +258,16 @@ public class GPUMemoryManager {
 		
 		// Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
 		if(A == null) {
-			Set<Pointer> toFree = new HashSet<Pointer>();
-			for(Set<Pointer> ptrs : rmvarGPUPointers.values()) {
-				toFree.addAll(ptrs);
-			}
-			for(Pointer ptr : toFree) {
-				guardedCudaFree(ptr);
-			}
+			lazyCudaFreeMemoryManager.clearAll();
 			if(size <= getAvailableMemory()) {
-				A = cudaMallocWarnIfFails(new Pointer(), size);
-				if(LOG.isTraceEnabled()) {
+				A = cudaMallocNoWarn(tmpA, size);
+				if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
 					if(A == null)
-						LOG.trace("Couldnot allocate a new pointer in the GPU memory after eager free:" + size);
-					else
-						LOG.trace("Allocated a new pointer in the GPU memory after eager free:" + size);
+						LOG.info("Couldnot allocate a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size));
+					else {
+						LOG.info("Allocated a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size));
+						LOG.info("GPU Memory info after allocating new pointer post lazyCudaFreeMemoryManager.clearAll():" + toString());
+					}
 				}
 			}
 		}
@@ -228,36 +277,114 @@ public class GPUMemoryManager {
 		// Step 5: Try eviction based on the given policy
 		if(A == null) {
 			t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-			// Sort based on the eviction policy
-			Collections.sort(allocatedGPUObjects, new GPUComparator(size));
-			while (size > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
-				GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-				if (toBeRemoved.isLocked()) {
-					// All remaining blocks will also be locked
-					break;
-				}
-				else {
-					// Perform eviction
-					if (toBeRemoved.dirty) {
-						toBeRemoved.copyFromDeviceToHost(opcode, true);
-					}
-					toBeRemoved.clearData(true);
+			
+			// First, clear unlocked non-dirty matrices greater than or equal to size
+			// Comparator clears the largest matrix to avoid future evictions
+			boolean success = matrixMemoryManager.clear(false, false, size, SIMPLE_COMPARATOR_SORT_BY_SIZE, opcode);
+			if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+				if(success) {
+					LOG.info("Cleared an unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size));
+					LOG.info("GPU Memory info after clearing an unlocked non-dirty matrix:" + toString());
 				}
+				else
+					LOG.info("No unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for clearing.");
 			}
-			addMiscTime(opcode, GPUStatistics.cudaEvictionCount, GPUStatistics.cudaEvictTime, GPUInstruction.MISC_TIMER_EVICT, t0);
-			if(size <= getAvailableMemory()) {
-				A = cudaMallocWarnIfFails(new Pointer(), size);
-				if(LOG.isTraceEnabled()) {
-					if(A == null)
-						LOG.trace("Couldnot allocate a new pointer in the GPU memory after eviction:" + size);
+			if(!success) {
+				// First, clear unlocked dirty matrices greater than or equal to size using the eviction policy
+				// Comparator clears the largest matrix to avoid future evictions
+				if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+					LOG.info("GPU Memory info before eviction:" + toString());
+				}
+				success = matrixMemoryManager.clear(false, true, size, new EvictionPolicyBasedComparator(size), opcode);
+				// JCuda.cudaDeviceSynchronize();
+				if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+					if(success) {
+						LOG.info("Evicted an unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size));
+						LOG.info("GPU Memory info after evicting an unlocked dirty matrix:" + toString());
+					}
 					else
-						LOG.trace("Allocated a new pointer in the GPU memory after eviction:" + size);
+						LOG.info("No unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for evicted.");
 				}
+				
+				if(!success) {
+					// Minor optimization: clear all unlocked non-dirty matrices before attempting eviction 
+					// Delete all non-dirty
+					List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
+												.filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isDirty()).collect(Collectors.toList());
+					matrixMemoryManager.gpuObjects.removeAll(unlockedGPUObjects);
+					for(GPUObject toBeRemoved : unlockedGPUObjects) {
+						toBeRemoved.clearData(opcode, true);
+					}
+					if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+						LOG.info("GPU Memory info after clearing all unlocked non-dirty matrices:" + toString());
+					}
+					mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+					A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+					addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+				
+					// ---------------------------------------------------------------
+					// Evict unlocked GPU objects one-by-one and try malloc
+					unlockedGPUObjects = null;
+					if(A == null) {
+						unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
+								.filter(gpuObj -> !gpuObj.isLocked() && gpuObj.isDirty()).collect(Collectors.toList());
+						Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
+						while(A == null && unlockedGPUObjects.size() > 0) {
+							if(DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) {
+								// TODO: Optimize later using sliding window
+								// Evict as many sequential dense objects from back of the queue as possible
+								long neededSize = size;
+								while(neededSize >= 0 && unlockedGPUObjects.size() > 0) {
+									GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+									neededSize -= matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj);
+									gpuObj.copyFromDeviceToHost(opcode, true, true);
+								}
+							}
+							else {
+								GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+								gpuObj.copyFromDeviceToHost(opcode, true, true);
+							}
+							mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+							A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+							addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+						}
+						if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+							// greater than or equal to " + byteCountToDisplaySize(size)
+							LOG.info("GPU Memory info after eviction:" + toString());
+						}
+						if(unlockedGPUObjects != null && unlockedGPUObjects.size() == 0) {
+							LOG.warn("Evicted all unlocked matrices");
+						}
+					}
+					
+				}
+				// ---------------------------------------------------------------
+			}
+			addMiscTime(opcode, GPUStatistics.cudaEvictTime, GPUStatistics.cudaEvictionCount, GPUInstruction.MISC_TIMER_EVICT, t0);
+			if(A == null) {
+				A = cudaMallocNoWarn(tmpA, size); // if the matrix is not allocated via eviction
 			}
+			if(A == null) {
+				LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " 
+						+ byteCountToDisplaySize(size) + " is available. "
+						+ "This usually happens if there are external programs trying to grab on to memory in parallel or there is potential fragmentation.");
+			}
+			else if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+				LOG.info("Malloc after eviction/clearing is successful.");
+			}
+		}
+		
+		// Step 6: Handle defragmentation
+		if(A == null) {
+			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
+			LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
+			matrixMemoryManager.clearAllUnlocked(opcode);
+			LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
+			A = cudaMallocNoWarn(tmpA, size);
 		}
 		
 		if(A == null) {
-			throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + size + "). "
+			throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
 					+ toString());
 		}
 		
@@ -267,26 +394,50 @@ public class GPUMemoryManager {
 		return A;
 	}
 	
+	// --------------- Developer Utilities to debug potential memory leaks ------------------------
+	@SuppressWarnings("unused")
+	private void printPointers(List<PointerInfo> pointers) {
+		// Debug aid: prints one ">>"-prefixed line per pointer with the caller info of stack frames 8, 9 and 10
+		pointers.forEach(info -> System.out.println(">>"
+				+ getCallerInfo(info.stackTraceElements, 8)
+				+ getCallerInfo(info.stackTraceElements, 9)
+				+ getCallerInfo(info.stackTraceElements, 10)));
+	}
+	
+	@SuppressWarnings("unused")
+	private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
+		// Debug aid: appends one ">>"-prefixed line per pointer with the caller info of stack frames 8, 9 and 10
+		for(Pointer key : pointers) {
+			PointerInfo info = allPointers.get(key);
+			sb.append(">>")
+				.append(getCallerInfo(info.stackTraceElements, 8))
+				.append(getCallerInfo(info.stackTraceElements, 9))
+				.append(getCallerInfo(info.stackTraceElements, 10))
+				.append("\n");
+		}
+	}
+	// --------------------------------------------------------------------------------------------
+
 	/**
 	 * Note: This method should not be called from an iterator as it removes entries from allocatedGPUPointers and rmvarGPUPointers
 	 * 
 	 * @param toFree pointer to call cudaFree method on
 	 */
-	private void guardedCudaFree(Pointer toFree) {
-		if (toFree != new Pointer()) {
-			if(allocatedGPUPointers.containsKey(toFree)) {
-				Long size = allocatedGPUPointers.remove(toFree);
-				if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(toFree)) {
-					remove(rmvarGPUPointers, size, toFree);
-				}
-				if(LOG.isDebugEnabled())
-					LOG.debug("Free-ing up the pointer: " + toFree);
-				cudaFree(toFree);
-			}
-			else {
-				throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
+	void guardedCudaFree(Pointer toFree) {
+		if(allPointers.containsKey(toFree)) {
+			long size = allPointers.get(toFree).getSizeInBytes();
+			if(LOG.isTraceEnabled()) {
+				LOG.trace("Free-ing up the pointer of size " +  byteCountToDisplaySize(size));
 			}
+			allPointers.remove(toFree);
+			lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
+			cudaFree(toFree);
+			// JCuda.cudaDeviceSynchronize(); // Force a device synchronize after free-ing the pointer for debugging
 		}
+		else {
+			throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
+		}
+
 	}
 	
 	/**
@@ -295,88 +446,62 @@ public class GPUMemoryManager {
 	 * @param opcode instruction name
 	 * @param toFree pointer to free
 	 * @param eager whether to deallocate eagerly
+	 * @throws DMLRuntimeException if error occurs
 	 */
-	public void free(String opcode, Pointer toFree, boolean eager) {
-		Pointer dummy = new Pointer();
-		if (toFree == dummy) { // trying to free a null pointer
-			return;
-		}
+	public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled())
+			LOG.trace("Free-ing the pointer with eager=" + eager);
 		if (eager) {
 			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 			guardedCudaFree(toFree);
 			addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
 		}
 		else {
-			if (!allocatedGPUPointers.containsKey(toFree))
+			if (!allPointers.containsKey(toFree)) {
+				LOG.info("GPU memory info before failure:" + toString());
 				throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up");
-			long size = allocatedGPUPointers.get(toFree);
-			Set<Pointer> freeList = rmvarGPUPointers.get(size);
-			if (freeList == null) {
-				freeList = new HashSet<Pointer>();
-				rmvarGPUPointers.put(size, freeList);
 			}
-			if (freeList.contains(toFree))
-				throw new RuntimeException("GPU : Internal state corrupted, double free");
-			freeList.add(toFree);
+			long size = allPointers.get(toFree).getSizeInBytes();
+			lazyCudaFreeMemoryManager.add(size, toFree);
 		}
 	}
 	
 	/**
+	 * Removes the GPU object from the memory manager
+	 * 
+	 * @param gpuObj the handle to the GPU object
+	 */
+	public void removeGPUObject(GPUObject gpuObj) {
+		// Guarded so that the message string is only built when debug logging is enabled
+		if(LOG.isDebugEnabled()) { LOG.debug("Removing the GPU object: " + gpuObj); }
+		matrixMemoryManager.gpuObjects.removeIf(tracked -> tracked.equals(gpuObj)); // drops every entry equal to gpuObj
+	}
+
+	
+	/**
 	 * Clear the allocated GPU objects
 	 */
 	public void clearMemory() {
 		// First deallocate all the GPU objects
-		for(GPUObject gpuObj : allocatedGPUObjects) {
+		for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) {
 			if(gpuObj.isDirty()) {
-				LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host.");
-				gpuObj.acquireHostRead(null);
+				if(LOG.isDebugEnabled())
+					LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host.");
+				gpuObj.copyFromDeviceToHost(null, true, true);
 			}
-			gpuObj.clearData(true);
+			else
+				gpuObj.clearData(null, true);
 		}
-		allocatedGPUObjects.clear();
+		matrixMemoryManager.gpuObjects.clear();
 		
 		// Then clean up remaining allocated GPU pointers 
-		Set<Pointer> remainingPtr = new HashSet<>(allocatedGPUPointers.keySet());
+		Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet());
 		for(Pointer toFree : remainingPtr) {
 			guardedCudaFree(toFree); // cleans up allocatedGPUPointers and rmvarGPUPointers as well
 		}
+		allPointers.clear();
 	}
-	
-	/**
-	 * Get all pointers withing allocatedGPUObjects such that GPUObject is in dirty state
-	 * 
-	 * @return set of pointers
-	 */
-	private HashSet<Pointer> getDirtyPointers() {
-		HashSet<Pointer> nonTemporaryPointers = new HashSet<Pointer>();
-		for (GPUObject o : allocatedGPUObjects) {
-			if(o.isDirty()) {
-				if (o.isSparse()) {
-					CSRPointer p = o.getSparseMatrixCudaPointer();
-					if (p == null)
-						throw new RuntimeException("CSRPointer is null in clearTemporaryMemory");
-					if (p.rowPtr != null) {
-						nonTemporaryPointers.add(p.rowPtr);
-					}
-					if (p.colInd != null) {
-						nonTemporaryPointers.add(p.colInd);
-					}
-					if (p.val != null) {
-						nonTemporaryPointers.add(p.val);
-					}
-
-				} else {
-					Pointer p = o.getJcudaDenseMatrixPtr();
-					if (p == null)
-						throw new RuntimeException("Pointer is null in clearTemporaryMemory");
-					nonTemporaryPointers.add(p);
-				}
-			}
-		}
 		
-		return nonTemporaryPointers;
-	}
-	
 	/**
 	 * Performs a non-in operation
 	 * 
@@ -399,7 +524,8 @@ public class GPUMemoryManager {
 	 */
 	public void clearTemporaryMemory() {
 		// To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
-		Set<Pointer> temporaryPointers = nonIn(allocatedGPUPointers.keySet(), getDirtyPointers());
+		Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true);
+		Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers);
 		for(Pointer tmpPtr : temporaryPointers) {
 			guardedCudaFree(tmpPtr);
 		}
@@ -431,86 +557,84 @@ public class GPUMemoryManager {
 	 * @param instructionLevelTimer member of GPUInstruction
 	 * @param startTime start time
 	 */
-	private void addMiscTime(String opcode, String instructionLevelTimer, long startTime) {
+	void addMiscTime(String opcode, String instructionLevelTimer, long startTime) {
 		if (opcode != null && DMLScript.FINEGRAINED_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(opcode, instructionLevelTimer, System.nanoTime() - startTime);
 	}
 	
-	/**
-	 * Get any pointer of the given size from rmvar-ed pointers (applicable if eager cudaFree is set to false)
-	 * 
-	 * @param opcode opcode
-	 * @param size size in bytes
-	 * @return pointer
-	 */
-	private Pointer getRmvarPointer(String opcode, long size) {
-		if (rmvarGPUPointers.containsKey(size)) {
-			if(LOG.isTraceEnabled())
-				LOG.trace("Getting rmvar-ed pointers for size:" + size);
-			long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ?  System.nanoTime() : 0;
-			Pointer A = remove(rmvarGPUPointers, size); // remove from rmvarGPUPointers as you are not calling cudaFree
-			addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0);
-			return A;
-		}
-		else {
-			return null;
-		}
-	}
-	
-	/**
-	 * Remove any pointer in the given hashmap
-	 * 
-	 * @param hm hashmap of size, pointers
-	 * @param size size in bytes
-	 * @return the pointer that was removed
-	 */
-	private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) {
-		Pointer A = hm.get(size).iterator().next();
-		remove(hm, size, A);
-		return A;
-	}
-	
-	/**
-	 * Remove a specific pointer in the given hashmap
-	 * 
-	 * @param hm hashmap of size, pointers
-	 * @param size size in bytes
-	 * @param ptr pointer to be removed
-	 */
-	private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer ptr) {
-		hm.get(size).remove(ptr);
-		if (hm.get(size).isEmpty())
-			hm.remove(size);
-	}
-	
 	
 	/**
 	 * Print debugging information
 	 */
 	public String toString() {
-		long sizeOfLockedGPUObjects = 0; long numLockedGPUObjects = 0;
-		long sizeOfUnlockedGPUObjects = 0; long numUnlockedGPUObjects = 0;
-		for(GPUObject gpuObj : allocatedGPUObjects) {
-			try {
-				if(gpuObj.isLocked()) {
-					numLockedGPUObjects++;
-					sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice();
+		long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; int numLockedPointers = 0;
+		long sizeOfUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0;
+		long sizeOfUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyPointers = 0;
+		for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) {
+			if(gpuObj.isLocked()) {
+				numLockedGPUObjects++;
+				sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice();
+				numLockedPointers += matrixMemoryManager.getPointers(gpuObj).size();
+			}
+			else {
+				if(gpuObj.isDirty()) {
+					numUnlockedDirtyGPUObjects++;
+					sizeOfUnlockedDirtyGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size();
 				}
 				else {
-					numUnlockedGPUObjects++;
-					sizeOfUnlockedGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedNonDirtyGPUObjects++;
+					sizeOfUnlockedNonDirtyGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedNonDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size();
 				}
-			} catch (DMLRuntimeException e) {
-				throw new RuntimeException(e);
 			}
 		}
+		
+		
 		long totalMemoryAllocated = 0;
-		for(Long numBytes : allocatedGPUPointers.values()) {
-			totalMemoryAllocated += numBytes;
+		for(PointerInfo ptrInfo : allPointers.values()) {
+			totalMemoryAllocated += ptrInfo.getSizeInBytes();
+		}
+		
+		
+		Set<Pointer> potentiallyLeakyPointers = getNonMatrixLockedPointers();
+		List<Long> sizePotentiallyLeakyPointers = potentiallyLeakyPointers.stream().
+				map(ptr -> allPointers.get(ptr).sizeInBytes).collect(Collectors.toList());
+		long totalSizePotentiallyLeakyPointers = 0;
+		for(long size : sizePotentiallyLeakyPointers) {
+			totalSizePotentiallyLeakyPointers += size;
 		}
-		return "Num of GPU objects: [unlocked:" + numUnlockedGPUObjects + ", locked:" + numLockedGPUObjects + "]. "
-				+ "Size of GPU objects in bytes: [unlocked:" + sizeOfUnlockedGPUObjects + ", locked:" + sizeOfLockedGPUObjects + "]. "
-				+ "Total memory allocated by the current GPU context in bytes:" + totalMemoryAllocated;
+		StringBuilder ret = new StringBuilder();
+		//if(DMLScript.PRINT_GPU_MEMORY_INFO) {
+		//	if(potentiallyLeakyPointers.size() > 0) {
+		//		ret.append("Non-matrix pointers were allocated by:\n");
+		//		printPointers(potentiallyLeakyPointers, ret);
+		//	}
+		//}
+		ret.append("\n====================================================\n");
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", 
+				"Num Objects", "Num Pointers", "Size"));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", 
+				numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects", 
+				numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", 
+				numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers", 
+				"-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers", 
+				"-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers", 
+				"-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated)));
+		long free[] = { 0 };
+		long total[] = { 0 };
+		cudaMemGetInfo(free, total);
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)", 
+				"-", "-", byteCountToDisplaySize(free[0])));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)", 
+				"-", "-", byteCountToDisplaySize(total[0])));
+		ret.append("====================================================\n");
+		return ret.toString();
 	}
 	
 	/**
@@ -525,14 +649,39 @@ public class GPUMemoryManager {
 		return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
 	}
 	
+	private static Comparator<GPUObject> SIMPLE_COMPARATOR_SORT_BY_SIZE = (o1, o2) -> Long.compare(o1.getSizeOnDevice(), o2.getSizeOnDevice()); // ascending by size on device; yields 0 for equal sizes as the Comparator contract requires
+	
+	private static class CustomPointer extends Pointer { // wrapper used by the ALIGN_MEMORY eviction policy to read a pointer's native address
+		public CustomPointer(Pointer p) {
+			super(p);
+		}
+		// NOTE(review): presumably overridden only to make the inherited accessor callable from this file; confirm against jcuda's Pointer
+		@Override
+		public long getNativePointer() {
+			return super.getNativePointer();
+		}
+		
+	}
 	/**
 	 * Class that governs the eviction policy
 	 */
-	public static class GPUComparator implements Comparator<GPUObject> {
+	public static class EvictionPolicyBasedComparator implements Comparator<GPUObject> {
 		private long neededSize;
-		public GPUComparator(long neededSize) {
+		public EvictionPolicyBasedComparator(long neededSize) {
 			this.neededSize = neededSize;
 		}
+		
+		private int minEvictCompare(GPUObject p1, GPUObject p2) {
+			// Surplus (>= 0) or deficit (< 0) of each object's device size relative to the requested size
+			final long surplus1 = p1.getSizeOnDevice() - neededSize;
+			final long surplus2 = p2.getSizeOnDevice() - neededSize;
+			final boolean bothLargeEnough = surplus1 >= 0 && surplus2 >= 0;
+			// Both large enough: order by descending surplus; otherwise: order by ascending value
+			return bothLargeEnough
+					? Long.compare(surplus2, surplus1)
+					: Long.compare(surplus1, surplus2);
+		}
+		
 		@Override
 		public int compare(GPUObject p1, GPUObject p2) {
 			if (p1.isLocked() && p2.isLocked()) {
@@ -548,25 +697,32 @@ public class GPUMemoryManager {
 				return 1;
 			} else {
 				// Both are unlocked
-				if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) {
-					long p1Size = 0;
-					long p2Size = 0;
-					try {
-						p1Size = p1.getSizeOnDevice() - neededSize;
-						p2Size = p2.getSizeOnDevice() - neededSize;
-					} catch (DMLRuntimeException e) {
-						throw new RuntimeException(e);
+				if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) {
+					if(!p1.isDensePointerNull() && !p2.isDensePointerNull()) {
+						long p1Ptr = new CustomPointer(p1.getDensePointer()).getNativePointer();
+						long p2Ptr = new CustomPointer(p2.getDensePointer()).getNativePointer();
+						// Order by native address; equal addresses must yield 0 so that compare(a, b) and compare(b, a) stay consistent
+						if(p1Ptr == p2Ptr)
+							return 0;
+						else
+							return p1Ptr < p2Ptr ? -1 : 1;
+					}
-
-					if (p1Size >= 0 && p2Size >= 0) {
-						return Long.compare(p2Size, p1Size);
-					} else {
-						return Long.compare(p1Size, p2Size);
+					else if(p1.isDensePointerNull() && !p2.isDensePointerNull()) {
+						return -1;
+					}
+					else if(!p1.isDensePointerNull() && p2.isDensePointerNull()) {
+						return 1;
 					}
+					else {
+						return minEvictCompare(p1, p2);
+					}
+				}
+				else if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) {
+					return minEvictCompare(p1, p2);
 				} else {
 					return Long.compare(p2.timestamp.get(), p1.timestamp.get());
 				}
 			}
 		}
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 613e16f..bf44895 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -23,6 +23,7 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -97,6 +98,56 @@ public class GPUObject {
 	 * Enclosing {@link MatrixObject} instance
 	 */
 	protected MatrixObject mat = null;
+	
+	// ----------------------------------------------------------------------
+	// Methods used to access, set and check jcudaDenseMatrixPtr
+	
+	/**
+	 * Returns the dense matrix pointer currently held by this object.
+	 *
+	 * @return the pointer to the dense matrix, or null if no dense pointer is set
+	 */
+	public Pointer getDensePointer() {
+		return jcudaDenseMatrixPtr;
+	}
+	
+	/**
+	 * Checks if the dense pointer is null
+	 * 
+	 * @return true if the dense pointer is null, false otherwise
+	 */
+	public boolean isDensePointerNull() {
+		return jcudaDenseMatrixPtr == null;
+	}
+	
+	/**
+	 * Resets the dense pointer field to null. This method does not free the device memory the pointer referred to.
+	 */
+	public void clearDensePointer() {
+		jcudaDenseMatrixPtr = null;
+	}
+	
+	/**
+	 * Sets the dense matrix pointer on GPU, marks this object as dense and deallocates any existing sparse pointer.
+	 * Throws a DMLRuntimeException if a dense pointer is already set, since overwriting it would leak GPU memory.
+	 * @param densePtr dense pointer
+	 */
+	public void setDensePointer(Pointer densePtr) {
+		if (!this.isDensePointerNull()) {
+			throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
+		}
+		this.jcudaDenseMatrixPtr = densePtr;
+		this.isSparse = false;
+		if(LOG.isDebugEnabled()) {
+			LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
+		}
+		if (getJcudaSparseMatrixPtr() != null) {
+			getJcudaSparseMatrixPtr().deallocate();
+			jcudaSparseMatrixPtr = null;
+		}
+	}
+	// ----------------------------------------------------------------------
+
 
 	@Override
 	public Object clone() {
@@ -111,12 +162,12 @@ public class GPUObject {
 		that.isSparse = me.isSparse;
 
 		try {
-			if (me.jcudaDenseMatrixPtr != null) {
+			if (!me.isDensePointerNull()) {
 				long rows = me.mat.getNumRows();
 				long cols = me.mat.getNumColumns();
 				long size = rows * cols * LibMatrixCUDA.sizeOfDataType;
-				that.jcudaDenseMatrixPtr = allocate(size);
-				cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
+				that.setDensePointer(allocate(size));
+				cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice);
 			}
 
 			if (me.getJcudaSparseMatrixPtr() != null) {
@@ -135,12 +186,8 @@ public class GPUObject {
 		return getGPUContext().allocate(size);
 	}
 
-	private void cudaFreeHelper(Pointer toFree) {
-		getGPUContext().cudaFreeHelper(toFree);
-	}
-
-	private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) {
-		getGPUContext().cudaFreeHelper(instName, toFree, eager);
+	private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+		getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
 	}
 
 	private GPUContext getGPUContext() {
@@ -172,6 +219,7 @@ public class GPUObject {
 				lda, C, ldc);
 		return C;
 	}
+	
 
 	/**
 	 * Convenience method to convert a CSR matrix to a dense matrix on the GPU
@@ -224,8 +272,8 @@ public class GPUObject {
 				C.colInd);
 		//cudaDeviceSynchronize();
 
-		gCtx.cudaFreeHelper(nnzPerRowPtr);
-		gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+		gCtx.cudaFreeHelper(null, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
 
 		return C;
 	}
@@ -251,32 +299,12 @@ public class GPUObject {
 		}
 		this.jcudaSparseMatrixPtr = sparseMatrixPtr;
 		this.isSparse = true;
-		if (getJcudaDenseMatrixPtr() != null) {
-			cudaFreeHelper(getJcudaDenseMatrixPtr());
-			jcudaDenseMatrixPtr = null;
-		}
-	}
-
-	/**
-	 * Convenience method to directly set the dense matrix pointer on GPU
-	 *
-	 * @param densePtr dense pointer
-	 */
-	public void setDenseMatrixCudaPointer(Pointer densePtr) {
-		if (this.jcudaDenseMatrixPtr != null) {
-			throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
-		}
-		this.jcudaDenseMatrixPtr = densePtr;
-		this.isSparse = false;
-		if(LOG.isDebugEnabled()) {
-			LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
-		}
-		if (getJcudaSparseMatrixPtr() != null) {
-			getJcudaSparseMatrixPtr().deallocate();
-			jcudaSparseMatrixPtr = null;
+		if (!isDensePointerNull()) {
+			cudaFreeHelper(getDensePointer());
+			clearDensePointer();
 		}
 	}
-
+	
 	/**
 	 * Converts this GPUObject from dense to sparse format.
 	 */
@@ -293,12 +321,12 @@ public class GPUObject {
 		int rows = toIntExact(mat.getNumRows());
 		int cols = toIntExact(mat.getNumColumns());
 
-		if (getJcudaDenseMatrixPtr() == null || !isAllocated())
+		if (isDensePointerNull() || !isAllocated())
 			throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
 
 		denseRowMajorToColumnMajor();
 		setSparseMatrixCudaPointer(
-				columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getJcudaDenseMatrixPtr(), rows,
+				columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getDensePointer(), rows,
 						cols));
 		// TODO: What if mat.getNnz() is -1 ?
 		if (DMLScript.STATISTICS)
@@ -322,10 +350,10 @@ public class GPUObject {
 			throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
 		}
 
-		Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
-		cudaFreeHelper(getJcudaDenseMatrixPtr());
-		jcudaDenseMatrixPtr = null;
-		setDenseMatrixCudaPointer(tmp);
+		Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
+		cudaFreeHelper(getDensePointer());
+		clearDensePointer();
+		setDensePointer(tmp);
 	}
 
 	/**
@@ -344,10 +372,10 @@ public class GPUObject {
 			throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
 		}
 
-		Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
-		cudaFreeHelper(getJcudaDenseMatrixPtr());
-		jcudaDenseMatrixPtr = null;
-		setDenseMatrixCudaPointer(tmp);
+		Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
+		cudaFreeHelper(getDensePointer());
+		clearDensePointer();
+		setDensePointer(tmp);
 	}
 
 	/**
@@ -400,7 +428,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Expected cusparse to be initialized");
 		int rows = toIntExact(mat.getNumRows());
 		int cols = toIntExact(mat.getNumColumns());
-		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
+		setDensePointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
 	}
 
 	/**
@@ -426,7 +454,7 @@ public class GPUObject {
 	}
 
 	public boolean isAllocated() {
-		boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null);
+		boolean eitherAllocated = (!isDensePointerNull() || getJcudaSparseMatrixPtr() != null);
 		return eitherAllocated;
 	}
 
@@ -455,13 +483,13 @@ public class GPUObject {
 		long cols = mat.getNumColumns();
 		int numElems = toIntExact(rows * cols);
 		long size = getDatatypeSizeOf(numElems);
-		setDenseMatrixCudaPointer(allocate(size));
+		setDensePointer(allocate(size));
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
 		if (v != 0)
 			getGPUContext().getKernels()
 			.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
-					getJcudaDenseMatrixPtr(), v, numElems);
+					getDensePointer(), v, numElems);
 	}
 
 	/**
@@ -475,6 +503,7 @@ public class GPUObject {
 		boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0;
 		return isEmptyAndSparseAndAllocated;
 	}
+	
 		
 	/**
 	 * Being allocated is a prerequisite for computing nnz.
@@ -505,7 +534,7 @@ public class GPUObject {
 				Pointer nnzTotalDevHostPtr = null;
 				nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
 				nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
-				LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getJcudaDenseMatrixPtr(), rows,
+				LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows,
 						nnzPerRowPtr, nnzTotalDevHostPtr);
 				int[] nnzC = { -1 };
 				cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
@@ -513,8 +542,8 @@ public class GPUObject {
 					throw new DMLRuntimeException(
 							"cusparseDnnz did not calculate the correct number of nnz on the GPU");
 				}
-				gCtx.cudaFreeHelper(nnzPerRowPtr);
-				gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+				gCtx.cudaFreeHelper(instName, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
+				gCtx.cudaFreeHelper(instName, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
 				if(DMLScript.FINEGRAINED_STATISTICS) {
 					GPUStatistics.maintainCPMiscTimes(instName, CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, System.nanoTime()-t1);
 			}
@@ -600,8 +629,18 @@ public class GPUObject {
 				LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
 					+ getGPUContext());
 			}
-			copyFromDeviceToHost(instName, false);
-			copied = true;
+
+			if (isAllocated() && dirty) {
+				if(LOG.isTraceEnabled()) {
+					LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
+						+ getGPUContext());
+				}
+				// TODO: Future optimization:
+				// For now, we are deleting the device data when copied from device to host. 
+				// This can be optimized later by treating acquiredModify+release as a new state
+				copyFromDeviceToHost(instName, false, true); 
+				copied = true;
+			}
 		}
 		return copied;
 	}
@@ -657,6 +696,7 @@ public class GPUObject {
 				timestamp.addAndGet(1);
 				break;
 			case MIN_EVICT: /* Do Nothing */
+			case ALIGN_MEMORY:
 				break;
 			case MRU:
 				timestamp.set(-System.nanoTime());
@@ -704,7 +744,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix;");
 		long size = getDatatypeSizeOf(rows * cols);
 		Pointer tmp = allocate(size);
-		setDenseMatrixCudaPointer(tmp);
+		setDensePointer(tmp);
 	}
 
 	void allocateSparseMatrixOnDevice() {
@@ -723,21 +763,6 @@ public class GPUObject {
 		setSparseMatrixCudaPointer(tmp);
 	}
 
-	void deallocateMemoryOnDevice(boolean eager) {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext());
-		}
-		if (getJcudaDenseMatrixPtr() != null) {
-			cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
-		}
-		if (getJcudaSparseMatrixPtr() != null) {
-			getJcudaSparseMatrixPtr().deallocate(eager);
-		}
-		jcudaDenseMatrixPtr = null;
-		jcudaSparseMatrixPtr = null;
-		resetReadWriteLock();
-	}
-
 	protected long getSizeOnDevice() {
 		long GPUSize = 0;
 		long rlen = mat.getNumRows();
@@ -850,14 +875,14 @@ public class GPUObject {
 				// Minor optimization: No need to allocate empty error for CPU 
 				// data = new double[tmp.getNumRows() * tmp.getNumColumns()];
 				long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-				cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
+				cudaMemset(getDensePointer(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
 				if(DMLScript.FINEGRAINED_STATISTICS) 
 					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1);
 			}
 			else {
 				// Copy dense block
 				// H2D now only measures the time taken to do 
-				LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode);
+				LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getDensePointer(), opcode);
 			}
 		}
 
@@ -875,103 +900,102 @@ public class GPUObject {
 		}
 		return (int) l;
 	}
+	
 
-	protected void copyFromDeviceToHost(String instName, boolean isEviction) {
+	/**
+	 * Copies the data from device to host.
+	 * Currently eagerDelete and isEviction are both provided for better control in different scenarios. 
+	 * In future, we can force eagerDelete if isEviction is true, else false.
+	 * 
+	 * @param instName opcode of the instruction for fine-grained statistics
+	 * @param isEviction is called for eviction
+	 * @param eagerDelete whether to perform eager deletion of the device data. 
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
 		}
-		if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null) {
+		if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {
+			throw new DMLRuntimeException(
+					"Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
+		}
+		else if (!isDensePointerNull() && getJcudaSparseMatrixPtr() != null) {
 			throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
 		}
-
-		if (getJcudaDenseMatrixPtr() != null) {
-			long start = 0;
-			if (DMLScript.STATISTICS)
-				start = System.nanoTime();
-			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
+		else if(getJcudaSparseMatrixPtr() != null && !LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+			throw new DMLRuntimeException(
+					"Block not in sparse format on host yet the device sparse matrix pointer is not null");
+		}
+		else if(getJcudaSparseMatrixPtr() != null && isSparseAndEmpty()) {
+			mat.acquireModify(new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l)); // empty block
+			mat.release();
+			return;
+		}
+		
+		MatrixBlock tmp = null;
+		long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+		if (!isDensePointerNull()) {
+			tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
 			tmp.allocateDenseBlock();
 			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
-						getJcudaDenseMatrixPtr(), tmp.getDenseBlockValues(), instName, isEviction);
-			
-			tmp.recomputeNonZeros();
-			mat.acquireModify(tmp);
-			mat.release();
-
-			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
-			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaFromDevCount.add(1);
-		} else if (getJcudaSparseMatrixPtr() != null) {
-			if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat))
-				throw new DMLRuntimeException(
-						"Block not in sparse format on host yet the device sparse matrix pointer is not null");
-
-			if (this.isSparseAndEmpty()) {
-				MatrixBlock tmp = new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l);    // Empty Block
-				mat.acquireModify(tmp);
-				mat.release();
-			} else {
-				long start = 0;
-				if (DMLScript.STATISTICS)
-					start = System.nanoTime();
-
-				int rows = toIntExact(mat.getNumRows());
-				int cols = toIntExact(mat.getNumColumns());
-				int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
-				double[] values = new double[nnz];
-				LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction);
-				int[] rowPtr = new int[rows + 1];
-				int[] colInd = new int[nnz];
-				long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-				CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
-				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevCount.add(3);
-
-				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
-				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
-				mat.acquireModify(tmp);
-				mat.release();
-				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
-				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevCount.add(1);
-			}
+						getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); 
+			// int nnz = LibMatrixCUDA.computeNNZ(getGPUContext(), getJcudaDenseMatrixPtr(), toIntExact(mat.getNumRows()*mat.getNumColumns()));
+			// tmp.setNonZeros(nnz);
+			if(eagerDelete)
+				clearData(instName, true);
+			// tmp.recomputeNonZeros();
+			tmp.setNonZeros(-1);
 		} else {
-			throw new DMLRuntimeException(
-					"Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
+			int rows = toIntExact(mat.getNumRows());
+			int cols = toIntExact(mat.getNumColumns());
+			int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+			double[] values = new double[nnz];
+			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction);
+			int[] rowPtr = new int[rows + 1];
+			int[] colInd = new int[nnz];
+			CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
+			if(eagerDelete)
+				clearData(instName, true);
+			SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
+			tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
+		}
+		mat.acquireModify(tmp);
+		mat.release();
+		if (DMLScript.STATISTICS)
+			GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
+		if (DMLScript.STATISTICS) {
+			int count = !isDensePointerNull() ? 1 : 3;
+			GPUStatistics.cudaFromDevCount.add(count);
 		}
 		dirty = false;
 	}
 
-	/**
-	 * lazily clears the data associated with this {@link GPUObject} instance
-	 */
-	public void clearData() {
-		clearData(DMLScript.EAGER_CUDA_FREE);
-	}
 
 	/**
 	 * Clears the data associated with this {@link GPUObject} instance
 	 *
+	 * @param opcode opcode of the instruction
 	 * @param eager whether to be done synchronously or asynchronously
+	 * @throws DMLRuntimeException if error occurs
 	 */
-	public void clearData(boolean eager) {
-		deallocateMemoryOnDevice(eager);
+	public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
+		}
+		if (!isDensePointerNull()) {
+			getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+		}
+		if (getJcudaSparseMatrixPtr() != null) {
+			getJcudaSparseMatrixPtr().deallocate(eager);
+		}
+		clearDensePointer();
+		jcudaSparseMatrixPtr = null;
+		resetReadWriteLock();
 		getGPUContext().getMemoryManager().removeGPUObject(this);
 	}
 
 	/**
-	 * Pointer to dense matrix
-	 *
-	 * @return ?
-	 */
-	public Pointer getJcudaDenseMatrixPtr() {
-		return jcudaDenseMatrixPtr;
-	}
-
-	/**
 	 * Pointer to sparse matrix
 	 *
 	 * @return ?
@@ -997,12 +1021,12 @@ public class GPUObject {
 		sb.append(", writeLock=").append(writeLock);
 		sb.append(", sparse? ").append(isSparse);
 		sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
-		if(jcudaDenseMatrixPtr != null)
-			sb.append(", densePtr=").append(jcudaDenseMatrixPtr);
+		if(!isDensePointerNull())
+			sb.append(", densePtr=").append(getDensePointer());
 		if(jcudaSparseMatrixPtr != null)
 			sb.append(", sparsePtr=").append(jcudaSparseMatrixPtr);
 		sb.append('}');
 		return sb.toString();
 	}
 
-}
+}
\ No newline at end of file


[3/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory Manager

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 5f72887..1865e18 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,12 +1,12 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-22781540
+// Cuda compilation tools, release 9.0, V9.0.176
 // Based on LLVM 3.4svn
 //
 
-.version 5.0
+.version 6.0
 .target sm_30
 .address_size 64
 
@@ -128,15 +128,15 @@ BB1_2:
 )
 {
 	.reg .pred 	%p<13>;
-	.reg .b32 	%r<74>;
+	.reg .b32 	%r<72>;
 	.reg .f64 	%fd<2>;
-	.reg .b64 	%rd<18>;
+	.reg .b64 	%rd<17>;
 
 
-	ld.param.u64 	%rd4, [sparse_dense_im2col_d_param_0];
-	ld.param.u64 	%rd5, [sparse_dense_im2col_d_param_1];
-	ld.param.u64 	%rd6, [sparse_dense_im2col_d_param_2];
-	ld.param.u64 	%rd7, [sparse_dense_im2col_d_param_3];
+	ld.param.u64 	%rd3, [sparse_dense_im2col_d_param_0];
+	ld.param.u64 	%rd4, [sparse_dense_im2col_d_param_1];
+	ld.param.u64 	%rd5, [sparse_dense_im2col_d_param_2];
+	ld.param.u64 	%rd6, [sparse_dense_im2col_d_param_3];
 	ld.param.u32 	%r35, [sparse_dense_im2col_d_param_4];
 	ld.param.u32 	%r22, [sparse_dense_im2col_d_param_7];
 	ld.param.u32 	%r23, [sparse_dense_im2col_d_param_8];
@@ -158,27 +158,26 @@ BB1_2:
 	setp.ge.s32	%p1, %r1, %r35;
 	@%p1 bra 	BB2_11;
 
-	cvta.to.global.u64 	%rd1, %rd6;
-	cvta.to.global.u64 	%rd2, %rd5;
-	cvta.to.global.u64 	%rd8, %rd4;
-	cvt.s64.s32	%rd3, %r1;
-	mul.wide.s32 	%rd9, %r1, 8;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.global.f64 	%fd1, [%rd10];
+	cvta.to.global.u64 	%rd1, %rd5;
+	cvta.to.global.u64 	%rd2, %rd4;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.wide.s32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f64 	%fd1, [%rd9];
 	mov.u32 	%r67, 0;
 
 BB2_2:
 	mov.u32 	%r2, %r67;
 	add.s32 	%r67, %r2, 1;
-	mul.wide.s32 	%rd11, %r67, 4;
-	add.s64 	%rd12, %rd2, %rd11;
-	ld.global.u32 	%r40, [%rd12];
+	mul.wide.s32 	%rd10, %r67, 4;
+	add.s64 	%rd11, %rd2, %rd10;
+	ld.global.u32 	%r40, [%rd11];
 	setp.le.s32	%p2, %r40, %r1;
 	@%p2 bra 	BB2_2;
 
-	shl.b64 	%rd13, %rd3, 2;
-	add.s64 	%rd14, %rd1, %rd13;
-	ld.global.u32 	%r41, [%rd14];
+	mul.wide.s32 	%rd12, %r1, 4;
+	add.s64 	%rd13, %rd1, %rd12;
+	ld.global.u32 	%r41, [%rd13];
 	div.s32 	%r4, %r41, %r22;
 	rem.s32 	%r42, %r41, %r22;
 	div.s32 	%r43, %r42, %r23;
@@ -189,71 +188,69 @@ BB2_2:
 	sub.s32 	%r47, %r46, %r45;
 	add.s32 	%r48, %r47, %r5;
 	mov.u32 	%r49, 0;
-	max.s32 	%r70, %r49, %r48;
+	max.s32 	%r68, %r49, %r48;
 	add.s32 	%r50, %r24, -1;
 	min.s32 	%r7, %r50, %r5;
 	add.s32 	%r8, %r44, %r34;
 	mul.lo.s32 	%r51, %r32, %r27;
 	sub.s32 	%r52, %r46, %r51;
 	add.s32 	%r53, %r52, %r8;
-	max.s32 	%r73, %r49, %r53;
+	max.s32 	%r69, %r49, %r53;
 	add.s32 	%r54, %r25, -1;
 	min.s32 	%r10, %r54, %r8;
 
 BB2_4:
-	mov.u32 	%r69, %r70;
-	sub.s32 	%r55, %r5, %r69;
+	mov.u32 	%r70, %r68;
+	sub.s32 	%r55, %r5, %r70;
 	rem.s32 	%r56, %r55, %r31;
 	setp.ne.s32	%p3, %r56, 0;
-	setp.le.s32	%p4, %r69, %r7;
-	and.pred  	%p5, %p3, %p4;
-	add.s32 	%r70, %r69, 1;
+	setp.le.s32	%p4, %r70, %r7;
+	and.pred  	%p5, %p4, %p3;
+	add.s32 	%r68, %r70, 1;
 	@%p5 bra 	BB2_4;
 
 BB2_5:
-	mov.u32 	%r13, %r73;
+	mov.u32 	%r13, %r69;
 	sub.s32 	%r57, %r8, %r13;
 	rem.s32 	%r58, %r57, %r32;
 	setp.ne.s32	%p6, %r58, 0;
 	setp.le.s32	%p7, %r13, %r10;
-	and.pred  	%p8, %p6, %p7;
-	add.s32 	%r73, %r13, 1;
+	and.pred  	%p8, %p7, %p6;
+	add.s32 	%r69, %r13, 1;
 	@%p8 bra 	BB2_5;
 
-	setp.gt.s32	%p9, %r69, %r7;
+	setp.gt.s32	%p9, %r70, %r7;
 	@%p9 bra 	BB2_11;
 
 	mul.lo.s32 	%r15, %r2, %r28;
 	mul.lo.s32 	%r16, %r4, %r29;
-	cvta.to.global.u64 	%rd15, %rd7;
+	cvta.to.global.u64 	%rd14, %rd6;
 
 BB2_8:
-	sub.s32 	%r59, %r5, %r69;
+	sub.s32 	%r59, %r5, %r70;
 	div.s32 	%r60, %r59, %r31;
 	mad.lo.s32 	%r18, %r60, %r27, %r15;
 	setp.gt.s32	%p10, %r13, %r10;
-	mov.u32 	%r72, %r13;
+	mov.u32 	%r71, %r13;
 	@%p10 bra 	BB2_10;
 
 BB2_9:
-	mov.u32 	%r19, %r72;
-	sub.s32 	%r61, %r8, %r19;
+	sub.s32 	%r61, %r8, %r71;
 	div.s32 	%r62, %r61, %r32;
-	mad.lo.s32 	%r63, %r69, %r25, %r16;
-	add.s32 	%r64, %r63, %r19;
+	mad.lo.s32 	%r63, %r70, %r25, %r16;
+	add.s32 	%r64, %r63, %r71;
 	mad.lo.s32 	%r65, %r64, %r30, %r18;
 	add.s32 	%r66, %r65, %r62;
-	mul.wide.s32 	%rd16, %r66, 8;
-	add.s64 	%rd17, %rd15, %rd16;
-	st.global.f64 	[%rd17], %fd1;
-	add.s32 	%r20, %r19, %r32;
-	setp.le.s32	%p11, %r20, %r10;
-	mov.u32 	%r72, %r20;
+	mul.wide.s32 	%rd15, %r66, 8;
+	add.s64 	%rd16, %rd14, %rd15;
+	st.global.f64 	[%rd16], %fd1;
+	add.s32 	%r71, %r71, %r32;
+	setp.le.s32	%p11, %r71, %r10;
 	@%p11 bra 	BB2_9;
 
 BB2_10:
-	add.s32 	%r69, %r69, %r31;
-	setp.le.s32	%p12, %r69, %r7;
+	add.s32 	%r70, %r70, %r31;
+	setp.le.s32	%p12, %r70, %r7;
 	@%p12 bra 	BB2_8;
 
 BB2_11:
@@ -286,14 +283,14 @@ BB2_11:
 {
 	.reg .pred 	%p<13>;
 	.reg .f32 	%f<2>;
-	.reg .b32 	%r<74>;
-	.reg .b64 	%rd<18>;
+	.reg .b32 	%r<72>;
+	.reg .b64 	%rd<17>;
 
 
-	ld.param.u64 	%rd4, [sparse_dense_im2col_f_param_0];
-	ld.param.u64 	%rd5, [sparse_dense_im2col_f_param_1];
-	ld.param.u64 	%rd6, [sparse_dense_im2col_f_param_2];
-	ld.param.u64 	%rd7, [sparse_dense_im2col_f_param_3];
+	ld.param.u64 	%rd3, [sparse_dense_im2col_f_param_0];
+	ld.param.u64 	%rd4, [sparse_dense_im2col_f_param_1];
+	ld.param.u64 	%rd5, [sparse_dense_im2col_f_param_2];
+	ld.param.u64 	%rd6, [sparse_dense_im2col_f_param_3];
 	ld.param.u32 	%r35, [sparse_dense_im2col_f_param_4];
 	ld.param.u32 	%r22, [sparse_dense_im2col_f_param_7];
 	ld.param.u32 	%r23, [sparse_dense_im2col_f_param_8];
@@ -315,27 +312,25 @@ BB2_11:
 	setp.ge.s32	%p1, %r1, %r35;
 	@%p1 bra 	BB3_11;
 
-	cvta.to.global.u64 	%rd1, %rd6;
-	cvta.to.global.u64 	%rd2, %rd5;
-	cvta.to.global.u64 	%rd8, %rd4;
-	cvt.s64.s32	%rd3, %r1;
-	mul.wide.s32 	%rd9, %r1, 4;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.global.f32 	%f1, [%rd10];
+	cvta.to.global.u64 	%rd1, %rd5;
+	cvta.to.global.u64 	%rd2, %rd4;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.wide.s32 	%rd8, %r1, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
 	mov.u32 	%r67, 0;
 
 BB3_2:
 	mov.u32 	%r2, %r67;
 	add.s32 	%r67, %r2, 1;
-	mul.wide.s32 	%rd11, %r67, 4;
-	add.s64 	%rd12, %rd2, %rd11;
-	ld.global.u32 	%r40, [%rd12];
+	mul.wide.s32 	%rd10, %r67, 4;
+	add.s64 	%rd11, %rd2, %rd10;
+	ld.global.u32 	%r40, [%rd11];
 	setp.le.s32	%p2, %r40, %r1;
 	@%p2 bra 	BB3_2;
 
-	shl.b64 	%rd13, %rd3, 2;
-	add.s64 	%rd14, %rd1, %rd13;
-	ld.global.u32 	%r41, [%rd14];
+	add.s64 	%rd13, %rd1, %rd8;
+	ld.global.u32 	%r41, [%rd13];
 	div.s32 	%r4, %r41, %r22;
 	rem.s32 	%r42, %r41, %r22;
 	div.s32 	%r43, %r42, %r23;
@@ -346,71 +341,69 @@ BB3_2:
 	sub.s32 	%r47, %r46, %r45;
 	add.s32 	%r48, %r47, %r5;
 	mov.u32 	%r49, 0;
-	max.s32 	%r70, %r49, %r48;
+	max.s32 	%r68, %r49, %r48;
 	add.s32 	%r50, %r24, -1;
 	min.s32 	%r7, %r50, %r5;
 	add.s32 	%r8, %r44, %r34;
 	mul.lo.s32 	%r51, %r32, %r27;
 	sub.s32 	%r52, %r46, %r51;
 	add.s32 	%r53, %r52, %r8;
-	max.s32 	%r73, %r49, %r53;
+	max.s32 	%r69, %r49, %r53;
 	add.s32 	%r54, %r25, -1;
 	min.s32 	%r10, %r54, %r8;
 
 BB3_4:
-	mov.u32 	%r69, %r70;
-	sub.s32 	%r55, %r5, %r69;
+	mov.u32 	%r70, %r68;
+	sub.s32 	%r55, %r5, %r70;
 	rem.s32 	%r56, %r55, %r31;
 	setp.ne.s32	%p3, %r56, 0;
-	setp.le.s32	%p4, %r69, %r7;
-	and.pred  	%p5, %p3, %p4;
-	add.s32 	%r70, %r69, 1;
+	setp.le.s32	%p4, %r70, %r7;
+	and.pred  	%p5, %p4, %p3;
+	add.s32 	%r68, %r70, 1;
 	@%p5 bra 	BB3_4;
 
 BB3_5:
-	mov.u32 	%r13, %r73;
+	mov.u32 	%r13, %r69;
 	sub.s32 	%r57, %r8, %r13;
 	rem.s32 	%r58, %r57, %r32;
 	setp.ne.s32	%p6, %r58, 0;
 	setp.le.s32	%p7, %r13, %r10;
-	and.pred  	%p8, %p6, %p7;
-	add.s32 	%r73, %r13, 1;
+	and.pred  	%p8, %p7, %p6;
+	add.s32 	%r69, %r13, 1;
 	@%p8 bra 	BB3_5;
 
-	setp.gt.s32	%p9, %r69, %r7;
+	setp.gt.s32	%p9, %r70, %r7;
 	@%p9 bra 	BB3_11;
 
 	mul.lo.s32 	%r15, %r2, %r28;
 	mul.lo.s32 	%r16, %r4, %r29;
-	cvta.to.global.u64 	%rd15, %rd7;
+	cvta.to.global.u64 	%rd14, %rd6;
 
 BB3_8:
-	sub.s32 	%r59, %r5, %r69;
+	sub.s32 	%r59, %r5, %r70;
 	div.s32 	%r60, %r59, %r31;
 	mad.lo.s32 	%r18, %r60, %r27, %r15;
 	setp.gt.s32	%p10, %r13, %r10;
-	mov.u32 	%r72, %r13;
+	mov.u32 	%r71, %r13;
 	@%p10 bra 	BB3_10;
 
 BB3_9:
-	mov.u32 	%r19, %r72;
-	sub.s32 	%r61, %r8, %r19;
+	sub.s32 	%r61, %r8, %r71;
 	div.s32 	%r62, %r61, %r32;
-	mad.lo.s32 	%r63, %r69, %r25, %r16;
-	add.s32 	%r64, %r63, %r19;
+	mad.lo.s32 	%r63, %r70, %r25, %r16;
+	add.s32 	%r64, %r63, %r71;
 	mad.lo.s32 	%r65, %r64, %r30, %r18;
 	add.s32 	%r66, %r65, %r62;
-	mul.wide.s32 	%rd16, %r66, 4;
-	add.s64 	%rd17, %rd15, %rd16;
-	st.global.f32 	[%rd17], %f1;
-	add.s32 	%r20, %r19, %r32;
-	setp.le.s32	%p11, %r20, %r10;
-	mov.u32 	%r72, %r20;
+	mul.wide.s32 	%rd15, %r66, 4;
+	add.s64 	%rd16, %rd14, %rd15;
+	st.global.f32 	[%rd16], %f1;
+	add.s32 	%r71, %r71, %r32;
+	setp.le.s32	%p11, %r71, %r10;
 	@%p11 bra 	BB3_9;
 
 BB3_10:
-	add.s32 	%r69, %r69, %r31;
-	setp.le.s32	%p12, %r69, %r7;
+	add.s32 	%r70, %r70, %r31;
+	setp.le.s32	%p12, %r70, %r7;
 	@%p12 bra 	BB3_8;
 
 BB3_11:
@@ -439,7 +432,7 @@ BB3_11:
 )
 {
 	.reg .pred 	%p<12>;
-	.reg .b32 	%r<71>;
+	.reg .b32 	%r<69>;
 	.reg .f64 	%fd<2>;
 	.reg .b64 	%rd<9>;
 
@@ -484,38 +477,38 @@ BB3_11:
 	sub.s32 	%r45, %r44, %r43;
 	add.s32 	%r46, %r45, %r4;
 	mov.u32 	%r47, 0;
-	max.s32 	%r67, %r47, %r46;
+	max.s32 	%r65, %r47, %r46;
 	add.s32 	%r48, %r24, -1;
 	min.s32 	%r6, %r48, %r4;
 	add.s32 	%r7, %r42, %r34;
 	mul.lo.s32 	%r49, %r32, %r27;
 	sub.s32 	%r50, %r44, %r49;
 	add.s32 	%r51, %r50, %r7;
-	max.s32 	%r70, %r47, %r51;
+	max.s32 	%r66, %r47, %r51;
 	add.s32 	%r52, %r25, -1;
 	min.s32 	%r9, %r52, %r7;
 
 BB4_2:
-	mov.u32 	%r66, %r67;
-	sub.s32 	%r53, %r4, %r66;
+	mov.u32 	%r67, %r65;
+	sub.s32 	%r53, %r4, %r67;
 	rem.s32 	%r54, %r53, %r31;
 	setp.ne.s32	%p2, %r54, 0;
-	setp.le.s32	%p3, %r66, %r6;
-	and.pred  	%p4, %p2, %p3;
-	add.s32 	%r67, %r66, 1;
+	setp.le.s32	%p3, %r67, %r6;
+	and.pred  	%p4, %p3, %p2;
+	add.s32 	%r65, %r67, 1;
 	@%p4 bra 	BB4_2;
 
 BB4_3:
-	mov.u32 	%r12, %r70;
+	mov.u32 	%r12, %r66;
 	sub.s32 	%r55, %r7, %r12;
 	rem.s32 	%r56, %r55, %r32;
 	setp.ne.s32	%p5, %r56, 0;
 	setp.le.s32	%p6, %r12, %r9;
-	and.pred  	%p7, %p5, %p6;
-	add.s32 	%r70, %r12, 1;
+	and.pred  	%p7, %p6, %p5;
+	add.s32 	%r66, %r12, 1;
 	@%p7 bra 	BB4_3;
 
-	setp.gt.s32	%p8, %r66, %r6;
+	setp.gt.s32	%p8, %r67, %r6;
 	@%p8 bra 	BB4_9;
 
 	mul.lo.s32 	%r14, %r2, %r28;
@@ -523,32 +516,30 @@ BB4_3:
 	cvta.to.global.u64 	%rd6, %rd2;
 
 BB4_6:
-	sub.s32 	%r57, %r4, %r66;
+	sub.s32 	%r57, %r4, %r67;
 	div.s32 	%r58, %r57, %r31;
 	mad.lo.s32 	%r17, %r58, %r27, %r14;
 	setp.gt.s32	%p9, %r12, %r9;
-	mov.u32 	%r69, %r12;
+	mov.u32 	%r68, %r12;
 	@%p9 bra 	BB4_8;
 
 BB4_7:
-	mov.u32 	%r18, %r69;
-	sub.s32 	%r59, %r7, %r18;
+	sub.s32 	%r59, %r7, %r68;
 	div.s32 	%r60, %r59, %r32;
-	mad.lo.s32 	%r61, %r66, %r25, %r15;
-	add.s32 	%r62, %r61, %r18;
+	mad.lo.s32 	%r61, %r67, %r25, %r15;
+	add.s32 	%r62, %r61, %r68;
 	mad.lo.s32 	%r63, %r62, %r30, %r17;
 	add.s32 	%r64, %r63, %r60;
 	mul.wide.s32 	%rd7, %r64, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd1;
-	add.s32 	%r19, %r18, %r32;
-	setp.le.s32	%p10, %r19, %r9;
-	mov.u32 	%r69, %r19;
+	add.s32 	%r68, %r68, %r32;
+	setp.le.s32	%p10, %r68, %r9;
 	@%p10 bra 	BB4_7;
 
 BB4_8:
-	add.s32 	%r66, %r66, %r31;
-	setp.le.s32	%p11, %r66, %r6;
+	add.s32 	%r67, %r67, %r31;
+	setp.le.s32	%p11, %r67, %r6;
 	@%p11 bra 	BB4_6;
 
 BB4_9:
@@ -578,7 +569,7 @@ BB4_9:
 {
 	.reg .pred 	%p<12>;
 	.reg .f32 	%f<2>;
-	.reg .b32 	%r<71>;
+	.reg .b32 	%r<69>;
 	.reg .b64 	%rd<9>;
 
 
@@ -622,38 +613,38 @@ BB4_9:
 	sub.s32 	%r45, %r44, %r43;
 	add.s32 	%r46, %r45, %r4;
 	mov.u32 	%r47, 0;
-	max.s32 	%r67, %r47, %r46;
+	max.s32 	%r65, %r47, %r46;
 	add.s32 	%r48, %r24, -1;
 	min.s32 	%r6, %r48, %r4;
 	add.s32 	%r7, %r42, %r34;
 	mul.lo.s32 	%r49, %r32, %r27;
 	sub.s32 	%r50, %r44, %r49;
 	add.s32 	%r51, %r50, %r7;
-	max.s32 	%r70, %r47, %r51;
+	max.s32 	%r66, %r47, %r51;
 	add.s32 	%r52, %r25, -1;
 	min.s32 	%r9, %r52, %r7;
 
 BB5_2:
-	mov.u32 	%r66, %r67;
-	sub.s32 	%r53, %r4, %r66;
+	mov.u32 	%r67, %r65;
+	sub.s32 	%r53, %r4, %r67;
 	rem.s32 	%r54, %r53, %r31;
 	setp.ne.s32	%p2, %r54, 0;
-	setp.le.s32	%p3, %r66, %r6;
-	and.pred  	%p4, %p2, %p3;
-	add.s32 	%r67, %r66, 1;
+	setp.le.s32	%p3, %r67, %r6;
+	and.pred  	%p4, %p3, %p2;
+	add.s32 	%r65, %r67, 1;
 	@%p4 bra 	BB5_2;
 
 BB5_3:
-	mov.u32 	%r12, %r70;
+	mov.u32 	%r12, %r66;
 	sub.s32 	%r55, %r7, %r12;
 	rem.s32 	%r56, %r55, %r32;
 	setp.ne.s32	%p5, %r56, 0;
 	setp.le.s32	%p6, %r12, %r9;
-	and.pred  	%p7, %p5, %p6;
-	add.s32 	%r70, %r12, 1;
+	and.pred  	%p7, %p6, %p5;
+	add.s32 	%r66, %r12, 1;
 	@%p7 bra 	BB5_3;
 
-	setp.gt.s32	%p8, %r66, %r6;
+	setp.gt.s32	%p8, %r67, %r6;
 	@%p8 bra 	BB5_9;
 
 	mul.lo.s32 	%r14, %r2, %r28;
@@ -661,32 +652,30 @@ BB5_3:
 	cvta.to.global.u64 	%rd6, %rd2;
 
 BB5_6:
-	sub.s32 	%r57, %r4, %r66;
+	sub.s32 	%r57, %r4, %r67;
 	div.s32 	%r58, %r57, %r31;
 	mad.lo.s32 	%r17, %r58, %r27, %r14;
 	setp.gt.s32	%p9, %r12, %r9;
-	mov.u32 	%r69, %r12;
+	mov.u32 	%r68, %r12;
 	@%p9 bra 	BB5_8;
 
 BB5_7:
-	mov.u32 	%r18, %r69;
-	sub.s32 	%r59, %r7, %r18;
+	sub.s32 	%r59, %r7, %r68;
 	div.s32 	%r60, %r59, %r32;
-	mad.lo.s32 	%r61, %r66, %r25, %r15;
-	add.s32 	%r62, %r61, %r18;
+	mad.lo.s32 	%r61, %r67, %r25, %r15;
+	add.s32 	%r62, %r61, %r68;
 	mad.lo.s32 	%r63, %r62, %r30, %r17;
 	add.s32 	%r64, %r63, %r60;
 	mul.wide.s32 	%rd7, %r64, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f32 	[%rd8], %f1;
-	add.s32 	%r19, %r18, %r32;
-	setp.le.s32	%p10, %r19, %r9;
-	mov.u32 	%r69, %r19;
+	add.s32 	%r68, %r68, %r32;
+	setp.le.s32	%p10, %r68, %r9;
 	@%p10 bra 	BB5_7;
 
 BB5_8:
-	add.s32 	%r66, %r66, %r31;
-	setp.le.s32	%p11, %r66, %r6;
+	add.s32 	%r67, %r67, %r31;
+	setp.le.s32	%p11, %r67, %r6;
 	@%p11 bra 	BB5_6;
 
 BB5_9:
@@ -805,7 +794,7 @@ BB7_2:
 )
 {
 	.reg .pred 	%p<7>;
-	.reg .b32 	%r<24>;
+	.reg .b32 	%r<25>;
 	.reg .f64 	%fd<2>;
 	.reg .b64 	%rd<23>;
 
@@ -831,8 +820,8 @@ BB7_2:
 	mul.wide.s32 	%rd14, %r2, 4;
 	add.s64 	%rd1, %rd13, %rd14;
 	ld.global.u32 	%r23, [%rd1];
-	ld.global.u32 	%r22, [%rd1+4];
-	setp.ge.s32	%p2, %r23, %r22;
+	ld.global.u32 	%r24, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r24;
 	@%p2 bra 	BB8_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
@@ -857,13 +846,13 @@ BB8_3:
 	mul.wide.s32 	%rd19, %r21, 8;
 	add.s64 	%rd20, %rd2, %rd19;
 	st.global.f64 	[%rd20], %fd1;
-	ld.global.u32 	%r22, [%rd1+4];
+	ld.global.u32 	%r24, [%rd1+4];
 
 BB8_5:
 	add.s64 	%rd22, %rd22, 8;
 	add.s64 	%rd21, %rd21, 4;
 	add.s32 	%r23, %r23, 1;
-	setp.lt.s32	%p6, %r23, %r22;
+	setp.lt.s32	%p6, %r23, %r24;
 	@%p6 bra 	BB8_3;
 
 BB8_6:
@@ -885,7 +874,7 @@ BB8_6:
 {
 	.reg .pred 	%p<7>;
 	.reg .f32 	%f<2>;
-	.reg .b32 	%r<24>;
+	.reg .b32 	%r<25>;
 	.reg .b64 	%rd<22>;
 
 
@@ -910,8 +899,8 @@ BB8_6:
 	mul.wide.s32 	%rd14, %r2, 4;
 	add.s64 	%rd1, %rd13, %rd14;
 	ld.global.u32 	%r23, [%rd1];
-	ld.global.u32 	%r22, [%rd1+4];
-	setp.ge.s32	%p2, %r23, %r22;
+	ld.global.u32 	%r24, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r24;
 	@%p2 bra 	BB9_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
@@ -935,13 +924,13 @@ BB9_3:
 	mul.wide.s32 	%rd18, %r21, 4;
 	add.s64 	%rd19, %rd2, %rd18;
 	st.global.f32 	[%rd19], %f1;
-	ld.global.u32 	%r22, [%rd1+4];
+	ld.global.u32 	%r24, [%rd1+4];
 
 BB9_5:
 	add.s64 	%rd21, %rd21, 4;
 	add.s64 	%rd20, %rd20, 4;
 	add.s32 	%r23, %r23, 1;
-	setp.lt.s32	%p6, %r23, %r22;
+	setp.lt.s32	%p6, %r23, %r24;
 	@%p6 bra 	BB9_3;
 
 BB9_6:
@@ -964,13 +953,13 @@ BB9_6:
 	.reg .pred 	%p<6>;
 	.reg .b32 	%r<22>;
 	.reg .f64 	%fd<2>;
-	.reg .b64 	%rd<22>;
+	.reg .b64 	%rd<21>;
 
 
-	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_0];
-	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_d_param_1];
-	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_2];
-	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_3];
+	ld.param.u64 	%rd4, [slice_sparse_dense_nnz_d_param_0];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_1];
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_2];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_3];
 	ld.param.u32 	%r5, [slice_sparse_dense_nnz_d_param_4];
 	ld.param.u32 	%r9, [slice_sparse_dense_nnz_d_param_5];
 	ld.param.u32 	%r6, [slice_sparse_dense_nnz_d_param_6];
@@ -980,24 +969,23 @@ BB9_6:
 	mov.u32 	%r11, %ctaid.x;
 	mov.u32 	%r12, %tid.x;
 	mad.lo.s32 	%r13, %r10, %r11, %r12;
-	cvta.to.global.u64 	%rd1, %rd8;
-	mul.wide.s32 	%rd9, %r5, 4;
-	add.s64 	%rd10, %rd1, %rd9;
-	ld.global.u32 	%r14, [%rd10];
+	cvta.to.global.u64 	%rd1, %rd7;
+	mul.wide.s32 	%rd8, %r5, 4;
+	add.s64 	%rd9, %rd1, %rd8;
+	ld.global.u32 	%r14, [%rd9];
 	add.s32 	%r1, %r13, %r14;
-	mul.wide.s32 	%rd11, %r9, 4;
-	add.s64 	%rd12, %rd1, %rd11;
-	ld.global.u32 	%r15, [%rd12+4];
+	mul.wide.s32 	%rd10, %r9, 4;
+	add.s64 	%rd11, %rd1, %rd10;
+	ld.global.u32 	%r15, [%rd11+4];
 	setp.ge.s32	%p1, %r1, %r15;
 	@%p1 bra 	BB10_5;
 
-	cvta.to.global.u64 	%rd2, %rd7;
-	cvta.to.global.u64 	%rd3, %rd5;
-	cvta.to.global.u64 	%rd13, %rd6;
-	cvt.s64.s32	%rd4, %r1;
-	mul.wide.s32 	%rd14, %r1, 4;
-	add.s64 	%rd15, %rd13, %rd14;
-	ld.global.u32 	%r2, [%rd15];
+	cvta.to.global.u64 	%rd2, %rd6;
+	cvta.to.global.u64 	%rd3, %rd4;
+	cvta.to.global.u64 	%rd12, %rd5;
+	mul.wide.s32 	%rd13, %r1, 4;
+	add.s64 	%rd14, %rd12, %rd13;
+	ld.global.u32 	%r2, [%rd14];
 	setp.lt.s32	%p2, %r2, %r6;
 	setp.gt.s32	%p3, %r2, %r7;
 	or.pred  	%p4, %p2, %p3;
@@ -1007,24 +995,23 @@ BB9_6:
 
 BB10_3:
 	mov.u32 	%r3, %r21;
-	add.s32 	%r4, %r3, 1;
-	mul.wide.s32 	%rd16, %r4, 4;
-	add.s64 	%rd17, %rd1, %rd16;
-	ld.global.u32 	%r16, [%rd17];
+	add.s32 	%r21, %r3, 1;
+	mul.wide.s32 	%rd15, %r21, 4;
+	add.s64 	%rd16, %rd1, %rd15;
+	ld.global.u32 	%r16, [%rd16];
 	setp.le.s32	%p5, %r16, %r1;
-	mov.u32 	%r21, %r4;
 	@%p5 bra 	BB10_3;
 
-	shl.b64 	%rd18, %rd4, 3;
-	add.s64 	%rd19, %rd3, %rd18;
-	ld.global.f64 	%fd1, [%rd19];
+	mul.wide.s32 	%rd17, %r1, 8;
+	add.s64 	%rd18, %rd3, %rd17;
+	ld.global.f64 	%fd1, [%rd18];
 	sub.s32 	%r17, %r3, %r5;
 	mul.lo.s32 	%r18, %r17, %r8;
 	sub.s32 	%r19, %r18, %r6;
 	add.s32 	%r20, %r19, %r2;
-	mul.wide.s32 	%rd20, %r20, 8;
-	add.s64 	%rd21, %rd2, %rd20;
-	st.global.f64 	[%rd21], %fd1;
+	mul.wide.s32 	%rd19, %r20, 8;
+	add.s64 	%rd20, %rd2, %rd19;
+	st.global.f64 	[%rd20], %fd1;
 
 BB10_5:
 	ret;
@@ -1046,13 +1033,13 @@ BB10_5:
 	.reg .pred 	%p<6>;
 	.reg .f32 	%f<2>;
 	.reg .b32 	%r<22>;
-	.reg .b64 	%rd<22>;
+	.reg .b64 	%rd<21>;
 
 
-	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_0];
-	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_f_param_1];
-	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_2];
-	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_3];
+	ld.param.u64 	%rd4, [slice_sparse_dense_nnz_f_param_0];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_1];
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_2];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_3];
 	ld.param.u32 	%r5, [slice_sparse_dense_nnz_f_param_4];
 	ld.param.u32 	%r9, [slice_sparse_dense_nnz_f_param_5];
 	ld.param.u32 	%r6, [slice_sparse_dense_nnz_f_param_6];
@@ -1062,24 +1049,23 @@ BB10_5:
 	mov.u32 	%r11, %ctaid.x;
 	mov.u32 	%r12, %tid.x;
 	mad.lo.s32 	%r13, %r10, %r11, %r12;
-	cvta.to.global.u64 	%rd1, %rd8;
-	mul.wide.s32 	%rd9, %r5, 4;
-	add.s64 	%rd10, %rd1, %rd9;
-	ld.global.u32 	%r14, [%rd10];
+	cvta.to.global.u64 	%rd1, %rd7;
+	mul.wide.s32 	%rd8, %r5, 4;
+	add.s64 	%rd9, %rd1, %rd8;
+	ld.global.u32 	%r14, [%rd9];
 	add.s32 	%r1, %r13, %r14;
-	mul.wide.s32 	%rd11, %r9, 4;
-	add.s64 	%rd12, %rd1, %rd11;
-	ld.global.u32 	%r15, [%rd12+4];
+	mul.wide.s32 	%rd10, %r9, 4;
+	add.s64 	%rd11, %rd1, %rd10;
+	ld.global.u32 	%r15, [%rd11+4];
 	setp.ge.s32	%p1, %r1, %r15;
 	@%p1 bra 	BB11_5;
 
-	cvta.to.global.u64 	%rd2, %rd7;
-	cvta.to.global.u64 	%rd3, %rd5;
-	cvta.to.global.u64 	%rd13, %rd6;
-	cvt.s64.s32	%rd4, %r1;
-	mul.wide.s32 	%rd14, %r1, 4;
-	add.s64 	%rd15, %rd13, %rd14;
-	ld.global.u32 	%r2, [%rd15];
+	cvta.to.global.u64 	%rd2, %rd6;
+	cvta.to.global.u64 	%rd3, %rd4;
+	cvta.to.global.u64 	%rd12, %rd5;
+	mul.wide.s32 	%rd13, %r1, 4;
+	add.s64 	%rd14, %rd12, %rd13;
+	ld.global.u32 	%r2, [%rd14];
 	setp.lt.s32	%p2, %r2, %r6;
 	setp.gt.s32	%p3, %r2, %r7;
 	or.pred  	%p4, %p2, %p3;
@@ -1089,24 +1075,22 @@ BB10_5:
 
 BB11_3:
 	mov.u32 	%r3, %r21;
-	add.s32 	%r4, %r3, 1;
-	mul.wide.s32 	%rd16, %r4, 4;
-	add.s64 	%rd17, %rd1, %rd16;
-	ld.global.u32 	%r16, [%rd17];
+	add.s32 	%r21, %r3, 1;
+	mul.wide.s32 	%rd15, %r21, 4;
+	add.s64 	%rd16, %rd1, %rd15;
+	ld.global.u32 	%r16, [%rd16];
 	setp.le.s32	%p5, %r16, %r1;
-	mov.u32 	%r21, %r4;
 	@%p5 bra 	BB11_3;
 
-	shl.b64 	%rd18, %rd4, 2;
-	add.s64 	%rd19, %rd3, %rd18;
-	ld.global.f32 	%f1, [%rd19];
+	add.s64 	%rd18, %rd3, %rd13;
+	ld.global.f32 	%f1, [%rd18];
 	sub.s32 	%r17, %r3, %r5;
 	mul.lo.s32 	%r18, %r17, %r8;
 	sub.s32 	%r19, %r18, %r6;
 	add.s32 	%r20, %r19, %r2;
-	mul.wide.s32 	%rd20, %r20, 4;
-	add.s64 	%rd21, %rd2, %rd20;
-	st.global.f32 	[%rd21], %f1;
+	mul.wide.s32 	%rd19, %r20, 4;
+	add.s64 	%rd20, %rd2, %rd19;
+	st.global.f32 	[%rd20], %f1;
 
 BB11_5:
 	ret;
@@ -1409,12 +1393,12 @@ BB17_2:
 	.reg .pred 	%p<5>;
 	.reg .b32 	%r<8>;
 	.reg .f64 	%fd<6>;
-	.reg .b64 	%rd<14>;
+	.reg .b64 	%rd<13>;
 
 
-	ld.param.u64 	%rd2, [relu_backward_d_param_0];
-	ld.param.u64 	%rd3, [relu_backward_d_param_1];
-	ld.param.u64 	%rd4, [relu_backward_d_param_2];
+	ld.param.u64 	%rd1, [relu_backward_d_param_0];
+	ld.param.u64 	%rd2, [relu_backward_d_param_1];
+	ld.param.u64 	%rd3, [relu_backward_d_param_2];
 	ld.param.u32 	%r2, [relu_backward_d_param_3];
 	ld.param.u32 	%r3, [relu_backward_d_param_4];
 	mov.u32 	%r4, %ntid.x;
@@ -1429,25 +1413,22 @@ BB17_2:
 	bra.uni 	BB18_1;
 
 BB18_1:
-	cvta.to.global.u64 	%rd5, %rd2;
-	cvt.s64.s32	%rd1, %r1;
-	mul.wide.s32 	%rd6, %r1, 8;
-	add.s64 	%rd7, %rd5, %rd6;
-	ld.global.f64 	%fd4, [%rd7];
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd4, [%rd6];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
 	@%p4 bra 	BB18_3;
 
-	cvta.to.global.u64 	%rd8, %rd3;
-	shl.b64 	%rd9, %rd1, 3;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.global.f64 	%fd5, [%rd10];
+	cvta.to.global.u64 	%rd7, %rd2;
+	add.s64 	%rd9, %rd7, %rd5;
+	ld.global.f64 	%fd5, [%rd9];
 
 BB18_3:
-	cvta.to.global.u64 	%rd11, %rd4;
-	shl.b64 	%rd12, %rd1, 3;
-	add.s64 	%rd13, %rd11, %rd12;
-	st.global.f64 	[%rd13], %fd5;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd12, %rd10, %rd5;
+	st.global.f64 	[%rd12], %fd5;
 
 BB18_4:
 	ret;
@@ -1465,12 +1446,12 @@ BB18_4:
 	.reg .pred 	%p<5>;
 	.reg .f32 	%f<6>;
 	.reg .b32 	%r<8>;
-	.reg .b64 	%rd<14>;
+	.reg .b64 	%rd<13>;
 
 
-	ld.param.u64 	%rd2, [relu_backward_f_param_0];
-	ld.param.u64 	%rd3, [relu_backward_f_param_1];
-	ld.param.u64 	%rd4, [relu_backward_f_param_2];
+	ld.param.u64 	%rd1, [relu_backward_f_param_0];
+	ld.param.u64 	%rd2, [relu_backward_f_param_1];
+	ld.param.u64 	%rd3, [relu_backward_f_param_2];
 	ld.param.u32 	%r2, [relu_backward_f_param_3];
 	ld.param.u32 	%r3, [relu_backward_f_param_4];
 	mov.u32 	%r4, %ntid.x;
@@ -1485,25 +1466,22 @@ BB18_4:
 	bra.uni 	BB19_1;
 
 BB19_1:
-	cvta.to.global.u64 	%rd5, %rd2;
-	cvt.s64.s32	%rd1, %r1;
-	mul.wide.s32 	%rd6, %r1, 4;
-	add.s64 	%rd7, %rd5, %rd6;
-	ld.global.f32 	%f4, [%rd7];
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f4, [%rd6];
 	mov.f32 	%f5, 0f00000000;
 	setp.leu.f32	%p4, %f4, 0f00000000;
 	@%p4 bra 	BB19_3;
 
-	cvta.to.global.u64 	%rd8, %rd3;
-	shl.b64 	%rd9, %rd1, 2;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.global.f32 	%f5, [%rd10];
+	cvta.to.global.u64 	%rd7, %rd2;
+	add.s64 	%rd9, %rd7, %rd5;
+	ld.global.f32 	%f5, [%rd9];
 
 BB19_3:
-	cvta.to.global.u64 	%rd11, %rd4;
-	shl.b64 	%rd12, %rd1, 2;
-	add.s64 	%rd13, %rd11, %rd12;
-	st.global.f32 	[%rd13], %f5;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd12, %rd10, %rd5;
+	st.global.f32 	[%rd12], %f5;
 
 BB19_4:
 	ret;
@@ -1965,106 +1943,119 @@ BB27_2:
 	.param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-	.reg .pred 	%p<77>;
-	.reg .b32 	%r<56>;
-	.reg .f64 	%fd<55>;
+	.reg .pred 	%p<73>;
+	.reg .b32 	%r<61>;
+	.reg .f64 	%fd<51>;
 	.reg .b64 	%rd<19>;
 
 
 	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_d_param_0];
 	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_d_param_1];
 	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_d_param_2];
-	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_3];
-	ld.param.u32 	%r6, [matrix_matrix_cellwise_op_d_param_4];
-	ld.param.u32 	%r7, [matrix_matrix_cellwise_op_d_param_5];
-	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_d_param_6];
-	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_d_param_7];
-	mov.u32 	%r11, %ctaid.x;
-	mov.u32 	%r12, %ntid.x;
-	mov.u32 	%r13, %tid.x;
-	mad.lo.s32 	%r1, %r12, %r11, %r13;
-	div.s32 	%r2, %r1, %r6;
-	setp.lt.s32	%p2, %r2, %r10;
-	setp.gt.s32	%p3, %r6, -1;
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_d_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_d_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_d_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_d_param_7];
+	mov.u32 	%r15, %ntid.x;
+	mov.u32 	%r16, %ctaid.x;
+	mov.u32 	%r17, %tid.x;
+	mad.lo.s32 	%r18, %r15, %r16, %r17;
+	div.s32 	%r60, %r18, %r10;
+	rem.s32 	%r2, %r18, %r10;
+	setp.lt.s32	%p2, %r60, %r14;
+	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB28_65;
+	@!%p4 bra 	BB28_77;
 	bra.uni 	BB28_1;
 
 BB28_1:
-	rem.s32 	%r14, %r1, %r6;
-	cvta.to.global.u64 	%rd5, %rd2;
-	mad.lo.s32 	%r3, %r2, %r6, %r14;
-	setp.eq.s32	%p5, %r7, 2;
-	selp.b32	%r15, %r14, %r3, %p5;
-	setp.eq.s32	%p6, %r7, 1;
-	selp.b32	%r16, %r2, %r15, %p6;
-	setp.eq.s32	%p7, %r8, 2;
-	selp.b32	%r17, %r14, %r3, %p7;
-	setp.eq.s32	%p8, %r8, 1;
-	selp.b32	%r18, %r2, %r17, %p8;
-	mul.wide.s32 	%rd6, %r16, 8;
-	add.s64 	%rd7, %rd5, %rd6;
-	ld.global.f64 	%fd1, [%rd7];
-	cvta.to.global.u64 	%rd8, %rd3;
-	mul.wide.s32 	%rd9, %r18, 8;
-	add.s64 	%rd10, %rd8, %rd9;
+	mad.lo.s32 	%r3, %r60, %r10, %r2;
+	setp.eq.s32	%p5, %r11, 1;
+	mov.u32 	%r58, %r60;
+	@%p5 bra 	BB28_4;
+
+	setp.ne.s32	%p6, %r11, 2;
+	mov.u32 	%r58, %r3;
+	@%p6 bra 	BB28_4;
+
+	mov.u32 	%r58, %r2;
+
+BB28_4:
+	setp.eq.s32	%p7, %r12, 1;
+	@%p7 bra 	BB28_7;
+
+	setp.ne.s32	%p8, %r12, 2;
+	mov.u32 	%r60, %r3;
+	@%p8 bra 	BB28_7;
+
+	mov.u32 	%r60, %r2;
+
+BB28_7:
+	cvta.to.global.u64 	%rd5, %rd3;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r58, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	ld.global.f64 	%fd1, [%rd8];
+	mul.wide.s32 	%rd9, %r60, 8;
+	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p9, %r9, 8;
-	@%p9 bra 	BB28_18;
+	mov.f64 	%fd50, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p9, %r13, 8;
+	@%p9 bra 	BB28_24;
 
-	setp.gt.s32	%p23, %r9, 3;
-	@%p23 bra 	BB28_10;
+	setp.gt.s32	%p23, %r13, 3;
+	@%p23 bra 	BB28_16;
 
-	setp.gt.s32	%p30, %r9, 1;
-	@%p30 bra 	BB28_7;
+	setp.gt.s32	%p30, %r13, 1;
+	@%p30 bra 	BB28_13;
 
-	setp.eq.s32	%p33, %r9, 0;
-	@%p33 bra 	BB28_63;
-	bra.uni 	BB28_5;
+	setp.eq.s32	%p33, %r13, 0;
+	@%p33 bra 	BB28_75;
+	bra.uni 	BB28_11;
 
-BB28_63:
-	add.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_75:
+	add.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_18:
-	setp.gt.s32	%p10, %r9, 13;
-	@%p10 bra 	BB28_27;
+BB28_24:
+	setp.gt.s32	%p10, %r13, 13;
+	@%p10 bra 	BB28_33;
 
-	setp.gt.s32	%p17, %r9, 10;
-	@%p17 bra 	BB28_23;
+	setp.gt.s32	%p17, %r13, 10;
+	@%p17 bra 	BB28_29;
 
-	setp.eq.s32	%p21, %r9, 9;
-	@%p21 bra 	BB28_45;
-	bra.uni 	BB28_21;
+	setp.eq.s32	%p21, %r13, 9;
+	@%p21 bra 	BB28_53;
+	bra.uni 	BB28_27;
 
-BB28_45:
-	setp.eq.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB28_64;
+BB28_53:
+	setp.eq.f64	%p48, %fd1, %fd2;
+	selp.f64	%fd50, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB28_76;
 
-BB28_10:
-	setp.gt.s32	%p24, %r9, 5;
-	@%p24 bra 	BB28_14;
+BB28_16:
+	setp.gt.s32	%p24, %r13, 5;
+	@%p24 bra 	BB28_20;
 
-	setp.eq.s32	%p28, %r9, 4;
-	@%p28 bra 	BB28_48;
-	bra.uni 	BB28_12;
+	setp.eq.s32	%p28, %r13, 4;
+	@%p28 bra 	BB28_56;
+	bra.uni 	BB28_18;
 
-BB28_48:
+BB28_56:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r4}, %fd1;
+	mov.b64 	{%temp, %r8}, %fd1;
 	}
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r5}, %fd2;
+	mov.b64 	{%temp, %r9}, %fd2;
 	}
-	bfe.u32 	%r31, %r5, 20, 11;
+	bfe.u32 	%r31, %r9, 20, 11;
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p55, %rd1, -9223372036854775808;
+	setp.eq.s64	%p53, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -2081,341 +2072,343 @@ BB28_48:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd53, [retval0+0];
+	ld.param.f64	%fd25, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p56, %r4, 0;
-	and.pred  	%p1, %p56, %p55;
-	@!%p1 bra 	BB28_50;
-	bra.uni 	BB28_49;
+	setp.lt.s32	%p54, %r8, 0;
+	and.pred  	%p1, %p54, %p53;
+	@!%p1 bra 	BB28_58;
+	bra.uni 	BB28_57;
 
-BB28_49:
+BB28_57:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd53;
+	mov.b64 	{%temp, %r33}, %fd25;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd53;
+	mov.b64 	{%r35, %temp}, %fd25;
 	}
-	mov.b64 	%fd53, {%r35, %r34};
+	mov.b64 	%fd25, {%r35, %r34};
 
-BB28_50:
-	mov.f64 	%fd52, %fd53;
-	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
-	@%p57 bra 	BB28_53;
-	bra.uni 	BB28_51;
+BB28_58:
+	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
+	@%p55 bra 	BB28_61;
+	bra.uni 	BB28_59;
 
-BB28_53:
-	selp.b32	%r36, %r4, 0, %p55;
+BB28_61:
+	selp.b32	%r36, %r8, 0, %p53;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p61, %r5, 0;
-	selp.b32	%r38, %r37, %r36, %p61;
+	setp.lt.s32	%p59, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p59;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd52, {%r39, %r38};
-	bra.uni 	BB28_54;
+	mov.b64 	%fd25, {%r39, %r38};
+	bra.uni 	BB28_62;
 
-BB28_27:
-	setp.gt.s32	%p11, %r9, 15;
-	@%p11 bra 	BB28_31;
+BB28_33:
+	setp.gt.s32	%p11, %r13, 15;
+	@%p11 bra 	BB28_37;
 
-	setp.eq.s32	%p15, %r9, 14;
-	@%p15 bra 	BB28_42;
-	bra.uni 	BB28_29;
+	setp.eq.s32	%p15, %r13, 14;
+	@%p15 bra 	BB28_50;
+	bra.uni 	BB28_35;
 
-BB28_42:
+BB28_50:
 	cvt.rni.s64.f64	%rd11, %fd1;
-	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
+	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p47, %r27, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-	bra.uni 	BB28_64;
+	setp.eq.s32	%p45, %r27, 0;
+	selp.f64	%fd50, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB28_76;
 
-BB28_7:
-	setp.eq.s32	%p31, %r9, 2;
-	@%p31 bra 	BB28_62;
-	bra.uni 	BB28_8;
+BB28_13:
+	setp.eq.s32	%p31, %r13, 2;
+	@%p31 bra 	BB28_74;
+	bra.uni 	BB28_14;
 
-BB28_62:
-	mul.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_74:
+	mul.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_23:
-	setp.eq.s32	%p18, %r9, 11;
-	@%p18 bra 	BB28_44;
+BB28_29:
+	setp.eq.s32	%p18, %r13, 11;
+	@%p18 bra 	BB28_52;
 
-	setp.eq.s32	%p19, %r9, 12;
-	@%p19 bra 	BB28_43;
-	bra.uni 	BB28_25;
+	setp.eq.s32	%p19, %r13, 12;
+	@%p19 bra 	BB28_51;
+	bra.uni 	BB28_31;
 
-BB28_43:
-	max.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_51:
+	max.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_14:
-	setp.eq.s32	%p25, %r9, 6;
-	@%p25 bra 	BB28_47;
+BB28_20:
+	setp.eq.s32	%p25, %r13, 6;
+	@%p25 bra 	BB28_55;
 
-	setp.eq.s32	%p26, %r9, 7;
-	@%p26 bra 	BB28_46;
-	bra.uni 	BB28_16;
+	setp.eq.s32	%p26, %r13, 7;
+	@%p26 bra 	BB28_54;
+	bra.uni 	BB28_22;
 
-BB28_46:
-	setp.gt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB28_64;
+BB28_54:
+	setp.gt.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd50, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB28_76;
 
-BB28_31:
-	setp.eq.s32	%p12, %r9, 16;
-	@%p12 bra 	BB28_41;
+BB28_37:
+	setp.eq.s32	%p12, %r13, 16;
+	@%p12 bra 	BB28_49;
 
-	setp.eq.s32	%p13, %r9, 17;
-	@%p13 bra 	BB28_37;
-	bra.uni 	BB28_33;
+	setp.eq.s32	%p13, %r13, 17;
+	@%p13 bra 	BB28_44;
+	bra.uni 	BB28_39;
 
-BB28_37:
-	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
-	or.pred  	%p41, %p39, %p40;
-	mov.f64 	%fd54, 0d7FF8000000000000;
-	@%p41 bra 	BB28_64;
+BB28_44:
+	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
+	or.pred  	%p40, %p38, %p39;
+	mov.f64 	%fd50, 0d7FF8000000000000;
+	@%p40 bra 	BB28_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd39, %fd54;
-	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
-	@%p42 bra 	BB28_64;
+	div.rn.f64 	%fd50, %fd1, %fd2;
+	abs.f64 	%fd39, %fd50;
+	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
+	@%p41 bra 	BB28_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r22, %temp}, %fd54;
+	mov.b64 	{%temp, %r22}, %fd50;
 	}
+	and.b32  	%r23, %r22, 2147483647;
+	setp.ne.s32	%p42, %r23, 2146435072;
+	@%p42 bra 	BB28_48;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd54;
+	mov.b64 	{%r24, %temp}, %fd50;
 	}
-	and.b32  	%r24, %r23, 2147483647;
-	setp.ne.s32	%p43, %r24, 2146435072;
-	setp.ne.s32	%p44, %r22, 0;
-	or.pred  	%p45, %p43, %p44;
-	@!%p45 bra 	BB28_64;
-	bra.uni 	BB28_40;
+	setp.eq.s32	%p43, %r24, 0;
+	@%p43 bra 	BB28_76;
 
-BB28_40:
-	cvt.rmi.f64.f64	%fd40, %fd54;
+BB28_48:
+	cvt.rmi.f64.f64	%fd40, %fd50;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd54, %fd1, %fd41;
-	bra.uni 	BB28_64;
+	sub.f64 	%fd50, %fd1, %fd41;
+	bra.uni 	BB28_76;
 
-BB28_5:
-	setp.eq.s32	%p34, %r9, 1;
-	@%p34 bra 	BB28_6;
-	bra.uni 	BB28_64;
+BB28_11:
+	setp.eq.s32	%p34, %r13, 1;
+	@%p34 bra 	BB28_12;
+	bra.uni 	BB28_76;
 
-BB28_6:
-	sub.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_12:
+	sub.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_21:
-	setp.eq.s32	%p22, %r9, 10;
-	@%p22 bra 	BB28_22;
-	bra.uni 	BB28_64;
+BB28_27:
+	setp.eq.s32	%p22, %r13, 10;
+	@%p22 bra 	BB28_28;
+	bra.uni 	BB28_76;
 
-BB28_22:
-	setp.neu.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB28_64;
+BB28_28:
+	setp.neu.f64	%p47, %fd1, %fd2;
+	selp.f64	%fd50, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB28_76;
 
-BB28_12:
-	setp.eq.s32	%p29, %r9, 5;
-	@%p29 bra 	BB28_13;
-	bra.uni 	BB28_64;
+BB28_18:
+	setp.eq.s32	%p29, %r13, 5;
+	@%p29 bra 	BB28_19;
+	bra.uni 	BB28_76;
 
-BB28_13:
-	setp.lt.f64	%p54, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-	bra.uni 	BB28_64;
+BB28_19:
+	setp.lt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd50, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB28_76;
 
-BB28_29:
-	setp.eq.s32	%p16, %r9, 15;
-	@%p16 bra 	BB28_30;
-	bra.uni 	BB28_64;
+BB28_35:
+	setp.eq.s32	%p16, %r13, 15;
+	@%p16 bra 	BB28_36;
+	bra.uni 	BB28_76;
 
-BB28_30:
+BB28_36:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd54, %fd44, %fd43;
-	bra.uni 	BB28_64;
+	sub.f64 	%fd50, %fd44, %fd43;
+	bra.uni 	BB28_76;
 
-BB28_8:
-	setp.eq.s32	%p32, %r9, 3;
-	@%p32 bra 	BB28_9;
-	bra.uni 	BB28_64;
+BB28_14:
+	setp.eq.s32	%p32, %r13, 3;
+	@%p32 bra 	BB28_15;
+	bra.uni 	BB28_76;
 
-BB28_9:
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_15:
+	div.rn.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_44:
-	min.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB28_64;
+BB28_52:
+	min.f64 	%fd50, %fd1, %fd2;
+	bra.uni 	BB28_76;
 
-BB28_25:
-	setp.eq.s32	%p20, %r9, 13;
-	@%p20 bra 	BB28_26;
-	bra.uni 	BB28_64;
+BB28_31:
+	setp.eq.s32	%p20, %r13, 13;
+	@%p20 bra 	BB28_32;
+	bra.uni 	BB28_76;
 
-BB28_26:
+BB28_32:
 	cvt.rni.s64.f64	%rd13, %fd1;
-	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
+	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p48, %r30, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-	bra.uni 	BB28_64;
+	setp.eq.s32	%p46, %r30, 0;
+	selp.f64	%fd50, 0d0000000000000000, 0d3FF0000000000000, %p46;
+	bra.uni 	BB28_76;
 
-BB28_47:
-	setp.le.f64	%p53, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-	bra.uni 	BB28_64;
+BB28_55:
+	setp.gtu.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd50, 0d0000000000000000, 0d3FF0000000000000, %p51;
+	bra.uni 	BB28_76;
 
-BB28_16:
-	setp.eq.s32	%p27, %r9, 8;
-	@%p27 bra 	BB28_17;
-	bra.uni 	BB28_64;
+BB28_22:
+	setp.eq.s32	%p27, %r13, 8;
+	@%p27 bra 	BB28_23;
+	bra.uni 	BB28_76;
 
-BB28_17:
-	setp.ge.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB28_64;
+BB28_23:
+	setp.ltu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd50, 0d0000000000000000, 0d3FF0000000000000, %p49;
+	bra.uni 	BB28_76;
 
-BB28_41:
-	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
+BB28_49:
+	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
-	bra.uni 	BB28_64;
+	selp.f64	%fd50, %fd42, 0d0000000000000000, %p44;
+	bra.uni 	BB28_76;
 
-BB28_33:
-	setp.ne.s32	%p14, %r9, 18;
-	@%p14 bra 	BB28_64;
+BB28_39:
+	setp.ne.s32	%p14, %r13, 18;
+	@%p14 bra 	BB28_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd37, %fd54;
+	div.rn.f64 	%fd50, %fd1, %fd2;
+	abs.f64 	%fd37, %fd50;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB28_64;
+	@%p35 bra 	BB28_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd54;
+	mov.b64 	{%temp, %r19}, %fd50;
 	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p36, %r20, 2146435072;
+	@%p36 bra 	BB28_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd54;
+	mov.b64 	{%r21, %temp}, %fd50;
 	}
-	and.b32  	%r21, %r20, 2147483647;
-	setp.ne.s32	%p36, %r21, 2146435072;
-	setp.ne.s32	%p37, %r19, 0;
-	or.pred  	%p38, %p36, %p37;
-	@!%p38 bra 	BB28_64;
-	bra.uni 	BB28_36;
+	setp.eq.s32	%p37, %r21, 0;
+	@%p37 bra 	BB28_76;
 
-BB28_36:
-	cvt.rmi.f64.f64	%fd54, %fd54;
-	bra.uni 	BB28_64;
+BB28_43:
+	cvt.rmi.f64.f64	%fd50, %fd50;
+	bra.uni 	BB28_76;
 
-BB28_51:
-	setp.gt.s32	%p58, %r4, -1;
-	@%p58 bra 	BB28_54;
+BB28_59:
+	setp.gt.s32	%p56, %r8, -1;
+	@%p56 bra 	BB28_62;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p59, %fd45, %fd2;
-	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
+	setp.neu.f64	%p57, %fd45, %fd2;
+	selp.f64	%fd25, 0dFFF8000000000000, %fd25, %p57;
 
-BB28_54:
-	mov.f64 	%fd25, %fd52;
-	add.f64 	%fd26, %fd1, %fd2;
+BB28_62:
+	add.f64 	%fd49, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r40}, %fd26;
+	mov.b64 	{%temp, %r40}, %fd49;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p62, %r41, 2146435072;
-	mov.f64 	%fd51, %fd25;
-	@%p62 bra 	BB28_61;
+	setp.ne.s32	%p60, %r41, 2146435072;
+	@%p60 bra 	BB28_63;
 
-	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd51, %fd26;
-	@%p63 bra 	BB28_61;
+	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
+	@%p61 bra 	BB28_73;
 
 	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
-	mov.f64 	%fd50, %fd26;
-	mov.f64 	%fd51, %fd50;
-	@%p64 bra 	BB28_61;
+	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	@%p62 bra 	BB28_73;
+
+	and.b32  	%r42, %r9, 2147483647;
+	setp.ne.s32	%p63, %r42, 2146435072;
+	@%p63 bra 	BB28_68;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r42, %temp}, %fd2;
-	}
-	and.b32  	%r43, %r5, 2147483647;
-	setp.eq.s32	%p65, %r43, 2146435072;
-	setp.eq.s32	%p66, %r42, 0;
-	and.pred  	%p67, %p65, %p66;
-	@%p67 bra 	BB28_60;
-	bra.uni 	BB28_58;
-
-BB28_60:
-	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
-	selp.b32	%r51, 2146435072, 0, %p71;
-	xor.b32  	%r52, %r51, 2146435072;
-	setp.lt.s32	%p72, %r5, 0;
-	selp.b32	%r53, %r52, %r51, %p72;
-	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
-	selp.b32	%r54, 1072693248, %r53, %p73;
-	mov.u32 	%r55, 0;
-	mov.b64 	%fd51, {%r55, %r54};
-	bra.uni 	BB28_61;
+	mov.b64 	{%r43, %temp}, %fd2;
+	}
+	setp.eq.s32	%p64, %r43, 0;
+	@%p64 bra 	BB28_72;
+
+BB28_68:
+	and.b32  	%r44, %r8, 2147483647;
+	setp.ne.s32	%p65, %r44, 2146435072;
+	@%p65 bra 	BB28_69;
 
-BB28_58:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r44, %temp}, %fd1;
-	}
-	and.b32  	%r45, %r4, 2147483647;
-	setp.eq.s32	%p68, %r45, 2146435072;
-	setp.eq.s32	%p69, %r44, 0;
-	and.pred  	%p70, %p68, %p69;
-	mov.f64 	%fd51, %fd25;
-	@!%p70 bra 	BB28_61;
-	bra.uni 	BB28_59;
+	mov.b64 	{%r45, %temp}, %fd1;
+	}
+	setp.ne.s32	%p66, %r45, 0;
+	mov.f64 	%fd49, %fd25;
+	@%p66 bra 	BB28_73;
 
-BB28_59:
-	shr.s32 	%r46, %r5, 31;
+	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	selp.b32	%r48, -1048576, 2146435072, %p1;
-	add.s32 	%r49, %r48, %r47;
-	mov.u32 	%r50, 0;
-	mov.b64 	%fd51, {%r50, %r49};
+	add.s32 	%r48, %r47, 2146435072;
+	or.b32  	%r49, %r48, -2147483648;
+	selp.b32	%r50, %r49, %r48, %p1;
+	mov.u32 	%r51, 0;
+	mov.b64 	%fd49, {%r51, %r50};
+	bra.uni 	BB28_73;
 
-BB28_61:
-	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
-	or.pred  	%p76, %p75, %p74;
-	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
+BB28_63:
+	mov.f64 	%fd49, %fd25;
+
+BB28_73:
+	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd50, 0d3FF0000000000000, %fd49, %p72;
 
-BB28_64:
+BB28_76:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd54;
+	st.global.f64 	[%rd18], %fd50;
 	bar.sync 	0;
 
-BB28_65:
+BB28_77:
 	ret;
+
+BB28_69:
+	mov.f64 	%fd49, %fd25;
+	bra.uni 	BB28_73;
+
+BB28_72:
+	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
+	selp.b32	%r52, 2146435072, 0, %p67;
+	xor.b32  	%r53, %r52, 2146435072;
+	setp.lt.s32	%p68, %r9, 0;
+	selp.b32	%r54, %r53, %r52, %p68;
+	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
+	selp.b32	%r55, 1072693248, %r54, %p69;
+	mov.u32 	%r56, 0;
+	mov.b64 	%fd49, {%r56, %r55};
+	bra.uni 	BB28_73;
 }
 
 	// .globl	matrix_matrix_cellwise_op_f
@@ -2431,425 +2424,436 @@ BB28_65:
 )
 {
 	.reg .pred 	%p<76>;
-	.reg .f32 	%f<134>;
-	.reg .b32 	%r<42>;
+	.reg .f32 	%f<135>;
+	.reg .b32 	%r<46>;
 	.reg .b64 	%rd<17>;
 
 
 	ld.param.u64 	%rd1, [matrix_matrix_cellwise_op_f_param_0];
 	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_f_param_1];
 	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_f_param_2];
-	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_3];
-	ld.param.u32 	%r4, [matrix_matrix_cellwise_op_f_param_4];
-	ld.param.u32 	%r5, [matrix_matrix_cellwise_op_f_param_5];
-	ld.param.u32 	%r6, [matrix_matrix_cellwise_op_f_param_6];
-	ld.param.u32 	%r7, [matrix_matrix_cellwise_op_f_param_7];
-	mov.u32 	%r9, %ntid.x;
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %tid.x;
-	mad.lo.s32 	%r1, %r9, %r10, %r11;
-	div.s32 	%r2, %r1, %r4;
-	setp.lt.s32	%p2, %r2, %r8;
-	setp.gt.s32	%p3, %r4, -1;
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_f_param_3];
+	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_4];
+	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_f_param_5];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_f_param_6];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_f_param_7];
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %ctaid.x;
+	mov.u32 	%r15, %tid.x;
+	mad.lo.s32 	%r16, %r13, %r14, %r15;
+	div.s32 	%r45, %r16, %r8;
+	rem.s32 	%r2, %r16, %r8;
+	setp.lt.s32	%p2, %r45, %r12;
+	setp.gt.s32	%p3, %r8, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB29_63;
+	@!%p4 bra 	BB29_69;
 	bra.uni 	BB29_1;
 
 BB29_1:
-	rem.s32 	%r12, %r1, %r4;
-	cvta.to.global.u64 	%rd4, %rd1;
-	mad.lo.s32 	%r3, %r2, %r4, %r12;
-	setp.eq.s32	%p5, %r5, 2;
-	selp.b32	%r13, %r12, %r3, %p5;
-	setp.eq.s32	%p6, %r5, 1;
-	selp.b32	%r14, %r2, %r13, %p6;
-	setp.eq.s32	%p7, %r6, 2;
-	selp.b32	%r15, %r12, %r3, %p7;
-	setp.eq.s32	%p8, %r6, 1;
-	selp.b32	%r16, %r2, %r15, %p8;
-	mul.wide.s32 	%rd5, %r14, 4;
-	add.s64 	%rd6, %rd4, %rd5;
-	ld.global.f32 	%f1, [%rd6];
-	cvta.to.global.u64 	%rd7, %rd2;
-	mul.wide.s32 	%rd8, %r16, 4;
-	add.s64 	%rd9, %rd7, %rd8;
+	mad.lo.s32 	%r3, %r45, %r8, %r2;
+	setp.eq.s32	%p5, %r9, 1;
+	mov.u32 	%r43, %r45;
+	@%p5 bra 	BB29_4;
+
+	setp.ne.s32	%p6, %r9, 2;
+	mov.u32 	%r43, %r3;
+	@%p6 bra 	BB29_4;
+
+	mov.u32 	%r43, %r2;
+
+BB29_4:
+	setp.eq.s32	%p7, %r10, 1;
+	@%p7 bra 	BB29_7;
+
+	setp.ne.s32	%p8, %r10, 2;
+	mov.u32 	%r45, %r3;
+	@%p8 bra 	BB29_7;
+
+	mov.u32 	%r45, %r2;
+
+BB29_7:
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvta.to.global.u64 	%rd5, %rd1;
+	mul.wide.s32 	%rd6, %r43, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f1, [%rd7];
+	mul.wide.s32 	%rd8, %r45, 4;
+	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f32 	%f2, [%rd9];
-	mov.f32 	%f133, 0f7F7FFFFF;
-	setp.gt.s32	%p9, %r7, 8;
-	@%p9 bra 	BB29_18;
+	mov.f32 	%f134, 0f7F7FFFFF;
+	setp.gt.s32	%p9, %r11, 8;
+	@%p9 bra 	BB29_24;
 
-	setp.gt.s32	%p23, %r7, 3;
-	@%p23 bra 	BB29_10;
+	setp.gt.s32	%p23, %r11, 3;
+	@%p23 bra 	BB29_16;
 
-	setp.gt.s32	%p30, %r7, 1;
-	@%p30 bra 	BB29_7;
+	setp.gt.s32	%p30, %r11, 1;
+	@%p30 bra 	BB29_13;
 
-	setp.eq.s32	%p33, %r7, 0;
-	@%p33 bra 	BB29_61;
-	bra.uni 	BB29_5;
+	setp.eq.s32	%p33, %r11, 0;
+	@%p33 bra 	BB29_67;
+	bra.uni 	BB29_11;
 
-BB29_61:
-	add.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_67:
+	add.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_18:
-	setp.gt.s32	%p10, %r7, 13;
-	@%p10 bra 	BB29_27;
+BB29_24:
+	setp.gt.s32	%p10, %r11, 13;
+	@%p10 bra 	BB29_33;
 
-	setp.gt.s32	%p17, %r7, 10;
-	@%p17 bra 	BB29_23;
+	setp.gt.s32	%p17, %r11, 10;
+	@%p17 bra 	BB29_29;
 
-	setp.eq.s32	%p21, %r7, 9;
-	@%p21 bra 	BB29_43;
-	bra.uni 	BB29_21;
+	setp.eq.s32	%p21, %r11, 9;
+	@%p21 bra 	BB29_49;
+	bra.uni 	BB29_27;
 
-BB29_43:
+BB29_49:
 	setp.eq.f32	%p44, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p44;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f3F800000, 0f00000000, %p44;
+	bra.uni 	BB29_68;
 
-BB29_10:
-	setp.gt.s32	%p24, %r7, 5;
-	@%p24 bra 	BB29_14;
+BB29_16:
+	setp.gt.s32	%p24, %r11, 5;
+	@%p24 bra 	BB29_20;
 
-	setp.eq.s32	%p28, %r7, 4;
-	@%p28 bra 	BB29_46;
-	bra.uni 	BB29_12;
+	setp.eq.s32	%p28, %r11, 4;
+	@%p28 bra 	BB29_52;
+	bra.uni 	BB29_18;
 
-BB29_46:
-	mul.f32 	%f53, %f2, 0f3F000000;
-	cvt.rzi.f32.f32	%f54, %f53;
-	fma.rn.f32 	%f55, %f54, 0fC0000000, %f2;
-	abs.f32 	%f19, %f55;
+BB29_52:
+	mul.f32 	%f51, %f2, 0f3F000000;
+	cvt.rzi.f32.f32	%f52, %f51;
+	fma.rn.f32 	%f53, %f52, 0fC0000000, %f2;
+	abs.f32 	%f19, %f53;
 	abs.f32 	%f20, %f1;
 	setp.lt.f32	%p49, %f20, 0f00800000;
-	mul.f32 	%f56, %f20, 0f4B800000;
-	selp.f32	%f57, 0fC3170000, 0fC2FE0000, %p49;
-	selp.f32	%f58, %f56, %f20, %p49;
-	mov.b32 	 %r23, %f58;
+	mul.f32 	%f54, %f20, 0f4B800000;
+	selp.f32	%f55, 0fC3170000, 0fC2FE0000, %p49;
+	selp.f32	%f56, %f54, %f20, %p49;
+	mov.b32 	 %r23, %f56;
 	and.b32  	%r24, %r23, 8388607;
 	or.b32  	%r25, %r24, 1065353216;
-	mov.b32 	 %f59, %r25;
+	mov.b32 	 %f57, %r25;
 	shr.u32 	%r26, %r23, 23;
-	cvt.rn.f32.u32	%f60, %r26;
-	add.f32 	%f61, %f57, %f60;
-	setp.gt.f32	%p50, %f59, 0f3FB504F3;
-	mul.f32 	%f62, %f59, 0f3F000000;
-	add.f32 	%f63, %f61, 0f3F800000;
-	selp.f32	%f64, %f62, %f59, %p50;
-	selp.f32	%f65, %f63, %f61, %p50;
-	add.f32 	%f66, %f64, 0fBF800000;
-	add.f32 	%f50, %f64, 0f3F800000;
+	cvt.rn.f32.u32	%f58, %r26;
+	add.f32 	%f59, %f55, %f58;
+	setp.gt.f32	%p50, %f57, 0f3FB504F3;
+	mul.f32 	%f60, %f57, 0f3F000000;
+	add.f32 	%f61, %f59, 0f3F800000;
+	selp.f32	%f62, %f60, %f57, %p50;
+	selp.f32	%f63, %f61, %f59, %p50;
+	add.f32 	%f64, %f62, 0fBF800000;
+	add.f32 	%f50, %f62, 0f3F800000;
 	// inline asm
 	rcp.approx.ftz.f32 %f49,%f50;
 	// inline asm
-	add.f32 	%f67, %f66, %f66;
-	mul.f32 	%f68, %f49, %f67;
-	mul.f32 	%f69, %f68, %f68;
-	mov.f32 	%f70, 0f3C4CAF63;
-	mov.f32 	%f71, 0f3B18F0FE;
-	fma.rn.f32 	%f72, %f71, %f69, %f70;
-	mov.f32 	%f73, 0f3DAAAABD;
-	fma.rn.f32 	%f74, %f72, %f69, %f73;
-	mul.rn.f32 	%f75, %f74, %f69;
-	mul.rn.f32 	%f76, %f75, %f68;
-	sub.f32 	%f77, %f66, %f68;
-	neg.f32 	%f78, %f68;
-	add.f32 	%f79, %f77, %f77;
-	fma.rn.f32 	%f80, %f78, %f66, %f79;
-	mul.rn.f32 	%f81, %f49, %f80;
-	add.f32 	%f82, %f76, %f68;
-	sub.f32 	%f83, %f68, %f82;
-	add.f32 	%f84, %f76, %f83;
-	add.f32 	%f85, %f81, %f84;
-	add.f32 	%f86, %f82, %f85;
-	sub.f32 	%f87, %f82, %f86;
-	add.f32 	%f88, %f85, %f87;
-	mov.f32 	%f89, 0f3F317200;
-	mul.rn.f32 	%f90, %f65, %f89;
-	mov.f32 	%f91, 0f35BFBE8E;
-	mul.rn.f32 	%f92, %f65, %f91;
-	add.f32 	%f93, %f90, %f86;
-	sub.f32 	%f94, %f90, %f93;
-	add.f32 	%f95, %f86, %f94;
-	add.f32 	%f96, %f88, %f95;
-	add.f32 	%f97, %f92, %f96;
-	add.f32 	%f98, %f93, %f97;
-	sub.f32 	%f99, %f93, %f98;
-	add.f32 	%f100, %f97, %f99;
+	add.f32 	%f65, %f64, %f64;
+	mul.f32 	%f66, %f49, %f65;
+	mul.f32 	%f67, %f66, %f66;
+	mov.f32 	%f68, 0f3C4CAF63;
+	mov.f32 	%f69, 0f3B18F0FE;
+	fma.rn.f32 	%f70, %f69, %f67, %f68;
+	mov.f32 	%f71, 0f3DAAAABD;
+	fma.rn.f32 	%f72, %f70, %f67, %f71;
+	mul.rn.f32 	%f73, %f72, %f67;
+	mul.rn.f32 	%f74, %f73, %f66;
+	sub.f32 	%f75, %f64, %f66;
+	neg.f32 	%f76, %f66;
+	add.f32 	%f77, %f75, %f75;
+	fma.rn.f32 	%f78, %f76, %f64, %f77;
+	mul.rn.f32 	%f79, %f49, %f78;
+	add.f32 	%f80, %f74, %f66;
+	sub.f32 	%f81, %f66, %f80;
+	add.f32 	%f82, %f74, %f81;
+	add.f32 	%f83, %f79, %f82;
+	add.f32 	%f84, %f80, %f83;
+	sub.f32 	%f85, %f80, %f84;
+	add.f32 	%f86, %f83, %f85;
+	mov.f32 	%f87, 0f3F317200;
+	mul.rn.f32 	%f88, %f63, %f87;
+	mov.f32 	%f89, 0f35BFBE8E;
+	mul.rn.f32 	%f90, %f63, %f89;
+	add.f32 	%f91, %f88, %f84;
+	sub.f32 	%f92, %f88, %f91;
+	add.f32 	%f93, %f84, %f92;
+	add.f32 	%f94, %f86, %f93;
+	add.f32 	%f95, %f90, %f94;
+	add.f32 	%f96, %f91, %f95;
+	sub.f32 	%f97, %f91, %f96;
+	add.f32 	%f98, %f95, %f97;
 	abs.f32 	%f21, %f2;
 	setp.gt.f32	%p51, %f21, 0f77F684DF;
-	mul.f32 	%f101, %f2, 0f39000000;
-	selp.f32	%f102, %f101, %f2, %p51;
-	mul.rn.f32 	%f103, %f102, %f98;
-	neg.f32 	%f104, %f103;
-	fma.rn.f32 	%f105, %f102, %f98, %f104;
-	fma.rn.f32 	%f106, %f102, %f100, %f105;
-	mov.f32 	%f107, 0f00000000;
-	fma.rn.f32 	%f108, %f107, %f98, %f106;
-	add.rn.f32 	%f109, %f103, %f108;
-	neg.f32 	%f110, %f109;
-	add.rn.f32 	%f111, %f103, %f110;
-	add.rn.f32 	%f112, %f111, %f108;
-	mov.b32 	 %r27, %f109;
+	mul.f32 	%f99, %f2, 0f39000000;
+	selp.f32	%f100, %f99, %f2, %p51;
+	mul.rn.f32 	%f101, %f100, %f96;
+	neg.f32 	%f102, %f101;
+	fma.rn.f32 	%f103, %f100, %f96, %f102;
+	fma.rn.f32 	%f104, %f100, %f98, %f103;
+	mov.f32 	%f105, 0f00000000;
+	fma.rn.f32 	%f106, %f105, %f96, %f104;
+	add.rn.f32 	%f107, %f101, %f106;
+	neg.f32 	%f108, %f107;
+	add.rn.f32 	%f109, %f101, %f108;
+	add.rn.f32 	%f110, %f109, %f106;
+	mov.b32 	 %r27, %f107;
 	setp.eq.s32	%p52, %r27, 1118925336;
 	add.s32 	%r28, %r27, -1;
-	mov.b32 	 %f113, %r28;
-	add.f32 	%f114, %f112, 0f37000000;
-	selp.f32	%f115, %f113, %f109, %p52;
-	selp.f32	%f22, %f114, %f112, %p52;
-	mul.f32 	%f116, %f115, 0f3FB8AA3B;
-	cvt.rzi.f32.f32	%f117, %f116;
-	mov.f32 	%f118, 0fBF317200;
-	fma.rn.f32 	%f119, %f117, %f118, %f115;
-	mov.f32 	%f120, 0fB5BFBE8E;
-	fma.rn.f32 	%f121, %f117, %f120, %f119;
-	mul.f32 	%f52, %f121, 0f3FB8AA3B;
-	// inline asm
-	ex2.approx.ftz.f32 %f51,%f52;
-	// inline asm
-	add.f32 	%f122, %f117, 0f00000000;
+	mov.b32 	 %f111, %r28;
+	add.f32 	%f112, %f110, 0f37000000;
+	selp.f32	%f113, %f111, %f107, %p52;
+	selp.f32	%f22, %f112, %f110, %p52;
+	mul.f32 	%f114, %f113, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f115, %f114;
+	mov.f32 	%f116, 0fBF317200;
+	fma.rn.f32 	%f117, %f115, %f116, %f113;
+	mov.f32 	%f118, 0fB5BFBE8E;
+	fma.rn.f32 	%f119, %f115, %f118, %f117;
+	mul.f32 	%f120, %f119, 0f3FB8AA3B;
+	ex2.approx.ftz.f32 	%f121, %f120;
+	add.f32 	%f122, %f115, 0f00000000;
 	ex2.approx.f32 	%f123, %f122;
-	mul.f32 	%f124, %f51, %f123;
-	setp.lt.f32	%p53, %f115, 0fC2D20000;
+	mul.f32 	%f124, %f121, %f123;
+	setp.lt.f32	%p53, %f113, 0fC2D20000;
 	selp.f32	%f125, 0f00000000, %f124, %p53;
-	setp.gt.f32	%p54, %f115, 0f42D20000;
+	setp.gt.f32	%p54, %f113, 0f42D20000;
 	selp.f32	%f131, 0f7F800000, %f125, %p54;
 	setp.eq.f32	%p55, %f131, 0f7F800000;
-	@%p55 bra 	BB29_48;
+	@%p55 bra 	BB29_54;
 
 	fma.rn.f32 	%f131, %f131, %f22, %f131;
 
-BB29_48:
+BB29_54:
 	setp.lt.f32	%p56, %f1, 0f00000000;
 	setp.eq.f32	%p57, %f19, 0f3F800000;
 	and.pred  	%p1, %p56, %p57;
 	mov.b32 	 %r29, %f131;
 	xor.b32  	%r30, %r29, -2147483648;
 	mov.b32 	 %f126, %r30;
-	selp.f32	%f132, %f126, %f131, %p1;
+	selp.f32	%f133, %f126, %f131, %p1;
 	setp.eq.f32	%p58, %f1, 0f00000000;
-	@%p58 bra 	BB29_51;
-	bra.uni 	BB29_49;
+	@%p58 bra 	BB29_57;
+	bra.uni 	BB29_55;
 
-BB29_51:
+BB29_57:
 	add.f32 	%f128, %f1, %f1;
 	mov.b32 	 %r31, %f128;
 	selp.b32	%r32, %r31, 0, %p57;
 	or.b32  	%r33, %r32, 2139095040;
 	setp.lt.f32	%p62, %f2, 0f00000000;
 	selp.b32	%r34, %r33, %r32, %p62;
-	mov.b32 	 %f132, %r34;
-	bra.uni 	BB29_52;
+	mov.b32 	 %f133, %r34;
+	bra.uni 	BB29_58;
 
-BB29_27:
-	setp.gt.s32	%p11, %r7, 15;
-	@%p11 bra 	BB29_31;
+BB29_33:
+	setp.gt.s32	%p11, %r11, 15;
+	@%p11 bra 	BB29_37;
 
-	setp.eq.s32	%p15, %r7, 14;
-	@%p15 bra 	BB29_40;
-	bra.uni 	BB29_29;
+	setp.eq.s32	%p15, %r11, 14;
+	@%p15 bra 	BB29_46;
+	bra.uni 	BB29_35;
 
-BB29_40:
+BB29_46:
 	cvt.rni.s64.f32	%rd10, %f1;
-	cvt.rni.s64.f32	%rd11, %f2;
 	cvt.u32.u64	%r17, %rd10;
+	cvt.rni.s64.f32	%rd11, %f2;
 	cvt.u32.u64	%r18, %rd11;
 	or.b32  	%r19, %r18, %r17;
 	setp.eq.s32	%p41, %r19, 0;
-	selp.f32	%f133, 0f00000000, 0f3F800000, %p41;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f00000000, 0f3F800000, %p41;
+	bra.uni 	BB29_68;
 
-BB29_7:
-	setp.eq.s32	%p31, %r7, 2;
-	@%p31 bra 	BB29_60;
-	bra.uni 	BB29_8;
+BB29_13:
+	setp.eq.s32	%p31, %r11, 2;
+	@%p31 bra 	BB29_66;
+	bra.uni 	BB29_14;
 
-BB29_60:
-	mul.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_66:
+	mul.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_23:
-	setp.eq.s32	%p18, %r7, 11;
-	@%p18 bra 	BB29_42;
+BB29_29:
+	setp.eq.s32	%p18, %r11, 11;
+	@%p18 bra 	BB29_48;
 
-	setp.eq.s32	%p19, %r7, 12;
-	@%p19 bra 	BB29_41;
-	bra.uni 	BB29_25;
+	setp.eq.s32	%p19, %r11, 12;
+	@%p19 bra 	BB29_47;
+	bra.uni 	BB29_31;
 
-BB29_41:
-	max.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_47:
+	max.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_14:
-	setp.eq.s32	%p25, %r7, 6;
-	@%p25 bra 	BB29_45;
+BB29_20:
+	setp.eq.s32	%p25, %r11, 6;
+	@%p25 bra 	BB29_51;
 
-	setp.eq.s32	%p26, %r7, 7;
-	@%p26 bra 	BB29_44;
-	bra.uni 	BB29_16;
+	setp.eq.s32	%p26, %r11, 7;
+	@%p26 bra 	BB29_50;
+	bra.uni 	BB29_22;
 
-BB29_44:
+BB29_50:
 	setp.gt.f32	%p46, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p46;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f3F800000, 0f00000000, %p46;
+	bra.uni 	BB29_68;
 
-BB29_31:
-	setp.eq.s32	%p12, %r7, 16;
-	@%p12 bra 	BB29_39;
+BB29_37:
+	setp.eq.s32	%p12, %r11, 16;
+	@%p12 bra 	BB29_45;
 
-	setp.eq.s32	%p13, %r7, 17;
-	@%p13 bra 	BB29_36;
-	bra.uni 	BB29_33;
+	setp.eq.s32	%p13, %r11, 17;
+	@%p13 bra 	BB29_42;
+	bra.uni 	BB29_39;
 
-BB29_36:
+BB29_42:
 	setp.eq.f32	%p36, %f2, 0f00000000;
 	setp.eq.f32	%p37, %f2, 0f80000000;
 	or.pred  	%p38, %p36, %p37;
-	mov.f32 	%f133, 0f7FC00000;
-	@%p38 bra 	BB29_62;
+	mov.f32 	%f134, 0f7FC00000;
+	@%p38 bra 	BB29_68;
 
-	div.rn.f32 	%f133, %f1, %f2;
-	abs.f32 	%f43, %f133;
+	div.rn.f32 	%f134, %f1, %f2;
+	abs.f32 	%f43, %f134;
 	setp.geu.f32	%p39, %f43, 0f7F800000;
-	@%p39 bra 	BB29_62;
+	@%p39 bra 	BB29_68;
 
-	cvt.rmi.f32.f32	%f44, %f133;
+	cvt.rmi.f32.f32	%f44, %f134;
 	mul.f32 	%f45, %f2, %f44;
-	sub.f32 	%f133, %f1, %f45;
-	bra.uni 	BB29_62;
+	sub.f32 	%f134, %f1, %f45;
+	bra.uni 	BB29_68;
 
-BB29_5:
-	setp.eq.s32	%p34, %r7, 1;
-	@%p34 bra 	BB29_6;
-	bra.uni 	BB29_62;
+BB29_11:
+	setp.eq.s32	%p34, %r11, 1;
+	@%p34 bra 	BB29_12;
+	bra.uni 	BB29_68;
 
-BB29_6:
-	sub.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_12:
+	sub.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_21:
-	setp.eq.s32	%p22, %r7, 10;
-	@%p22 bra 	BB29_22;
-	bra.uni 	BB29_62;
+BB29_27:
+	setp.eq.s32	%p22, %r11, 10;
+	@%p22 bra 	BB29_28;
+	bra.uni 	BB29_68;
 
-BB29_22:
+BB29_28:
 	setp.neu.f32	%p43, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p43;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f3F800000, 0f00000000, %p43;
+	bra.uni 	BB29_68;
 
-BB29_12:
-	setp.eq.s32	%p29, %r7, 5;
-	@%p29 bra 	BB29_13;
-	bra.uni 	BB29_62;
+BB29_18:
+	setp.eq.s32	%p29, %r11, 5;
+	@%p29 bra 	BB29_19;
+	bra.uni 	BB29_68;
 
-BB29_13:
+BB29_19:
 	setp.lt.f32	%p48, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p48;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f3F800000, 0f00000000, %p48;
+	bra.uni 	BB29_68;
 
-BB29_29:
-	setp.eq.s32	%p16, %r7, 15;
-	@%p16 bra 	BB29_30;
-	bra.uni 	BB29_62;
+BB29_35:
+	setp.eq.s32	%p16, %r11, 15;
+	@%p16 bra 	BB29_36;
+	bra.uni 	BB29_68;
 
-BB29_30:
+BB29_36:
 	mul.f32 	%f47, %f1, %f2;
 	mov.f32 	%f48, 0f3F800000;
-	sub.f32 	%f133, %f48, %f47;
-	bra.uni 	BB29_62;
+	sub.f32 	%f134, %f48, %f47;
+	bra.uni 	BB29_68;
 
-BB29_8:
-	setp.eq.s32	%p32, %r7, 3;
-	@%p32 bra 	BB29_9;
-	bra.uni 	BB29_62;
+BB29_14:
+	setp.eq.s32	%p32, %r11, 3;
+	@%p32 bra 	BB29_15;
+	bra.uni 	BB29_68;
 
-BB29_9:
-	div.rn.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_15:
+	div.rn.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_42:
-	min.f32 	%f133, %f1, %f2;
-	bra.uni 	BB29_62;
+BB29_48:
+	min.f32 	%f134, %f1, %f2;
+	bra.uni 	BB29_68;
 
-BB29_25:
-	setp.eq.s32	%p20, %r7, 13;
-	@%p20 bra 	BB29_26;
-	bra.uni 	BB29_62;
+BB29_31:
+	setp.eq.s32	%p20, %r11, 13;
+	@%p20 bra 	BB29_32;
+	bra.uni 	BB29_68;
 
-BB29_26:
+BB29_32:
 	cvt.rni.s64.f32	%rd12, %f1;
-	cvt.rni.s64.f32	%rd13, %f2;
 	cvt.u32.u64	%r20, %rd12;
+	cvt.rni.s64.f32	%rd13, %f2;
 	cvt.u32.u64	%r21, %rd13;
 	and.b32  	%r22, %r21, %r20;
 	setp.eq.s32	%p42, %r22, 0;
-	selp.f32	%f133, 0f00000000, 0f3F800000, %p42;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, 0f00000000, 0f3F800000, %p42;
+	bra.uni 	BB29_68;
 
-BB29_45:
-	setp.le.f32	%p47, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p47;
-	bra.uni 	BB29_62;
+BB29_51:
+	setp.gtu.f32	%p47, %f1, %f2;
+	selp.f32	%f134, 0f00000000, 0f3F800000, %p47;
+	bra.uni 	BB29_68;
 
-BB29_16:
-	setp.eq.s32	%p27, %r7, 8;
-	@%p27 bra 	BB29_17;
-	bra.uni 	BB29_62;
+BB29_22:
+	setp.eq.s32	%p27, %r11, 8;
+	@%p27 bra 	BB29_23;
+	bra.uni 	BB29_68;
 
-BB29_17:
-	setp.ge.f32	%p45, %f1, %f2;
-	selp.f32	%f133, 0f3F800000, 0f00000000, %p45;
-	bra.uni 	BB29_62;
+BB29_23:
+	setp.ltu.f32	%p45, %f1, %f2;
+	selp.f32	%f134, 0f00000000, 0f3F800000, %p45;
+	bra.uni 	BB29_68;
 
-BB29_39:
+BB29_45:
 	setp.neu.f32	%p40, %f1, 0f00000000;
 	sub.f32 	%f46, %f1, %f2;
-	selp.f32	%f133, %f46, 0f00000000, %p40;
-	bra.uni 	BB29_62;
+	selp.f32	%f134, %f46, 0f00000000, %p40;
+	bra.uni 	BB29_68;
 
-BB29_33:
-	setp.ne.s32	%p14, %r7, 18;
-	@%p14 bra 	BB29_62;
+BB29_39:
+	setp.ne.s32	%p14, %r11, 18;
+	@%p14 bra 	BB29_68;
 
-	div.rn.f32 	%f133, %f1, %f2;
-	abs.f32 	%f41, %f133;
+	div.rn.f32 	%f134, %f1, %f2;
+	abs.f32 	%f41, %f134;
 	setp.geu.f32	%p35, %f41, 0f7F800000;
-	@%p35 bra 	BB29_62;
+	@%p35 bra 	BB29_68;
 
-	cvt.rmi.f32.f32	%f133, %f133;
-	bra.uni 	BB29_62;
+	cvt.rmi.f32.f32	%f134, %f134;
+	bra.uni 	BB29_68;
 
-BB29_49:
+BB29_55:
 	setp.geu.f32	%p59, %f1, 0f00000000;
-	@%p59 bra 	BB29_52;
+	@%p59 bra 	BB29_58;
 
 	cvt.rzi.f32.f32	%f127, %f2;
 	setp.neu.f32	%p60, %f127, %f2;
-	selp.f32	%f132, 0f7FFFFFFF, %f132, %p60;
+	selp.f32	%f133, 0f7FFFFFFF, %f133, %p60;
 
-BB29_52:
+BB29_58:
 	add.f32 	%f129, %f20, %f21;
 	mov.b32 	 %r35, %f129;
 	setp.lt.s32	%p63, %r35, 2139095040;
-	@%p63 bra 	BB29_59;
+	@%p63 bra 	BB29_65;
 
 	setp.gtu.f32	%p64, %f20, 0f7F800000;
 	setp.gtu.f32	%p65, %f21, 0f7F800000;
 	or.pred  	%p66, %p64, %p65;
-	@%p66 bra 	BB29_58;
-	bra.uni 	BB29_54;
+	@%p66 bra 	BB29_64;
+	bra.uni 	BB29_60;
 
-BB29_58:
-	add.f32 	%f132, %f1, %f2;
-	bra.uni 	BB29_59;
+BB29_64:
+	add.f32 	%f133, %f1, %f2;
+	bra.uni 	BB29_65;
 
-BB29_54:
+BB29_60:
 	setp.eq.f32	%p67, %f21, 0f7F800000;
-	@%p67 bra 	BB29_57;
-	bra.uni 	BB29_55;
+	@%p67 bra 	BB29_63;
+	bra.uni 	BB29_61;
 
-BB29_57:
+BB29_63:
 	setp.gt.f32	%p70, %f20, 0f3F800000;
 	selp.b32	%r39, 2139095040, 0, %p70;
 	xor.b32  	%r40, %r39, 2139095040;
@@ -2857,33 +2861,33 @@ BB29_57:
 	selp.b32	%r41, %r40, %r39, %p71;
 	mov.b32 	 %f130, %r41;
 	setp.eq.f32	%p72, %f1, 0fBF800000;
-	selp.f32	%f132, 0f3F800000, %f130, %p72;
-	bra.uni 	BB29_59;
+	selp.f32	%f133, 0f3F800000, %f130, %p72;
+	bra.uni 	BB29_65;
 
-BB29_55:
+BB29_61:
 	setp.neu.f32	%p68, %f20, 0f7F800000;
-	@%p68 bra 	BB29_59;
+	@%p68 bra 	BB29_65;
 
-	setp.ge.f32	%p69, %f2, 0f00000000;
-	selp.b32	%r36, 2139095040, 0, %p69;
+	setp.ltu.f32	%p69, %f2, 0f00000000;
+	selp.b32	%r36, 0, 2139095040, %p69;
 	or.b32  	%r37, %r36, -2147483648;
 	selp.b32	%r38, %r37, %r36, %p1;
-	mov.b32 	 %f132, %r38;
+	mov.b32 	 %f133, %r38;
 
-BB29_59:
+BB29_65:
 	setp.eq.f32	%p73, %f2, 0f00000000;
 	setp.eq.f32	%p74, %f1, 0f3F800000;
 	or.pred  	%p75, %p74, %p73;
-	selp.f32	%f133, 0f3F800000, %f132, %p75;
+	selp.f32	%f134, 0f3F800000, %f133, %p75;
 
-BB29_62:
+BB29_68:
 	cvta.to.global.u64 	%rd14, %rd3;
 	mul.wide.s32 	%rd15, %r3, 4;
 	add.s64 	%rd16, %rd14, %rd15;
-	st.global.f32 	[%rd16], %f133;
+	st.global.f32 	[%rd16], %f134;
 	bar.sync 	0;
 
-BB29_63:
+BB29_69:
 	ret;
 }
 
@@ -2897,9 +2901,9 @@ BB29_63:
 	.param .u32 matrix_scalar_op_d_param_5
 )
 {
-	.reg .pred 	%p<141>;
-	.reg .b32 	%r<86>;
-	.reg .f64 	%fd<107>;
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<88>;
+	.reg .f64 	%fd<99>;
 	.reg .b64 	%rd<20>;
 
 
@@ -2914,7 +2918,7 @@ BB29_63:
 	mov.u32 	%r11, %tid.x;
 	mad.lo.s32 	%r1, %r9, %r10, %r11;
 	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB30_130;
+	@%p3 bra 	BB30_142;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -2923,9 +2927,9 @@ BB29_63:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB30_66;
+	@%p4 bra 	BB30_72;
 
-	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd94, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p5, %r6, 8;
 	@%p5 bra 	BB30_19;
 
@@ -2936,31 +2940,31 @@ BB29_63:
 	@%p26 bra 	BB30_8;
 
 	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB30_64;
+	@%p29 bra 	BB30_70;
 	bra.uni 	BB30_6;
 
-BB30_64:
-	add.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB30_65;
+BB30_70:
+	add.f64 	%fd94, %fd1, %fd68;
+	bra.uni 	BB30_71;
 
-BB30_66:
-	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p73, %r6, 8;
-	@%p73 bra 	BB30_83;
+BB30_72:
+	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p69, %r6, 8;
+	@%p69 bra 	BB30_89;
 
-	setp.gt.s32	%p87, %r6, 3;
-	@%p87 bra 	BB30_75;
+	setp.gt.s32	%p83, %r6, 3;
+	@%p83 bra 	BB30_81;
 
-	setp.gt.s32	%p94, %r6, 1;
-	@%p94 bra 	BB30_72;
+	setp.gt.s32	%p90, %r6, 1;
+	@%p90 bra 	BB30_78;
 
-	setp.eq.s32	%p97, %r6, 0;
-	@%p97 bra 	BB30_128;
-	bra.uni 	BB30_70;
+	setp.eq.s32	%p93, %r6, 0;
+	@%p93 bra 	BB30_140;
+	bra.uni 	BB30_76;
 
-BB30_128:
-	add.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_140:
+	add.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
 BB30_19:
 	setp.gt.s32	%p6, %r6, 13;
@@ -2970,39 +2974,39 @@ BB30_19:
 	@%p13 bra 	BB30_24;
 
 	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB30_46;
+	@%p17 bra 	BB30_48;
 	bra.uni 	BB30_22;
 
-BB30_46:
-	setp.eq.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB30_65;
+BB30_48:
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB30_71;
 
-BB30_83:
-	setp.gt.s32	%p74, %r6, 13;
-	@%p74 bra 	BB30_92;
+BB30_89:
+	setp.gt.s32	%p70, %r6, 13;
+	@%p70 bra 	BB30_98;
 
-	setp.gt.s32	%p81, %r6, 10;
-	@%p81 bra 	BB30_88;
+	setp.gt.s32	%p77, %r6, 10;
+	@%p77 bra 	BB30_94;
 
-	setp.eq.s32	%p85, %r6, 9;
-	@%p85 bra 	BB30_110;
-	bra.uni 	BB30_86;
+	setp.eq.s32	%p81, %r6, 9;
+	@%p81 bra 	BB30_118;
+	bra.uni 	BB30_92;
 
-BB30_110:
-	setp.eq.f64	%p114, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-	bra.uni 	BB30_129;
+BB30_118:
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB30_141;
 
 BB30_11:
 	setp.gt.s32	%p20, %r6, 5;
 	@%p20 bra 	BB30_15;
 
 	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB30_49;
+	@%p24 bra 	BB30_51;
 	bra.uni 	BB30_13;
 
-BB30_49:
+BB30_51:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd68;
@@ -3015,7 +3019,7 @@ BB30_49:
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p51, %rd2, -9223372036854775808;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -3032,69 +3036,68 @@ BB30_49:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd97, [retval0+0];
+	ld.param.f64	%fd24, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p52, %r2, 0;
-	and.pred  	%p1, %p52, %p51;
-	@!%p1 bra 	BB30_51;
-	bra.uni 	BB30_50;
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;
+	@!%p1 bra 	BB30_53;
+	bra.uni 	BB30_52;
 
-BB30_50:
+BB30_52:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd97;
+	mov.b64 	{%temp, %r26}, %fd24;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd97;
+	mov.b64 	{%r28, %temp}, %fd24;
 	}
-	mov.b64 	%fd97, {%r28, %r27};
+	mov.b64 	%fd24, {%r28, %r27};
 
-BB30_51:
-	mov.f64 	%fd96, %fd97;
-	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
-	@%p53 bra 	BB30_54;
-	bra.uni 	BB30_52;
+BB30_53:
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB30_56;
+	bra.uni 	BB30_54;
 
-BB30_54:
-	selp.b32	%r29, %r2, 0, %p51;
+BB30_56:
+	selp.b32	%r29, %r2, 0, %p49;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p57, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p57;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p55;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd96, {%r32, %r31};
-	bra.uni 	BB30_55;
+	mov.b64 	%fd24, {%r32, %r31};
+	bra.uni 	BB30_57;
 
 BB30_28:
 	setp.gt.s32	%p7, %r6, 15;
 	@%p7 bra 	BB30_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB30_43;
+	@%p11 bra 	BB30_45;
 	bra.uni 	BB30_30;
 
-BB30_43:
+BB30_45:
 	cvt.rni.s64.f64	%rd10, %fd68;
-	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
+	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p43, %r20, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-	bra.uni 	BB30_65;
+	setp.eq.s32	%p41, %r20, 0;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB30_71;
 
-BB30_75:
-	setp.gt.s32	%p88, %r6, 5;
-	@%p88 bra 	BB30_79;
+BB30_81:
+	setp.gt.s32	%p84, %r6, 5;
+	@%p84 bra 	BB30_85;
 
-	setp.eq.s32	%p92, %r6, 4;
-	@%p92 bra 	BB30_113;
-	bra.uni 	BB30_77;
+	setp.eq.s32	%p88, %r6, 4;
+	@%p88 bra 	BB30_121;
+	bra.uni 	BB30_83;
 
-BB30_113:
+BB30_121:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -3103,11 +3106,11 @@ BB30_113:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r61, %r5, 20, 11;
-	add.s32 	%r62, %r61, -1012;
+	bfe.u32 	%r62, %r5, 20, 11;
+	add.s32 	%r63, %r62, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r62;
-	setp.eq.s64	%p119, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r63;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -3124,614 +3127,619 @@ BB30_113:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd105, [retval0+0];
+	ld.param.f64	%fd57, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p120, %r4, 0;
-	and.pred  	%p2, %p120, %p119;
-	@!%p2 bra 	BB30_115;
-	bra.uni 	BB30_114;
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;
+	@!%p2 bra 	BB30_123;
+	bra.uni 	BB30_122;
 
-BB30_114:
+BB30_122:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r63}, %fd105;
+	mov.b64 	{%temp, %r64}, %fd57;
 	}
-	xor.b32  	%r64, %r63, -2147483648;
+	xor.b32  	%r65, %r64, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r65, %temp}, %fd105;
+	mov.b64 	{%r66, %temp}, %fd57;
 	}
-	mov.b64 	%fd105, {%r65, %r64};
+	mov.b64 	%fd57, {%r66, %r65};
 
-BB30_115:
-	mov.f64 	%fd104, %fd105;
-	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
-	@%p121 bra 	BB30_118;
-	bra.uni 	BB30_116;
+BB30_123:
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB30_126;
+	bra.uni 	BB30_124;
 
-BB30_118:
-	selp.b32	%r66, %r4, 0, %p119;
-	or.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p125, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p125;
-	mov.u32 	%r69, 0;
-	mov.b64 	%fd104, {%r69, %r68};
-	bra.uni 	BB30_119;
+BB30_126:
+	selp.b32	%r67, %r4, 0, %p113;
+	or.b32  	%r68, %r67, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r69, %r68, %r67, %p119;
+	mov.u32 	%r70, 0;
+	mov.b64 	%fd57, {%r70, %r69};
+	bra.uni 	BB30_127;
 
-BB30_92:
-	setp.gt.s32	%p75, %r6, 15;
-	@%p75 bra 	BB30_96;
+BB30_98:
+	setp.gt.s32	%p71, %r6, 15;
+	@%p71 bra 	BB30_102;
 
-	setp.eq.s32	%p79, %r6, 14;
-	@%p79 bra 	BB30_107;
-	bra.uni 	BB30_94;
+	setp.eq.s32	%p75, %r6, 14;
+	@%p75 bra 	BB30_115;
+	bra.uni 	BB30_100;
 
-BB30_107:
+BB30_115:
 	cvt.rni.s64.f64	%rd15, %fd1;
+	cvt.u32.u64	%r56, %rd15;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r55, %rd15;
-	cvt.u32.u64	%r56, %rd16;
-	or.b32  	%r57, %r56, %r55;
-	setp.eq.s32	%p111, %r57, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-	bra.uni 	BB30_129;
+	cvt.u32.u64	%r57, %rd16;
+	or.b32  	%r58, %r57, %r56;
+	setp.eq.s32	%p105, %r58, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB30_141;
 
 BB30_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB30_63;
+	@%p27 bra 	BB30_69;
 	bra.uni 	BB30_9;
 
-BB30_63:
-	mul.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB30_65;
+BB30_69:
+	mul.f64 	%fd94, %fd1, %fd68;
+	bra.uni 	BB30_71;
 
 BB30_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB30_45;
+	@%p14 bra 	BB30_47;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB30_44;
+	@%p15 bra 	BB30_46;
 	bra.uni 	BB30_26;
 
-BB30_44:
-	max.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB30_65;
+BB30_46:
+	max.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB30_71;
 
 BB30_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB30_48;
+	@%p21 bra 	BB30_50;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB30_47;
+	@%p22 bra 	BB30_49;
 	bra.uni 	BB30_17;
 
-BB30_47:
-	setp.lt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB30_65;
+BB30_49:
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB30_71;
 
 BB30_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB30_42;
+	@%p8 bra 	BB30_44;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB30_38;
+	@%p9 bra 	BB30_39;
 	bra.uni 	BB30_34;
 
-BB30_38:
-	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
-	or.pred  	%p37, %p35, %p36;
-	mov.f64 	%fd98, 0d7FF8000000000000;
-	@%p37 bra 	BB30_65;
+BB30_39:
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd94, 0d7FF8000000000000;
+	@%p36 bra 	BB30_71;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd72, %fd98;
-	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
-	@%p38 bra 	BB30_65;
+	div.rn.f64 	%fd94, %fd68, %fd1;
+	abs.f64 	%fd72, %fd94;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
+	@%p37 bra 	BB30_71;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r15, %temp}, %fd98;
+	mov.b64 	{%temp, %r15}, %fd94;
 	}
+	and.b32  	%r16, %r15, 2147483647;
+	setp.ne.s32	%p38, %r16, 2146435072;
+	@%p38 bra 	BB30_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r16}, %fd98;
+	mov.b64 	{%r17, %temp}, %fd94;
 	}
-	and.b32  	%r17, %r16, 2147483647;
-	setp.ne.s32	%p39, %r17, 2146435072;
-	setp.ne.s32	%p40, %r15, 0;
-	or.pred  	%p41, %p39, %p40;
-	@!%p41 bra 	BB30_65;
-	bra.uni 	BB30_41;
+	setp.eq.s32	%p39, %r17, 0;
+	@%p39 bra 	BB30_71;
 
-BB30_41:
-	cvt.rmi.f64.f64	%fd73, %fd98;
+BB30_43:
+	cvt.rmi.f64.f64	%fd73, %fd94;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd98, %fd68, %fd74;
-	bra.uni 	BB30_65;
+	sub.f64 	%fd94, %fd68, %fd74;
+	bra.uni 	BB30_71;
 
-BB30_72:
-	setp.eq.s32	%p95, %r6, 2;
-	@%p95 bra 	BB30_127;
-	bra.uni 	BB30_73;
+BB30_78:
+	setp.eq.s32	%p91, %r6, 2;
+	@%p91 bra 	BB30_139;
+	bra.uni 	BB30_79;
 
-BB30_127:
-	mul.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_139:
+	mul.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
-BB30_88:
-	setp.eq.s32	%p82, %r6, 11;
-	@%p82 bra 	BB30_109;
+BB30_94:
+	setp.eq.s32	%p78, %r6, 11;
+	@%p78 bra 	BB30_117;
 
-	setp.eq.s32	%p83, %r6, 12;
-	@%p83 bra 	BB30_108;
-	bra.uni 	BB30_90;
+	setp.eq.s32	%p79, %r6, 12;
+	@%p79 bra 	BB30_116;
+	bra.uni 	BB30_96;
 
-BB30_108:
-	max.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_116:
+	max.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
-BB30_79:
-	setp.eq.s32	%p89, %r6, 6;
-	@%p89 bra 	BB30_112;
+BB30_85:
+	setp.eq.s32	%p85, %r6, 6;
+	@%p85 bra 	BB30_120;
 
-	setp.eq.s32	%p90, %r6, 7;
-	@%p90 bra 	BB30_111;
-	bra.uni 	BB30_81;
+	setp.eq.s32	%p86, %r6, 7;
+	@%p86 bra 	BB30_119;
+	bra.uni 	BB30_87;
 
-BB30_111:
-	setp.gt.f64	%p116, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-	bra.uni 	BB30_129;
+BB30_119:
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB30_141;
 
-BB30_96:
-	setp.eq.s32	%p76, %r6, 16;
-	@%p76 bra 	BB30_106;
+BB30_102:
+	setp.eq.s32	%p72, %r6, 16;
+	@%p72 bra 	BB30_114;
 
-	setp.eq.s32	%p77, %r6, 17;
-	@%p77 bra 	BB30_102;
-	bra.uni 	BB30_98;
+	setp.eq.s32	%p73, %r6, 17;
+	@%p73 bra 	BB30_109;
+	bra.uni 	BB30_104;
 
-BB30_102:
-	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
-	or.pred  	%p105, %p103, %p104;
-	mov.f64 	%fd106, 0d7FF8000000000000;
-	@%p105 bra 	BB30_129;
+BB30_109:
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd98, 0d7FF8000000000000;
+	@%p100 bra 	BB30_141;
 
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd83, %fd106;
-	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
-	@%p106 bra 	BB30_129;
+	div.rn.f64 	%fd98, %fd1, %fd68;
+	abs.f64 	%fd83, %fd98;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
+	@%p101 bra 	BB30_141;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd106;
+	mov.b64 	{%temp, %r53}, %fd98;
 	}
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p102, %r54, 2146435072;
+	@%p102 bra 	BB30_113;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd106;
+	mov.b64 	{%r55, %temp}, %fd98;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p107, %r54, 2146435072;
-	setp.ne.s32	%p108, %r52, 0;
-	or.pred  	%p109, %p107, %p108;
-	@!%p109 bra 	BB30_129;
-	bra.uni 	BB30_105;
-
-BB30_105:
-	cvt.rmi.f64.f64	%fd84, %fd106;
+	setp.eq.s32	%p103, %r55, 0;
+	@%p103 bra 	BB30_141;
+
+BB30_113:
+	cvt.rmi.f64.f64	%fd84, %fd98;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd106, %fd1, %fd85;
-	bra.uni 	BB30_129;
+	sub.f64 	%fd98, %fd1, %fd85;
+	bra.uni 	BB30_141;
 
 BB30_6:
 	setp.eq.s32	%p30, %r6, 1;
 	@%p30 bra 	BB30_7;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_7:
-	sub.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB30_65;
+	sub.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB30_71;
 
 BB30_22:
 	setp.eq.s32	%p18, %r6, 10;
 	@%p18 bra 	BB30_23;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_23:
-	setp.neu.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB30_65;
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB30_71;
 
 BB30_13:
 	setp.eq.s32	%p25, %r6, 5;
 	@%p25 bra 	BB30_14;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_14:
-	setp.gt.f64	%p50, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB30_65;
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB30_71;
 
 BB30_30:
 	setp.eq.s32	%p12, %r6, 15;
 	@%p12 bra 	BB30_31;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd98, %fd77, %fd76;
-	bra.uni 	BB30_65;
+	sub.f64 	%fd94, %fd77, %fd76;
+	bra.uni 	BB30_71;
 
 BB30_9:
 	setp.eq.s32	%p28, %r6, 3;
 	@%p28 bra 	BB30_10;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_10:
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB30_65;
+	div.rn.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB30_71;
 
-BB30_45:
-	min.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB30_65;
+BB30_47:
+	min.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB30_71;
 
 BB30_26:
 	setp.eq.s32	%p16, %r6, 13;
 	@%p16 bra 	BB30_27;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
-	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
+	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p44, %r23, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-	bra.uni 	BB30_65;
+	setp.eq.s32	%p42, %r23, 0;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB30_71;
 
-BB30_48:
-	setp.ge.f64	%p49, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB30_65;
+BB30_50:
+	setp.ltu.f64	%p47, %fd1, %fd68;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p47;
+	bra.uni 	BB30_71;
 
 BB30_17:
 	setp.eq.s32	%p23, %r6, 8;
 	@%p23 bra 	BB30_18;
-	bra.uni 	BB30_65;
+	bra.uni 	BB30_71;
 
 BB30_18:
-	setp.le.f64	%p47, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB30_65;
+	setp.gtu.f64	%p45, %fd1, %fd68;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB30_71;
 
-BB30_42:
-	setp.neu.f64	%p42, %fd68, 0d0000000000000000;
+BB30_44:
+	setp.neu.f64	%p40, %fd68, 0d0000000000000000;
 	sub.f64 	%fd75, %fd68, %fd1;
-	selp.f64	%fd98, %fd75, 0d0000000000000000, %p42;
-	bra.uni 	BB30_65;
+	selp.f64	%fd94, %fd75, 0d0000000000000000, %p40;
+	bra.uni 	BB30_71;
 
 BB30_34:
 	setp.ne.s32	%p10, %r6, 18;
-	@%p10 bra 	BB30_65;
+	@%p10 bra 	BB30_71;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd70, %fd98;
+	div.rn.f64 	%fd94, %fd68, %fd1;
+	abs.f64 	%fd70, %fd94;
 	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
-	@%p31 bra 	BB30_65;
+	@%p31 bra 	BB30_71;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r12, %temp}, %fd98;
+	mov.b64 	{%temp, %r12}, %fd94;
 	}
+	and.b32  	%r13, %r12, 2147483647;
+	setp.ne.s32	%p32, %r13, 2146435072;
+	@%p32 bra 	BB30_38;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r13}, %fd98;
+	mov.b64 	{%r14, %temp}, %fd94;
 	}
-	and.b32  	%r14, %r13, 2147483647;
-	setp.ne.s32	%p32, %r14, 2146435072;
-	setp.ne.s32	%p33, %r12, 0;
-	or.pred  	%p34, %p32, %p33;
-	@!%p34 bra 	BB30_65;
-	bra.uni 	BB30_37;
+	setp.eq.s32	%p33, %r14, 0;
+	@%p33 bra 	BB30_71;
 
-BB30_37:
-	cvt.rmi.f64.f64	%fd98, %fd98;
-	bra.uni 	BB30_65;
+BB30_38:
+	cvt.rmi.f64.f64	%fd94, %fd94;
+	bra.uni 	BB30_71;
 
-BB30_70:
-	setp.eq.s32	%p98, %r6, 1;
-	@%p98 bra 	BB30_71;
-	bra.uni 	BB30_129;
+BB30_76:
+	setp.eq.s32	%p94, %r6, 1;
+	@%p94 bra 	BB30_77;
+	bra.uni 	BB30_141;
 
-BB30_71:
-	sub.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_77:
+	sub.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
-BB30_86:
-	setp.eq.s32	%p86, %r6, 10;
-	@%p86 bra 	BB30_87;
-	bra.uni 	BB30_129;
+BB30_92:
+	setp.eq.s32	%p82, %r6, 10;
+	@%p82 bra 	BB30_93;
+	bra.uni 	BB30_141;
 
-BB30_87:
-	setp.neu.f64	%p113, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
-	bra.uni 	BB30_129;
+BB30_93:
+	setp.neu.f64	%p107, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p107;
+	bra.uni 	BB30_141;
 
-BB30_77:
-	setp.eq.s32	%p93, %r6, 5;
-	@%p93 bra 	BB30_78;
-	bra.uni 	BB30_129;
+BB30_83:
+	setp.eq.s32	%p89, %r6, 5;
+	@%p89 bra 	BB30_84;
+	bra.uni 	BB30_141;
 
-BB30_78:
-	setp.lt.f64	%p118, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
-	bra.uni 	BB30_129;
+BB30_84:
+	setp.lt.f64	%p112, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p112;
+	bra.uni 	BB30_141;
 
-BB30_94:
-	setp.eq.s32	%p80, %r6, 15;
-	@%p80 bra 	BB30_95;
-	bra.uni 	BB30_129;
+BB30_100:
+	setp.eq.s32	%p76, %r6, 15;
+	@%p76 bra 	BB30_101;
+	bra.uni 	BB30_141;
 
-BB30_95:
+BB30_101:
 	mul.f64 	%fd87, %fd1, %fd68;
 	mov.f64 	%fd88, 0d3FF0000000000000;
-	sub.f64 	%fd106, %fd88, %fd87;
-	bra.uni 	BB30_129;
+	sub.f64 	%fd98, %fd88, %fd87;
+	bra.uni 	BB30_141;
 
-BB30_73:
-	setp.eq.s32	%p96, %r6, 3;
-	@%p96 bra 	BB30_74;
-	bra.uni 	BB30_129;
+BB30_79:
+	setp.eq.s32	%p92, %r6, 3;
+	@%p92 bra 	BB30_80;
+	bra.uni 	BB30_141;
 
-BB30_74:
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_80:
+	div.rn.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
-BB30_109:
-	min.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB30_129;
+BB30_117:
+	min.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB30_141;
 
-BB30_90:
-	setp.eq.s32	%p84, %r6, 13;
-	@%p84 bra 	BB30_91;
-	bra.uni 	BB30_129;
+BB30_96:
+	setp.eq.s32	%p80, %r6, 13;
+	@%p80 bra 	BB30_97;
+	bra.uni 	BB30_141;
 
-BB30_91:
+BB30_97:
 	cvt.rni.s64.f64	%rd17, %fd1;
+	cvt.u32.u64	%r59, %rd17;
 	cvt.rni.s64.f64	%rd18, %fd68;
-	cvt.u32.u64	%r58, %rd17;
-	cvt.u32.u64	%r59, %rd18;
-	and.b32  	%r60, %r59, %r58;
-	setp.eq.s32	%p112, %r60, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
-	bra.uni 	BB30_129;
-
-BB30_112:
-	setp.le.f64	%p117, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
-	bra.uni 	BB30_129;
+	cvt.u32.u64	%r60, %rd18;
+	and.b32  	%r61, %r60, %r59;
+	setp.eq.s32	%p106, %r61, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p106;
+	bra.uni 	BB30_141;
 
-BB30_81:
-	setp.eq.s32	%p91, %r6, 8;
-	@%p91 bra 	BB30_82;
-	bra.uni 	BB30_129;
+BB30_120:
+	setp.gtu.f64	%p111, %fd1, %fd68;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p111;
+	bra.uni 	BB30_141;
+
+BB30_87:
+	setp.eq.s32	%p87, %r6, 8;
+	@%p87 bra 	BB30_88;
+	bra.uni 	BB30_141;
 
-BB30_82:
-	setp.ge.f64	%p115, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
-	bra.uni 	BB30_129;
+BB30_88:
+	setp.ltu.f64	%p109, %fd1, %fd68;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p109;
+	bra.uni 	BB30_141;
 
-BB30_106:
-	setp.neu.f64	%p110, %fd1, 0d0000000000000000;
+BB30_114:
+	setp.neu.f64	%p104, %fd1, 0d0000000000000000;
 	sub.f64 	%fd86, %fd1, %fd68;
-	selp.f64	%fd106, %fd86, 0d0000000000000000, %p110;
-	bra.uni 	BB30_129;
+	selp.f64	%fd98, %fd86, 0d0000000000000000, %p104;
+	bra.uni 	BB30_141;
 
-BB30_98:
-	setp.ne.s32	%p78, %r6, 18;
-	@%p78 bra 	BB30_129;
+BB30_104:
+	setp.ne.s32	%p74, %r6, 18;
+	@%p74 bra 	BB30_141;
 
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd81, %fd106;
-	setp.gtu.f64	%p99, %fd81, 0d7FF0000000000000;
-	@%p99 bra 	BB30_129;
+	div.rn.f64 	%fd98, %fd1, %fd68;
+	abs.f64 	%fd81, %fd98;
+	setp.gtu.f64	%p95, %fd81, 0d7FF0000000000000;
+	@%p95 bra 	BB30_141;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r49, %temp}, %fd106;
+	mov.b64 	{%temp, %r50}, %fd98;
 	}
+	and.b32  	%r51, %r50, 2147483647;
+	setp.ne.s32	%p96, %r51, 2146435072;
+	@%p96 bra 	BB30_108;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r50}, %fd106;
+	mov.b64 	{%r52, %temp}, %fd98;
 	}
-	and.b32  	%r51, %r50, 2147483647;
-	setp.ne.s32	%p100, %r51, 2146435072;
-	setp.ne.s32	%p101, %r49, 0;
-	or.pred  	%p102, %p100, %p101;
-	@!%p102 bra 	BB30_129;
-	bra.uni 	BB30_101;
+	setp.eq.s32	%p97, %r52, 0;
+	@%p97 bra 	BB30_141;
 
-BB30_101:
-	cvt.rmi.f64.f64	%fd106, %fd106;
-	bra.uni 	BB30_129;
+BB30_108:
+	cvt.rmi.f64.f64	%fd98, %fd98;
+	bra.uni 	BB30_141;
 
-BB30_52:
-	setp.gt.s32	%p54, %r2, -1;
-	@%p54 bra 	BB30_55;
+BB30_54:
+	setp.gt.s32	%p52, %r2, -1;
+	@%p52 bra 	BB30_57;
 
 	cvt.rzi.f64.f64	%fd78, %fd1;
-	setp.neu.f64	%p55, %fd78, %fd1;
-	selp.f64	%fd96, 0dFFF8000000000000, %fd96, %p55;
+	setp.neu.f64	%p53, %fd78, %fd1;
+	selp.f64	%fd24, 0dFFF8000000000000, %fd24, %p53;
 
-BB30_55:
-	mov.f64 	%fd24, %fd96;
-	add.f64 	%fd25, %fd1, %fd68;
+BB30_57:
+	add.f64 	%fd93, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd25;
+	mov.b64 	{%temp, %r33}, %fd93;
 	}
 	and.b32  	%r34, %r33, 2146435072;
-	setp.ne.s32	%p58, %r34, 2146435072;
-	mov.f64 	%fd95, %fd24;
-	@%p58 bra 	BB30_62;
+	setp.ne.s32	%p56, %r34, 2146435072;
+	@%p56 bra 	BB30_58;
 
-	setp.gtu.f64	%p59, %fd18, 0d7FF0000000000000;
-	mov.f64 	%fd95, %fd25;
-	@%p59 bra 	BB30_62;
+	setp.gtu.f64	%p57, %fd18, 0d7FF0000000000000;
+	@%p57 bra 	BB30_68;
 
 	abs.f64 	%fd79, %fd1;
-	setp.gtu.f64	%p60, %fd79, 0d7FF0000000000000;
-	mov.f64 	%fd94, %fd25;
-	mov.f64 	%fd95, %fd94;
-	@%p60 bra 	BB30_62;
+	setp.gtu.f64	%p58, %fd79, 0d7FF0000000000000;
+	@%p58 bra 	BB30_68;
+
+	and.b32  	%r35, %r3, 2147483647;
+	setp.ne.s32	%p59, %r35, 2146435072;
+	@%p59 bra 	BB30_63;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd1;
-	}
-	and.b32  	%r36, %r3, 2147483647;
-	setp.eq.s32	%p61, %r36, 2146435072;
-	setp.eq.s32	%p62, %r35, 0;
-	and.pred  	%p63, %p61, %p62;
-	@%p63 bra 	BB30_61;
-	bra.uni 	BB30_59;
-
-BB30_61:
-	setp.gt.f64	%p67, %fd18, 0d3FF0000000000000;
-	selp.b32	%r44, 2146435072, 0, %p67;
-	xor.b32  	%r45, %r44, 2146435072;
-	setp.lt.s32	%p68, %r3, 0;
-	selp.b32	%r46, %r45, %r44, %p68;
-	setp.eq.f64	%p69, %fd68, 0dBFF0000000000000;
-	selp.b32	%r47, 1072693248, %r46, %p69;
-	mov.u32 	%r48, 0;
-	mov.b64 	%fd95, {%r48, %r47};
-	bra.uni 	BB30_62;
-
-BB30_116:
-	setp.gt.s32	%p122, %r4, -1;
-	@%p122 bra 	BB30_119;
+	mov.b64 	{%r36, %temp}, %fd1;
+	}
+	setp.eq.s32	%p60, %r36, 0;
+	@%p60 bra 	BB30_67;
 
-	cvt.rzi.f64.f64	%fd89, %fd68;
-	setp.neu.f64	%p123, %fd89, %fd68;
-	selp.f64	%fd104, 0dFFF8000000000000, %fd104, %p123;
+BB30_63:
+	and.b32  	%r37, %r2, 2147483647;
+	setp.ne.s32	%p61, %r37, 2146435072;
+	@%p61 bra 	BB30_64;
 
-BB30_119:
-	mov.f64 	%fd57, %fd104;
-	add.f64 	%fd58, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r70}, %fd58;
+	mov.b64 	{%r38, %temp}, %fd68;
 	}
-	and.b32  	%r71, %r70, 2146435072;
-	setp.ne.s32	%p126, %r71, 2146435072;
-	mov.f64 	%fd103, %fd57;
-	@%p126 bra 	BB30_126;
+	setp.ne.s32	%p62, %r38, 0;
+	mov.f64 	%fd93, %fd24;
+	@%p62 bra 	BB30_68;
 
-	setp.gtu.f64	%p127, %fd51, 0d7FF0000000000000;
-	mov.f64 	%fd103, %fd58;
-	@%p127 bra 	BB30_126;
+	shr.s32 	%r39, %r3, 31;
+	and.b32  	%r40, %r39, -2146435072;
+	add.s32 	%r41, %r40, 2146435072;
+	or.b32  	%r42, %r41, -2147483648;
+	selp.b32	%r43, %r42, %r41, %p1;
+	mov.u32 	%r44, 0;
+	mov.b64 	%fd93, {%r44, %r43};
+	bra.uni 	BB30_68;
 
-	abs.f64 	%fd90, %fd68;
-	setp.gtu.f64	%p128, %fd90, 0d7FF0000000000000;
-	mov.f64 	%fd102, %fd58;
-	mov.f64 	%fd103, %fd102;
-	@%p128 bra 	BB30_126;
+BB30_58:
+	mov.f64 	%fd93, %fd24;
 
+BB30_68:
+	setp.eq.f64	%p66, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p67, %fd68, 0d3FF0000000000000;
+	or.pred  	%p68, %p67, %p66;
+	selp.f64	%fd94, 0d3FF0000000000000, %fd93, %p68;
+
+BB30_71:
+	st.global.f64 	[%rd1], %fd94;
+	bra.uni 	BB30_142;
+
+BB30_124:
+	setp.gt.s32	%p116, %r4, -1;
+	@%p116 bra 	BB30_127;
+
+	cvt.rzi.f64.f64	%fd89, %fd68;
+	setp.neu.f64	%p117, %fd89, %fd68;
+	selp.f64	%fd57, 0dFFF8000000000000, %fd57, %p117;
+
+BB30_127:
+	add.f64 	%fd97, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r72, %temp}, %fd68;
+	mov.b64 	{%temp, %r71}, %fd97;
 	}
+	and.b32  	%r72, %r71, 2146435072;
+	setp.ne.s32	%p120, %r72, 2146435072;
+	@%p120 bra 	BB30_128;
+
+	setp.gtu.f64	%p121, %fd51, 0d7FF0000000000000;
+	@%p121 bra 	BB30_138;
+
+	abs.f64 	%fd90, %fd68;
+	setp.gtu.f64	%p122, %fd90, 0d7FF0000000000000;
+	@%p122 bra 	BB30_138;
+
 	and.b32  	%r73, %r5, 2147483647;
-	setp.eq.s32	%p129, %r73, 2146435072;
-	setp.eq.s32	%p130, %r72, 0;
-	and.pred  	%p131, %p129, %p130;
-	@%p131 bra 	BB30_125;
-	bra.uni 	BB30_123;
-
-BB30_125:
-	setp.gt.f64	%p135, %fd51, 0d3FF0000000000000;
-	selp.b32	%r81, 2146435072, 0, %p135;
-	xor.b32  	%r82, %r81, 2146435072;
-	setp.lt.s32	%p136, %r5, 0;
-	selp.b32	%r83, %r82, %r81, %p136;
-	setp.eq.f64	%p137, %fd1, 0dBFF0000000000000;
-	selp.b32	%r84, 1072693248, %r83, %p137;
-	mov.u32 	%r85, 0;
-	mov.b64 	%fd103, {%r85, %r84};
-	bra.uni 	BB30_126;
-
-BB30_59:
+	setp.ne.s32	%p123, %r73, 2146435072;
+	@%p123 bra 	BB30_133;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r37, %temp}, %fd68;
+	mov.b64 	{%r74, %temp}, %fd68;
 	}
-	and.b32  	%r38, %r2, 2147483647;
-	setp.eq.s32	%p64, %r38, 2146435072;
-	setp.eq.s32	%p65, %r37, 0;
-	and.pred  	%p66, %p64, %p65;
-	mov.f64 	%fd95, %fd24;
-	@!%p66 bra 	BB30_62;
-	bra.uni 	BB30_60;
-
-BB30_60:
-	shr.s32 	%r39, %r3, 31;
-	and.b32  	%r40, %r39, -2146435072;
-	selp.b32	%r41, -1048576, 2146435072, %p1;
-	add.s32 	%r42, %r41, %r40;
-	mov.u32 	%r43, 0;
-	mov.b64 	%fd95, {%r43, %r42};
-
-BB30_62:
-	setp.eq.f64	%p70, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p71, %fd68, 0d3FF0000000000000;
-	or.pred  	%p72, %p71, %p70;
-	selp.f64	%fd98, 0d3FF0000000000000, %fd95, %p72;
+	setp.eq.s32	%p124, %r74, 0;
+	@%p124 bra 	BB30_137;
 
-BB30_65:
-	st.global.f64 	[%rd1], %fd98;
-	bra.uni 	BB30_130;
+BB30_133:
+	and.b32  	%r75, %r4, 2147483647;
+	setp.ne.s32	%p125, %r75, 2146435072;
+	@%p125 bra 	BB30_134;
 
-BB30_123:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r74, %temp}, %fd1;
+	mov.b64 	{%r76, %temp}, %fd1;
 	}
-	and.b32  	%r75, %r4, 2147483647;
-	setp.eq.s32	%p132, %r75, 2146435072;
-	setp.eq.s32	%p133, %r74, 0;
-	and.pred  	%p134, %p132, %p133;
-	mov.f64 	%fd103, %fd57;
-	@!%p134 bra 	BB30_126;
-	bra.uni 	BB30_124;
+	setp.ne.s32	%p126, %r76, 0;
+	mov.f64 	%fd97, %fd57;
+	@%p126 bra 	BB30_138;
 
-BB30_124:
-	shr.s32 	%r76, %r5, 31;
-	and.b32  	%r77, %r76, -2146435072;
-	selp.b32	%r78, -1048576, 2146435072, %p2;
-	add.s32 	%r79, %r78, %r77;
-	mov.u32 	%r80, 0;
-	mov.b64 	%fd103, {%r80, %r79};
+	shr.s32 	%r77, %r5, 31;
+	and.b32  	%r78, %r77, -2146435072;
+	add.s32 	%r79, %r78, 2146435072;
+	or.b32  	%r80, %r79, -2147483648;
+	selp.b32	%r81, %r80, %r79, %p2;
+	mov.u32 	%r82, 0;
+	mov.b64 	%fd97, {%r82, %r81};
+	bra.uni 	BB30_138;
 
-BB30_126:
-	setp.eq.f64	%p138, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p139, %fd1, 0d3FF0000000000000;
-	or.pred  	%p140, %p139, %p138;
-	selp.f64	%fd106, 0d3FF0000000000000, %fd103, %p140;
+BB30_128:
+	mov.f64 	%fd97, %fd57;
+
+BB30_138:
+	setp.eq.f64	%p130, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p131, %fd1, 0d3FF0000000000000;
+	or.pred  	%p132, %p131, %p130;
+	selp.f64	%fd98, 0d3FF0000000000000, %fd97, %p132;
 
-BB30_129:

<TRUNCATED>