You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/08/24 21:41:32 UTC
[2/5] systemml git commit: [SYSTEMML-1793] Support matrix range indexing on GPU

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 62c0e0d..92a5546 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -112,6 +112,7 @@ import org.apache.sysml.runtime.matrix.operators.CMOperator;
 import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
 import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysml.runtime.util.IndexRange;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
@@ -148,7 +149,7 @@ public class LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
 
-    // Assume Compute Capability 3.0
+	// Assume Compute Capability 3.0
 	// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
 	// MAX_THREADS is 1024 For compute capability > 3.0
 	private static int _MAX_THREADS = -1;
@@ -163,7 +164,7 @@ public class LibMatrixCUDA {
 	static GPUContext gCtx throws DMLRuntimeException {
 			return GPUContext.gCtx;
 	}
-	*/
+	 */
 
 	/**
 	 * Utility function to get maximum number of threads supported by the active CUDA device.
@@ -336,7 +337,6 @@ public class LibMatrixCUDA {
 	 * @return a sparse matrix pointer
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	@SuppressWarnings("unused")
 	private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
 		if(!isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).denseToSparse();
@@ -405,7 +405,7 @@ public class LibMatrixCUDA {
 		biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1);
 
 		cudaFreeHelper(tmp);
-		*/
+		 */
 		LOG.trace("GPU : conv2dBiasAdd" + ", GPUContext=" + gCtx);
 		conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		//cudaDeviceSynchronize;
@@ -413,7 +413,7 @@ public class LibMatrixCUDA {
 	}
 
 	public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
-														int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
 					throws DMLRuntimeException {
 		Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
 		Pointer filterPointer = getDensePointer(gCtx, filter, true, instName);
@@ -448,10 +448,10 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if error
 	 */
 	public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
-														 int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+			int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
 					throws DMLRuntimeException {
-        LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
-        cudnnFilterDescriptor filterDesc = null;
+		LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor filterDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 		Pointer workSpace = null;
 		long sizeInBytes = 0;
@@ -480,7 +480,7 @@ public class LibMatrixCUDA {
 				// Also ensure that GPU has enough memory to allocate memory
 				long sizeInBytesArray[] = {0};
 				jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
-								CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
+						CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
 				cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray);
 				if (sizeInBytesArray[0] != 0)
 					workSpace = gCtx.allocate(sizeInBytesArray[0]);
@@ -494,10 +494,10 @@ public class LibMatrixCUDA {
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
 			int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
-							srcTensorDesc, image,
-							filterDesc, filter,
-							convDesc, algo, workSpace, sizeInBytes, zero(),
-							dstTensorDesc, output);
+					srcTensorDesc, image,
+					filterDesc, filter,
+					convDesc, algo, workSpace, sizeInBytes, zero(),
+					dstTensorDesc, output);
 			if (GPUStatistics.DISPLAY_STATISTICS)
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
 			if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -564,8 +564,8 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
-        LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
-        long rows = input.getNumRows();
+		LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
+		long rows = input.getNumRows();
 		long cols = input.getNumColumns();
 		Pointer imagePointer = getDensePointer(gCtx, input, instName);
 		Pointer doutPointer = getDensePointer(gCtx, dout, instName);
@@ -574,8 +574,8 @@ public class LibMatrixCUDA {
 		long t1=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("relu_backward",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
-						imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols));
+				ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+				imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols));
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
 	}
@@ -593,8 +593,8 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
-        LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
-        if(isInSparseFormat(gCtx, input)) {
+		LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
+		if(isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).sparseToDense(instName);
 		}
 		if(isInSparseFormat(gCtx, bias)) {
@@ -613,8 +613,8 @@ public class LibMatrixCUDA {
 		long t1 = 0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("bias_multiply",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
-						imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
+				ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+				imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 
 	}
@@ -660,13 +660,13 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException
 	 */
 	private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
-        LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
-        int PQ = cols / k;
+		LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
+		int PQ = cols / k;
 		long t1 = 0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("bias_add",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
-						image, bias, output, rows, cols, PQ);
+				ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
+				image, bias, output, rows, cols, PQ);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 	}
 
@@ -701,8 +701,8 @@ public class LibMatrixCUDA {
 	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
 			MatrixObject ret, double epsilon) throws DMLRuntimeException {
-        LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
-        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
 		int N = toInt(image.getNumRows());
 		int C = toInt(scale.getNumColumns());
@@ -724,8 +724,8 @@ public class LibMatrixCUDA {
 
 		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
-			scaleTensorDesc, scalePtr, biasPtr,
-			runningMeanPtr, runningVarPtr, epsilon));
+				scaleTensorDesc, scalePtr, biasPtr,
+				runningMeanPtr, runningVarPtr, epsilon));
 	}
 
 	/**
@@ -747,8 +747,8 @@ public class LibMatrixCUDA {
 	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
 			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
-        LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
-        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
 		int N = toInt(image.getNumRows());
 		int C = toInt(scale.getNumColumns());
@@ -777,8 +777,8 @@ public class LibMatrixCUDA {
 		// ignoring resultSaveMean and resultSaveVariance as it requires state management
 		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
-			scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
-			retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
+				scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
+				retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
 	}
 
 	/**
@@ -852,8 +852,8 @@ public class LibMatrixCUDA {
 	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
 			MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
 			double epsilon) throws DMLRuntimeException {
-        LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
-        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
 		int N = toInt(image.getNumRows());
 		int C = toInt(scale.getNumColumns());
@@ -902,11 +902,11 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-																					MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-																					int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-																					int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
-        cudnnFilterDescriptor dwDesc = null;
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor dwDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 
 		Pointer workSpace = null;
@@ -934,13 +934,13 @@ public class LibMatrixCUDA {
 
 			workSpace = new Pointer();
 			cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
-							xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+					xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
 			if (GPUStatistics.DISPLAY_STATISTICS)
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
 			int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
-							doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
+					doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
 			if (GPUStatistics.DISPLAY_STATISTICS)
 				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
 
@@ -948,7 +948,7 @@ public class LibMatrixCUDA {
 				throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
 			}
 		} catch (CudaException e) {
-				throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
 		} finally {
 			long t3=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
@@ -989,11 +989,11 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
-																				MatrixObject output, int N, int C, int H, int W, int K, int R,
-																				int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-																				int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
-        cudnnFilterDescriptor wDesc = null;
+			MatrixObject output, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
+		cudnnFilterDescriptor wDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 
 		Pointer workSpace = null;
@@ -1020,12 +1020,12 @@ public class LibMatrixCUDA {
 			int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
 			workSpace = new Pointer();
 			cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
-							wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
+					wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
 			int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
-							dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
+					dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
 
 			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -1074,7 +1074,7 @@ public class LibMatrixCUDA {
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
-        Pointer x = getDensePointer(gCtx, image, true, instName);
+		Pointer x = getDensePointer(gCtx, image, true, instName);
 		cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
 		performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 	}
@@ -1083,8 +1083,8 @@ public class LibMatrixCUDA {
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
-        Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
+		LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+		Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
 		cudnnPoolingDescriptor poolingDesc = null;
 
 		try {
@@ -1138,11 +1138,11 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-																				MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-																				int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-																				int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
-        Pointer y = null;
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+		Pointer y = null;
 		cudnnPoolingDescriptor poolingDesc = null;
 
 		try {
@@ -1202,8 +1202,8 @@ public class LibMatrixCUDA {
 	private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
 		long t0=0;
 		try {
-            LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
-            cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
+			LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
+			cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
 
 			Pointer srcData = getDensePointer(gCtx, in, true, instName);
 			cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
@@ -1212,8 +1212,8 @@ public class LibMatrixCUDA {
 			cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
-							one(), srcTensorDesc, srcData,
-							zero(), dstTensorDesc, dstData);
+					one(), srcTensorDesc, srcData,
+					zero(), dstTensorDesc, dstData);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
 		} catch (CudaException e) {
 			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
@@ -1245,14 +1245,14 @@ public class LibMatrixCUDA {
 		long t0=0;
 		cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
 		if(N*CHW >= numDoublesIn2GB ||  srcTensorDesc == null) {
-            LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
-            // Invokes relu(double* A,  double* ret, int rlen, int clen)
+			LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
+			// Invokes relu(double* A,  double* ret, int rlen, int clen)
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			Pointer dstData = getDensePointer(gCtx, output, instName);
 			Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
 			getCudaKernels(gCtx).launchKernel("relu",
-							ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
-							srcData, dstData, toInt(N), toInt(CHW));
+					ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+					srcData, dstData, toInt(N), toInt(CHW));
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
 		}
 		else {
@@ -1287,7 +1287,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName,
-																 boolean isLeftTransposed) throws DMLRuntimeException {
+			boolean isLeftTransposed) throws DMLRuntimeException {
 		LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
@@ -1303,7 +1303,7 @@ public class LibMatrixCUDA {
 		// Note: the dimensions are swapped
 		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
 		int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
-				
+
 		// For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix
 		MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m);	// Allocated the dense output matrix
 
@@ -1343,8 +1343,8 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
-        LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
-        if(isInSparseFormat(gCtx, ret)) {
+		LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
+		if(isInSparseFormat(gCtx, ret)) {
 			throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
 		}
 		if(ret.getNumRows() != ret.getNumColumns()) {
@@ -1352,8 +1352,8 @@ public class LibMatrixCUDA {
 		}
 		int dim = toInt(ret.getNumRows());
 		getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
-						getDensePointer(gCtx, ret, instName), dim, dim*dim);
+				ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
+				getDensePointer(gCtx, ret, instName), dim, dim*dim);
 	}
 
 
@@ -1389,7 +1389,7 @@ public class LibMatrixCUDA {
 	 * @return output of matrix multiply
 	 */
 	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
-																		 boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
@@ -1400,7 +1400,7 @@ public class LibMatrixCUDA {
 		boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
 
 		MatrixObject output = ec.getMatrixObject(outputName);
-		
+
 		long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
 		long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
 
@@ -1436,7 +1436,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-																						boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
 
 		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
 		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
@@ -1476,15 +1476,15 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
-                                             boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
+			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
 					throws DMLRuntimeException {
 		// right sparse, left dense
 		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 		Pointer ADense = getDensePointer(gCtx, left, instName);
 		if (B.isUltraSparse(k, n)){
-            LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
+			LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
 
-            // Convert left to CSR and do cuSparse matmul
+			// Convert left to CSR and do cuSparse matmul
 			int rowsA = (int)left.getNumRows();
 			int colsA = (int)left.getNumColumns();
 
@@ -1497,8 +1497,8 @@ public class LibMatrixCUDA {
 			CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
 
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
+			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
 			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
@@ -1507,7 +1507,7 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
 		} else {
-            LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
+			LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
 			// Convert right to dense and do a cuBlas matmul
 			// BDenseTransposed is a column major matrix
 			// Note the arguments to denseDenseMatmult to accommodate for this.
@@ -1515,8 +1515,8 @@ public class LibMatrixCUDA {
 			if (DMLScript.STATISTICS) t0 = System.nanoTime();
 			Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
+			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 			boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
@@ -1550,7 +1550,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-																					 boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
+			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
 					throws DMLRuntimeException {
 		CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 		Pointer BDense = getDensePointer(gCtx, right, instName);
@@ -1577,8 +1577,8 @@ public class LibMatrixCUDA {
 				CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
 				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
 
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
+				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
 
 				sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
 
@@ -1588,15 +1588,15 @@ public class LibMatrixCUDA {
 				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
 			} else {
-                LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
+				LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
 				// Convert left to dense and do a cuBlas matmul
 				// ADenseTransposed is a column major matrix
 				// Note the arguments to denseDenseMatmult to accommodate for this.
 				if (DMLScript.STATISTICS) t0 = System.nanoTime();
 				Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
 				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
+				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
 
 				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 				boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
@@ -1629,13 +1629,13 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed,
-																										int m, int k) throws DMLRuntimeException {
-        LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
-        int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+			int m, int k) throws DMLRuntimeException {
+		LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
+		int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
 		long size = m * Sizeof.DOUBLE;
 		if (isATranposed){
 			size = k * Sizeof.DOUBLE;
-            transA = CUSPARSE_OPERATION_TRANSPOSE;
+			transA = CUSPARSE_OPERATION_TRANSPOSE;
 		}
 		Pointer C_dense = gCtx.allocate(instName, (int)size);
 		long t1=0;
@@ -1662,8 +1662,8 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-																					boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-        int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
 		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
 		int k = toInt(isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
 		int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
@@ -1701,7 +1701,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
-																							 CSRPointer A, CSRPointer B) throws DMLRuntimeException {
+			CSRPointer A, CSRPointer B) throws DMLRuntimeException {
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
@@ -1726,11 +1726,11 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output,
-                                              boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
-        LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
+			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
+		LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
 
-        int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-        int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+		int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+		int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
 
 		long t0=0, t1=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
@@ -1741,9 +1741,9 @@ public class LibMatrixCUDA {
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
-						A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
-						B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
-						C.descr, C.val, C.rowPtr, C.colInd);
+				A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
+				B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
+				C.descr, C.val, C.rowPtr, C.colInd);
 		//cudaDeviceSynchronize;
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
 	}
@@ -1762,7 +1762,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-																					boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
 
 		Pointer leftPtr = getDensePointer(gCtx, left, instName);
 		Pointer rightPtr = getDensePointer(gCtx, right, instName);
@@ -1773,7 +1773,7 @@ public class LibMatrixCUDA {
 		int rightCols = toInt(right.getNumColumns());
 		Pointer C = getDensePointer(gCtx, output, instName);
 		denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed,
-						leftPtr, rightPtr);
+				leftPtr, rightPtr);
 	}
 
 	/**
@@ -1799,9 +1799,9 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
-																			 int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
+			int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
 					throws DMLRuntimeException {
-        LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
+		LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
 
 		Pointer A = rightPtr;
 		Pointer B = leftPtr;
@@ -1892,7 +1892,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if {@link DMLRuntimeException} occurs
 	 */
 	public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op)
-					throws DMLRuntimeException {
+			throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
@@ -1955,12 +1955,12 @@ public class LibMatrixCUDA {
 		} else if (aggOp.increOp.fn instanceof Builtin) {
 			Builtin b = (Builtin)aggOp.increOp.fn;
 			switch(b.bFunc) {
-				case MAX: opIndex = OP_MAX; break;
-				case MIN: opIndex = OP_MIN; break;
-				case MAXINDEX: opIndex = OP_MAXINDEX; break;
-				case MININDEX: opIndex = OP_MININDEX;break;
-				default:
-					new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
+			case MAX: opIndex = OP_MAX; break;
+			case MIN: opIndex = OP_MIN; break;
+			case MAXINDEX: opIndex = OP_MAXINDEX; break;
+			case MININDEX: opIndex = OP_MININDEX;break;
+			default:
+				new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
 			}
 		} else {
 			throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
@@ -1980,7 +1980,7 @@ public class LibMatrixCUDA {
 			// throw new DMLRuntimeException("Internal Error - Not implemented");
 
 		}
-		
+
 		long outRLen = -1;
 		long outCLen = -1;
 		if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...}
@@ -2004,210 +2004,210 @@ public class LibMatrixCUDA {
 
 		// For scalars, set the scalar output in the Execution Context object
 		switch (opIndex){
-			case OP_PLUS: {
-				switch(reductionDirection) {
-					case REDUCTION_ALL : {
-						double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
-						ec.setScalarOutput(output, new DoubleObject(result));
-						break;
-					}
-					case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
-						reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_ROW : {
-						reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_DIAG :
-						throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
-				}
+		case OP_PLUS: {
+			switch(reductionDirection) {
+			case REDUCTION_ALL : {
+				double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+				ec.setScalarOutput(output, new DoubleObject(result));
 				break;
 			}
-			case OP_PLUS_SQ : {
-				// Calculate the squares in a temporary object tmp
-				Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
-
-				squareMatrix(gCtx, instName, in, tmp, rlen, clen);
-				// Then do the sum on the temporary object and free it
-				switch(reductionDirection) {
-					case REDUCTION_ALL : {
-						double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
-						ec.setScalarOutput(output, new DoubleObject(result));
-						break;
-					}
-					case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
-						reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_ROW : {
-						reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
-				}
-				gCtx.cudaFreeHelper(instName, tmp);
+			case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
+				reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
 				break;
 			}
-			case OP_MEAN:{
-				switch(reductionDirection) {
-					case REDUCTION_ALL: {
-						double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
-						double mean = result / size;
-						ec.setScalarOutput(output, new DoubleObject(mean));
-						break;
-					}
-					case REDUCTION_COL: {
-						reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_ROW: {
-						reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
-				}
+			case REDUCTION_ROW : {
+				reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
 				break;
 			}
-			case OP_MULTIPLY : {
-				switch (reductionDirection) {
-					case REDUCTION_ALL: {
-						double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
-						ec.setScalarOutput(output, new DoubleObject(result));
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
-				}
+			case REDUCTION_DIAG :
+				throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
+			}
+			break;
+		}
+		case OP_PLUS_SQ : {
+			// Calculate the squares in a temporary object tmp
+			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+
+			squareMatrix(gCtx, instName, in, tmp, rlen, clen);
+			// Then do the sum on the temporary object and free it
+			switch(reductionDirection) {
+			case REDUCTION_ALL : {
+				double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
+				ec.setScalarOutput(output, new DoubleObject(result));
 				break;
 			}
-			case OP_MAX :{
-				switch(reductionDirection) {
-					case REDUCTION_ALL: {
-						double result = reduceAll(gCtx, instName, "reduce_max", in, size);
-						ec.setScalarOutput(output, new DoubleObject(result));
-						break;
-					}
-					case REDUCTION_COL: {
-						reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_ROW: {
-						reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
-				}
+			case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
+				reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
 				break;
 			}
-			case OP_MIN :{
-				switch(reductionDirection) {
-					case REDUCTION_ALL: {
-						double result = reduceAll(gCtx, instName, "reduce_min", in, size);
-						ec.setScalarOutput(output, new DoubleObject(result));
-						break;
-					}
-					case REDUCTION_COL: {
-						reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
-						break;
-					}
-					case REDUCTION_ROW: {
-						reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
-				}
+			case REDUCTION_ROW : {
+				reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
+				break;
+			}
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
+			}
+			gCtx.cudaFreeHelper(instName, tmp);
+			break;
+		}
+		case OP_MEAN:{
+			switch(reductionDirection) {
+			case REDUCTION_ALL: {
+				double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+				double mean = result / size;
+				ec.setScalarOutput(output, new DoubleObject(mean));
+				break;
+			}
+			case REDUCTION_COL: {
+				reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
+				break;
+			}
+			case REDUCTION_ROW: {
+				reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
+				break;
+			}
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
+			}
+			break;
+		}
+		case OP_MULTIPLY : {
+			switch (reductionDirection) {
+			case REDUCTION_ALL: {
+				double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
+				ec.setScalarOutput(output, new DoubleObject(result));
+				break;
+			}
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
+			}
+			break;
+		}
+		case OP_MAX :{
+			switch(reductionDirection) {
+			case REDUCTION_ALL: {
+				double result = reduceAll(gCtx, instName, "reduce_max", in, size);
+				ec.setScalarOutput(output, new DoubleObject(result));
+				break;
+			}
+			case REDUCTION_COL: {
+				reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
+				break;
+			}
+			case REDUCTION_ROW: {
+				reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
+				break;
+			}
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
+			}
+			break;
+		}
+		case OP_MIN :{
+			switch(reductionDirection) {
+			case REDUCTION_ALL: {
+				double result = reduceAll(gCtx, instName, "reduce_min", in, size);
+				ec.setScalarOutput(output, new DoubleObject(result));
 				break;
 			}
-			case OP_VARIANCE : {
-				// Temporary GPU array for
-				Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
-				Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			case REDUCTION_COL: {
+				reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
+				break;
+			}
+			case REDUCTION_ROW: {
+				reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
+				break;
+			}
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
+			}
+			break;
+		}
+		case OP_VARIANCE : {
+			// Temporary GPU array for
+			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
 
-				switch(reductionDirection) {
+			switch(reductionDirection) {
 
-					case REDUCTION_ALL: {
-						double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
-						double mean = result / size;
+			case REDUCTION_ALL: {
+				double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+				double mean = result / size;
 
-						// Subtract mean from every element in the matrix
-						ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
-						matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
+				// Subtract mean from every element in the matrix
+				ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
+				matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
 
-						squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-						double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
-						double variance = result2 / (size - 1);
-						ec.setScalarOutput(output, new DoubleObject(variance));
+				double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
+				double variance = result2 / (size - 1);
+				ec.setScalarOutput(output, new DoubleObject(variance));
 
-						break;
-					}
-					case REDUCTION_COL: {
-						reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
-						// Subtract the row-wise mean from every element in the matrix
-						BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
-						matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
+				break;
+			}
+			case REDUCTION_COL: {
+				reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
+				// Subtract the row-wise mean from every element in the matrix
+				BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+				matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
 
-						squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-						Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
-						reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
+				Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
+				reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
-						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
-						matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
+				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
+				matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
 
-						gCtx.cudaFreeHelper(instName, tmpRow);
+				gCtx.cudaFreeHelper(instName, tmpRow);
 
-						break;
-					}
-					case REDUCTION_ROW: {
-						reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
-						// Subtract the columns-wise mean from every element in the matrix
-						BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
-						matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
+				break;
+			}
+			case REDUCTION_ROW: {
+				reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
+				// Subtract the columns-wise mean from every element in the matrix
+				BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+				matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
 
-						squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-						Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
-						reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
+				Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
+				reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
-						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
-						matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
+				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
+				matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
 
-						gCtx.cudaFreeHelper(instName, tmpCol);
+				gCtx.cudaFreeHelper(instName, tmpCol);
 
-						break;
-					}
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
-				}
-				gCtx.cudaFreeHelper(instName, tmp);
-				gCtx.cudaFreeHelper(instName, tmp2);
 				break;
 			}
-			case OP_MAXINDEX : {
-				switch(reductionDirection) {
-					case REDUCTION_COL:
-						throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
-				}
-				// break;
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
 			}
-			case OP_MININDEX : {
-				switch(reductionDirection) {
-					case REDUCTION_COL:
-						throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
-					default:
-						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
-				}
-				// break;
+			gCtx.cudaFreeHelper(instName, tmp);
+			gCtx.cudaFreeHelper(instName, tmp2);
+			break;
+		}
+		case OP_MAXINDEX : {
+			switch(reductionDirection) {
+			case REDUCTION_COL:
+				throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
+			}
+			// break;
+		}
+		case OP_MININDEX : {
+			switch(reductionDirection) {
+			case REDUCTION_COL:
+				throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
+			default:
+				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
 			}
-			default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
+			// break;
+		}
+		default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
 		}
 	}
 
@@ -2222,7 +2222,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if error
 	 */
 	private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
-        ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
+		ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
 		matrixScalarOp(gCtx, instName, in, 2, rlen, clen, out, power2op);
 	}
 
@@ -2236,9 +2236,9 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
-        LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
+		LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
 
-        int[] tmp = getKernelParamsForReduceAll(gCtx, n);
+		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
 		Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
@@ -2256,7 +2256,7 @@ public class LibMatrixCUDA {
 			blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
 			getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
-							tempOut, tempOut, s);
+					tempOut, tempOut, s);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
 			s = (s + (threads*2-1)) / (threads*2);
 		}
@@ -2282,15 +2282,15 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
-        LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
+		LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
 
-        int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
+		int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
-						in, out, rows, cols);
+				in, out, rows, cols);
 		//cudaDeviceSynchronize;
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
 
@@ -2308,15 +2308,15 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
-        LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
+		LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
 
-        int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
+		int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
-						in, out, rows, cols);
+				in, out, rows, cols);
 		//cudaDeviceSynchronize;
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
 	}
@@ -2448,51 +2448,51 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		double constant = op.getConstant();
 		LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx);
-		
+
 		int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows();
 		int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) in.getNumColumns();
-		
+
 		//boolean isCUDALibAvailable = (op.fn instanceof Multiply
 		//		|| (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in);
 		//if(!isCUDALibAvailable) {
-			if(constant == 0) {
-                if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
-					deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
-				}
-				else if(op.fn instanceof Multiply || op.fn instanceof And) {
-					setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
-				}
-				else if(op.fn instanceof Power) {
-					setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
-				}
-                // TODO:
-                // x/0.0 is either +Infinity or -Infinity according to Java.
-                // In the context of a matrix, different elements of the matrix
-                // could have different values.
-                // If the IEEE 754 standard defines otherwise, this logic needs
-                // to be re-enabled and the Java computation logic for divide by zero
-                // needs to be revisited
-                //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
-                //	setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
-                //}
-                //else if(op.fn instanceof Divide) {
-                //	//For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
-                //	compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
-                //}
-                else {
-					// TODO: Potential to optimize
-					matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
-				}
+		if(constant == 0) {
+			if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
+				deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
 			}
-			else if(constant == 1.0 && op.fn instanceof Or) {
-				setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
+			else if(op.fn instanceof Multiply || op.fn instanceof And) {
+				setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
 			}
-			else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
-				deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
+			else if(op.fn instanceof Power) {
+				setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
 			}
+			// TODO:
+				// x/0.0 is either +Infinity or -Infinity according to Java.
+			// In the context of a matrix, different elements of the matrix
+			// could have different values.
+			// If the IEEE 754 standard defines otherwise, this logic needs
+			// to be re-enabled and the Java computation logic for divide by zero
+			// needs to be revisited
+			//else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
+			//	setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
+			//}
+			//else if(op.fn instanceof Divide) {
+			//	//For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
+			//	compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
+			//}
 			else {
+				// TODO: Potential to optimize
 				matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
 			}
+		}
+		else if(constant == 1.0 && op.fn instanceof Or) {
+			setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
+		}
+		else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
+			deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
+		}
+		else {
+			matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
+		}
 		// }
 		//else {
 		//	double alpha = 0;
@@ -2506,8 +2506,8 @@ public class LibMatrixCUDA {
 		//		throw new DMLRuntimeException("Unsupported op");
 		//	}
 
-			// TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
-			// C = alpha* op( A ) + beta* op ( B )
+		// TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
+		// C = alpha* op( A ) + beta* op ( B )
 		//	dgeam(ec, gCtx, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0);
 		//}
 	}
@@ -2563,15 +2563,15 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
-																						String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+			String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(gCtx, in1) && !isSparseAndEmpty(gCtx, in2) && !isVector(in1) && !isVector(in2);
 		if(!isCUDALibAvailable) {
-            matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
+			matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
 		}
 		else {
-            double alpha;
+			double alpha;
 			double beta;
 			if(op.fn instanceof Plus) {
 				alpha = 1.0;
@@ -2602,7 +2602,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed,
-																		 ScalarOperator op) throws DMLRuntimeException {
+			ScalarOperator op) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		if(isInputTransposed)
@@ -2638,11 +2638,11 @@ public class LibMatrixCUDA {
 		int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
 		int size = rlenA * clenA;
 		long t0=0;
-        if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-            getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
-                            ExecutionConfig.getConfigForSimpleVectorOperations(size),
-                            a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
-            if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
+		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+		getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
+				ExecutionConfig.getConfigForSimpleVectorOperations(size),
+				a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
+		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
 	}
 
 	/**
@@ -2660,7 +2660,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
-																		 String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+			String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		boolean isEmpty1 = isSparseAndEmpty(gCtx, in1);
@@ -2671,7 +2671,7 @@ public class LibMatrixCUDA {
 		int clenB = toInt(in2.getNumColumns());
 		int vecStatusA = getVectorStatus(rlenA, clenA).code();
 		int vecStatusB = getVectorStatus(rlenB, clenB).code();
-		
+
 		if(isLeftTransposed || isRightTransposed) {
 			throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
 		}
@@ -2742,8 +2742,8 @@ public class LibMatrixCUDA {
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op",
-            ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
-						a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
+				ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
+				a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0);
 	}
 
@@ -2816,7 +2816,7 @@ public class LibMatrixCUDA {
 
 	@SuppressWarnings("unused")
 	private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal,  double tolerance,
-																		double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
+			double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse kernel
@@ -2825,14 +2825,14 @@ public class LibMatrixCUDA {
 		int clen = toInt(out.getNumColumns());
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlen, clen);	// Allocated the dense output matrix
 		Pointer ret = getDensePointer(gCtx, out, instName);
-		
+
 		// out.getMatrixCharacteristics().setNonZeros(rlen*clen);
 		// compareAndSet(double* A,  double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal)
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("compare_and_set",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
-						A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
+				ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
+				A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0);
 	}
 
@@ -2935,7 +2935,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName,
-														boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
+			boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
@@ -2944,7 +2944,7 @@ public class LibMatrixCUDA {
 		Pointer betaPtr = pointerTo(beta);
 		int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 		int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
-		
+
 		long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
 		long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
 
@@ -3086,6 +3086,70 @@ public class LibMatrixCUDA {
 	//**************** Matrix Manipulation Functions *********************/
 	//********************************************************************/
 
+	/**
+	 * Method to perform rangeReIndex operation for a given lower and upper bounds in row and column dimensions.
+	 *  
+	 * @param ec current execution context
+	 * @param gCtx current gpu context
+	 * @param instName name of the instruction for maintaining statistics
+	 * @param in1 input matrix object
+	 * @param ixrange index range (0-based)
+	 * @param outputName output matrix object
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1,
+			IndexRange ixrange, String outputName) throws DMLRuntimeException {
+		if (ec.getGPUContext(0) != gCtx)
+			throw new DMLRuntimeException(
+					"GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+		LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx);
+
+		int rl = (int) ixrange.rowStart;
+		int ru = (int) ixrange.rowEnd;
+		int cl = (int) ixrange.colStart;
+		int cu = (int) ixrange.colEnd;
+		if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0
+				|| cu >= in1.getNumColumns() || cu < cl || cu >= in1.getNumColumns()) {
+			throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + ","
+					+ (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + ","
+					+ in1.getNumColumns() + "]");
+		}
+
+		int len1 = toInt(in1.getNumColumns());
+		int len2 = toInt(ec.getMatrixObject(outputName).getNumColumns());
+		if(isInSparseFormat(gCtx, in1)) {
+			// Input in1 is in sparse format and output is in dense format
+			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
+			CSRPointer inPointer = getSparsePointer(gCtx, in1, instName);
+			Pointer outPointer = getDensePointer(gCtx, out, instName);
+			int size = ru - rl + 1;
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			// Performs a slice operation where the input matrix is sparse and the output matrix is dense.
+			// This function avoids unnecessary sparse to dense conversion of the input matrix.
+			// We can generalize this later to output sparse matrix.
+			getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size),
+					inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0);
+		}
+		else {
+			// Input in1 is in dense format (see inPointer)
+			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
+
+			Pointer inPointer = getDensePointer(gCtx, in1, instName);
+			Pointer outPointer = getDensePointer(gCtx, out, instName);
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			if (len1 == len2) {
+				cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1
+						* Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+			} else {
+				for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) {
+					cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE),
+							inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+				}
+			}
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
+		}
+	}
 
 	public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
@@ -3093,7 +3157,7 @@ public class LibMatrixCUDA {
 		LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
 
 		long t1 = 0;
-		
+
 		long rowsA = toInt(in1.getNumRows());
 		long colsA = toInt(in1.getNumColumns());
 		long rowsB = toInt(in2.getNumRows());
@@ -3114,8 +3178,8 @@ public class LibMatrixCUDA {
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx)
-				.launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
-						rowsA, colsA, rowsB, colsB);
+		.launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
+				rowsA, colsA, rowsB, colsB);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1);
 
 	}
@@ -3126,7 +3190,7 @@ public class LibMatrixCUDA {
 		LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
 
 		long t1 = 0;
-		
+
 		int rowsA = toInt(in1.getNumRows());
 		int colsA = toInt(in1.getNumColumns());
 		int rowsB = toInt(in2.getNumRows());
@@ -3147,8 +3211,8 @@ public class LibMatrixCUDA {
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx)
-				.launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
-						rowsA, colsA, rowsB, colsB);
+		.launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
+				rowsA, colsA, rowsB, colsB);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1);
 
 	}
@@ -3422,7 +3486,7 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
-													String outputName,  double constant) throws DMLRuntimeException {
+			String outputName,  double constant) throws DMLRuntimeException {
 		if (ec.getGPUContext(0) != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		Pointer A = getDensePointer(gCtx, in1, instName);
@@ -3433,9 +3497,9 @@ public class LibMatrixCUDA {
 
 		long t1=0, t2=0;
 		if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
-            LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
+			LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
 
-            // Matrix-Matrix daxpy
+			// Matrix-Matrix daxpy
 			long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
 			Pointer alphaPtr = pointerTo(constant);
 			// C <- A + alpha*B
@@ -3451,9 +3515,9 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
 		}
 		else {
-            LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
+			LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
 
-            // Matrix-Vector daxpy
+			// Matrix-Vector daxpy
 			// Note: Vector-Matrix operation is not supported
 			// daxpy_matrix_vector(double* A,  double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
@@ -3466,51 +3530,51 @@ public class LibMatrixCUDA {
 	}
 
 
-    /**
-     * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
-     *
-     * @param ec         a valid {@link ExecutionContext}
-     * @param gCtx       a valid {@link GPUContext}
-     * @param instName   the invoking instruction's name for record {@link Statistics}.
-     * @param in1        input matrix A
-     * @param in2        input matrix B
-     * @param outputName name of the output matrix
-     * @throws DMLRuntimeException if an error occurs
-     */
-    public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
-        if (ec.getGPUContext(0) != gCtx)
-            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+	/**
+	 * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
+	 *
+	 * @param ec         a valid {@link ExecutionContext}
+	 * @param gCtx       a valid {@link GPUContext}
+	 * @param instName   the invoking instruction's name for record {@link Statistics}.
+	 * @param in1        input matrix A
+	 * @param in2        input matrix B
+	 * @param outputName name of the output matrix
+	 * @throws DMLRuntimeException if an error occurs
+	 */
+	public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
+		if (ec.getGPUContext(0) != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 
-        // x = solve(A, b)
+		// x = solve(A, b)
 		LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
 
 		long t0 = -1;
 
-        if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) {    // Both dense
-            GPUObject Aobj = in1.getGPUObject(gCtx);
-            GPUObject bobj = in2.getGPUObject(gCtx);
-            int m = toInt(in1.getNumRows());
-            int n = toInt(in1.getNumColumns());
-            if (in2.getNumRows() != m)
-                throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
-            if (in2.getNumColumns() != 1)
-                throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
-
-
-            // Copy over matrices and
-            // convert dense matrices to row major
-            // Operation in cuSolver and cuBlas are for column major dense matrices
-            // and are destructive to the original input
+		if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) {    // Both dense
+			GPUObject Aobj = in1.getGPUObject(gCtx);
+			GPUObject bobj = in2.getGPUObject(gCtx);
+			int m = toInt(in1.getNumRows());
+			int n = toInt(in1.getNumColumns());
+			if (in2.getNumRows() != m)
+				throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
+			if (in2.getNumColumns() != 1)
+				throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
+
+
+			// Copy over matrices and
+			// convert dense matrices to row major
+			// Operation in cuSolver and cuBlas are for column major dense matrices
+			// and are destructive to the original input
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-            GPUObject ATobj = (GPUObject) Aobj.clone();
+			GPUObject ATobj = (GPUObject) Aobj.clone();
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			ATobj.denseRowMajorToColumnMajor();
-            if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
-            Pointer A = ATobj.getJcudaDenseMatrixPtr();
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
+			Pointer A = ATobj.getJcudaDenseMatrixPtr();
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-            GPUObject bTobj = (GPUObject) bobj.clone();
+			GPUObject bTobj = (GPUObject) bobj.clone();
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			bTobj.denseRowMajorToColumnMajor();
@@ -3518,76 +3582,76 @@ public class LibMatrixCUDA {
 
 			Pointer b = bTobj.getJcudaDenseMatrixPtr();
 
-            // The following set of operations is done following the example in the cusolver documentation
-            // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
+			// The following set of operations is done following the example in the cusolver documentation
+			// http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
 
-            // step 3: query working space of geqrf and ormqr
+			// step 3: query working space of geqrf and ormqr
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			int[] lwork = {0};
-            JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
+			JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
 
 
 			// step 4: compute QR factorization
-            Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
-            Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
-            Pointer devInfo = gCtx.allocate(Sizeof.INT);
+			Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
+			Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
+			Pointer devInfo = gCtx.allocate(Sizeof.INT);
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
 
 
 			int[] qrError = {-1};
-            cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
-            if (qrError[0] != 0) {
-                throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
-            }
+			cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
+			if (qrError[0] != 0) {
+				throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
+			}
 
-            // step 5: compute Q^T*B
+			// step 5: compute Q^T*B
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
 			cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
-            if (qrError[0] != 0) {
-                throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
-            }
+			if (qrError[0] != 0) {
+				throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
+			}
 
-            // step 6: compute x = R \ Q^T*B
+			// step 6: compute x = R \ Q^T*B
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
-                cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
-                n, 1, pointerTo(1.0), A, m, b, m);
+					cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
+					n, 1, pointerTo(1.0), A, m, b, m);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 			bTobj.denseColumnMajorToRowMajor();
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
 
-            // TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
-            // There is an avoidable copy happening here
+			// TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
+			// There is an avoidable copy happening here
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-            cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+			cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
 
-            gCtx.cudaFreeHelper(instName, work);
-            gCtx.cudaFreeHelper(instName, tau);
-            ATobj.clearData();
-            bTobj.clearData();
+			gCtx.cudaFreeHelper(instName, work);
+			gCtx.cudaFreeHelper(instName, tau);
+			ATobj.clearData();
+			bTobj.clearData();
 
-            //debugPrintMatrix(b, n, 1);
+			//debugPrintMatrix(b, n, 1);
 
 
-        } else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse
-            throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
-        } else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse
-            // Pointer A = getDensePointer(gCtx, in1, instName);
-            // Pointer B = getDensePointer(gCtx, in2, instName);
-            throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
-        } else if (isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // A is sparse, b is dense
-            throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
-        }
+		} else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse
+			throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
+		} else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse
+			// Pointer A = getDensePointer(gCtx, in1, instName);
+			// Pointer B = getDensePointer(gCtx, in2, instName);
+			throw new DMLRuntimeException("GPU : solv

<TRUNCATED>