Posted to commits@systemml.apache.org by na...@apache.org on 2017/04/21 23:23:18 UTC

[3/5] incubator-systemml git commit: Refactored GPU{Context, Object} to make it friendlier for parfor

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index d858b0b..b4cb87d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -18,34 +18,25 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import static jcuda.driver.JCudaDriver.cuCtxCreate;
-import static jcuda.driver.JCudaDriver.cuCtxGetCurrent;
-import static jcuda.driver.JCudaDriver.cuDeviceGet;
-import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 import static jcuda.driver.JCudaDriver.cuModuleLoadDataEx;
-import static jcuda.driver.JCudaDriver.cuModuleUnload;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 
-import jcuda.runtime.JCuda;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
 
-import jcuda.CudaException;
 import jcuda.Pointer;
-import jcuda.driver.CUcontext;
-import jcuda.driver.CUdevice;
 import jcuda.driver.CUfunction;
 import jcuda.driver.CUmodule;
 import jcuda.driver.CUresult;
 
 /**
- * Utility class that allows LibMatrixCUDA as well as JCudaObject to invoke custom CUDA kernels.
+ * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke custom CUDA kernels.
  * 
  * The utility org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels simplifies the launching of the kernels. 
  * For example: to launch a kernel 
@@ -54,70 +45,23 @@ import jcuda.driver.CUresult;
  */
 public class JCudaKernels {
 
-	private static String ptxFileName = "/kernels/SystemML.ptx";
+	private final static String ptxFileName = "/kernels/SystemML.ptx";
 	private HashMap<String, CUfunction> kernels = new HashMap<String, CUfunction>();
 	private CUmodule module;
+	private final int deviceNum;
 	
 	/**
 	 * Loads the kernels in the file ptxFileName. Though cubin files are also supported, we will stick with
 	 * ptx files as they are target-independent, similar to Java's .class files.
-	 * 
+	 * @param deviceNum  the device number for which to initiate the driver API
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public JCudaKernels() throws DMLRuntimeException {
-		shutdown();
-		initCUDA();
+	JCudaKernels(int deviceNum) throws DMLRuntimeException {
+		this.deviceNum = deviceNum;
 		module = new CUmodule();
 		// Load the kernels specified in the ptxFileName file
 		checkResult(cuModuleLoadDataEx(module, initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0])));
 	}
-	
-	/**
-     * Initializes the JCuda driver API. Then it will try to attach to the 
-     * current CUDA context. If no active CUDA context exists, then it will 
-     * try to create one, for the device which is specified by the current 
-     * deviceNumber.
-     * 
-	 * @throws DMLRuntimeException If it is neither possible to attach to an 
-     * existing context, nor to create a new context.
-     */
-    private static void initCUDA() throws DMLRuntimeException {
-        checkResult(cuInit(0));
-
-        // Try to obtain the current context
-        CUcontext context = new CUcontext();
-        checkResult(cuCtxGetCurrent(context));
-        
-        // If the context is 'null', then a new context
-        // has to be created.
-        CUcontext nullContext = new CUcontext(); 
-        if (context.equals(nullContext)) {
-            createContext();
-        }
-    }
-    
-    /**
-     * Tries to create a context for device 'deviceNumber'.
-     * @throws DMLRuntimeException 
-     * 
-     * @throws CudaException If the device can not be 
-     * accessed or the context can not be created
-     */
-    private static void createContext() throws DMLRuntimeException {
-    	int deviceNumber = 0;
-        CUdevice device = new CUdevice();
-        checkResult(cuDeviceGet(device, deviceNumber));
-        CUcontext context = new CUcontext();
-        checkResult(cuCtxCreate(context, 0, device));
-    }
-    
-	/**
-	 * Performs cleanup actions such as unloading the module
-	 */
-	public void shutdown() {
-		if(module != null)
-			cuModuleUnload(module);
-	}
 
 	/**
 	 * Sets up the kernel parameters and launches the kernel using the cuLaunchKernel API.
@@ -167,7 +111,7 @@ public class JCudaKernels {
 				config.gridDimX, config.gridDimY, config.gridDimZ, 
 				config.blockDimX, config.blockDimY, config.blockDimZ, 
 				config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null));
-		JCuda.cudaDeviceSynchronize();
+		//JCuda.cudaDeviceSynchronize();
 	}
 	
     public static void checkResult(int cuResult) throws DMLRuntimeException {
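
For reference, the launch pattern that JCudaKernels provides looks roughly like the sketch below. It mirrors the "fill" call in the (pre-refactoring) JCudaObject.allocateAndFillDense deleted further down in this patch; the static LibMatrixCUDA.kernels handle, the ExecutionConfig helper and their package locations are assumed to be as in that deleted code, so treat this as an illustration of the call shape rather than code from this commit.

import jcuda.Pointer;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

public class FillKernelSketch {
	// Fills a device-resident double vector of length numElems with `value`
	// by launching the custom "fill" kernel loaded from /kernels/SystemML.ptx.
	static void fillOnDevice(Pointer devicePtr, double value, int numElems)
			throws DMLRuntimeException {
		// ExecutionConfig picks grid/block dimensions for a simple 1D map over the vector.
		LibMatrixCUDA.kernels.launchKernel("fill",
				ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
				devicePtr, value, numElems);
	}
}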

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
deleted file mode 100644
index bc0031b..0000000
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ /dev/null
@@ -1,1330 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import jcuda.Pointer;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnTensorDescriptor;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseDirection;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.jcusparse.cusparseMatDescr;
-import jcuda.jcusparse.cusparsePointerMode;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.CacheException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
-import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
-import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
-import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.LRUCacheMap;
-
-import java.util.HashMap;
-import java.util.LinkedList;
-
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
-import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
-import static jcuda.jcusparse.JCusparse.cusparseDnnz;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
-import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
-import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
-import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
-import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
-/**
- * Handle to a matrix block on the GPU
- */
-public class JCudaObject extends GPUObject {
-
-	private static final Log LOG = LogFactory.getLog(JCudaObject.class.getName());
-	
-	// An optional tensor descriptor (and shape) that can be set by a tensor instruction such as convolution, maxpooling
-	// and exploited by a subsequent non-tensor instruction such as relu
-	private cudnnTensorDescriptor tensorDescriptor = null;
-	private int [] tensorShape = null;
-	
-	/**
-	 * Returns a previously allocated tensor shape or null
-	 * @return int array of four elements or null
-	 */
-	public int [] getTensorShape() {
-		return tensorShape;
-	}
-	
-	/**
-	 * Returns a previously allocated tensor descriptor or null
-	 * @return cudnn tensor descriptor
-	 */
-	public cudnnTensorDescriptor getTensorDescriptor() {
-		return tensorDescriptor;
-	}
-	
-	/**
-	 * Returns a previously allocated tensor descriptor, or allocates one and returns it
-	 * @param N number of images
-	 * @param C number of channels
-	 * @param H height
-	 * @param W width
-	 * @return cudnn tensor descriptor
-	 */
-	public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
-		if(tensorDescriptor == null) {
-			tensorDescriptor = new cudnnTensorDescriptor();
-			cudnnCreateTensorDescriptor(tensorDescriptor);
-			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
-			tensorShape = new int[4];
-			tensorShape[0] = N;
-			tensorShape[1] = C;
-			tensorShape[2] = H;
-			tensorShape[3] = W;
-		}
-		return tensorDescriptor;
-	}
-	
-	/**
-	 * Compressed Sparse Row (CSR) format for CUDA
-	 * Generalized matrix multiply is implemented for CSR format in the cuSparse library among other operations
-	 */
-	public static class CSRPointer {
-		
-		public static cusparseMatDescr matrixDescriptor;
-		
-		/**
-		 * @return Singleton default matrix descriptor object 
-		 * 			(set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
-		 */
-		public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor(){
-			if (matrixDescriptor == null){
-				// Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
-				matrixDescriptor = new cusparseMatDescr();
-				cusparseCreateMatDescr(matrixDescriptor);
-				cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
-				cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
-			}
-			return matrixDescriptor;
-		}
-		
-		private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
-
-		/**
-		 * Default constructor to help with Factory method {@link #allocateEmpty(long, long)}
-		 */
-		private CSRPointer() {
-			val = new Pointer();
-			rowPtr = new Pointer();
-			colInd = new Pointer();
-			allocateMatDescrPointer();
-		}
-		
-		/** Number of non zeroes	 									*/
-		public long nnz;
-		/** double array of non zero values 							*/
-		public Pointer val;
-		/** integer array of start of all rows and end of last row + 1 	*/
-		public Pointer rowPtr;
-		/** integer array of nnz values' column indices					*/
-		public Pointer colInd;
-		/** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported	*/
-		public cusparseMatDescr descr;
-		
-		/** 
-		 * Check for ultra sparsity
-		 * 
-		 * @param rows number of rows
-		 * @param cols number of columns
-		 * @return true if ultra sparse
-		 */
-		public boolean isUltraSparse(int rows, int cols) {
-			double sp = ((double)nnz/rows/cols);
-			return sp<ULTRA_SPARSITY_TURN_POINT;
-		}
-		
-		/**
-		 * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
-		 * the default that works for DGEMM.
-		 */
-		private void allocateMatDescrPointer() {			
-			this.descr = getDefaultCuSparseMatrixDescriptor();
-		}
-		
-		/**
-		 * Estimate the size of a CSR matrix in GPU memory
-		 * The size of the pointer objects themselves is not needed and is not included
-		 * @param nnz2	number of non zeroes
-		 * @param rows	number of rows 
-		 * @return size estimate
-		 */
-		public static long estimateSize(long nnz2, long rows) {
-			long sizeofValArray = getDoubleSizeOf(nnz2);
-			long sizeofRowPtrArray  = getIntSizeOf(rows + 1);
-			long sizeofColIndArray = getIntSizeOf(nnz2);
-			long sizeofDescr = getIntSizeOf(4);
-			// From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: 
-			// typedef struct {
-			// 	cusparseMatrixType_t MatrixType;
-			//	cusparseFillMode_t FillMode;
-			//	cusparseDiagType_t DiagType;
-			// 	cusparseIndexBase_t IndexBase;
-			// } cusparseMatDescr_t;
-			long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
-			return tot;
-		}
-		
-		/** 
-		 * Factory method to allocate an empty CSR Sparse matrix on the GPU
-		 * @param nnz2	number of non-zeroes
-		 * @param rows 	number of rows
-		 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-		 * @throws DMLRuntimeException if DMLRuntimeException occurs
-		 */
-		public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException {
-			assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
-			CSRPointer r = new CSRPointer();
-			r.nnz = nnz2;
-			if(nnz2 == 0) {
-				// The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
-				// with no memory allocated on the GPU.
-				return r;
-			}
-			ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
-			// increment the cudaCount by 1 for the allocation of all 3 arrays
-			r.val = allocate(null, getDoubleSizeOf(nnz2), 0);
-			r.rowPtr = allocate(null, getIntSizeOf(rows + 1), 0);
-			r.colInd = allocate(null, getIntSizeOf(nnz2), 1);
-			return r;
-		}
-		
-		/**
-		 * Static method to copy a CSR sparse matrix from Host to Device
-		 * @param dest	[input] destination location (on GPU)
-		 * @param rows	number of rows
-		 * @param nnz 	number of non-zeroes
-		 * @param rowPtr	integer array of row pointers
-		 * @param colInd	integer array of column indices
-		 * @param values	double array of non zero values
-		 */
-		public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
-			CSRPointer r = dest;
-			long t0=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			r.nnz = nnz;
-			cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
-			cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-			cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
-		}
-		
-		/**
-		 * Static method to copy a CSR sparse matrix from Device to host
-		 * @param src	[input] source location (on GPU)
-		 * @param rows	[input] number of rows
-		 * @param nnz	[input] number of non-zeroes
-		 * @param rowPtr	[output] pre-allocated integer array of row pointers of size (rows+1)
-		 * @param colInd	[output] pre-allocated integer array of column indices of size nnz
-		 * @param values	[output] pre-allocated double array of values of size nnz
-		 */
-		public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values){
-			CSRPointer r = src;
-			long t0=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
-		}
-		
-		// ==============================================================================================
-
-		// The following methods estimate the memory needed for sparse matrices that are
-		// results of operations on other sparse matrices using the cuSparse Library.
-		// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
-		// op = whether to transpose or not
-		// binaryOperation = For cuSparse, +, - and * (matmul) are supported
-
-		// From CuSparse Manual,
-		// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
-		// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
-		// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
-		// and the total number of nonzero elements. In the second step, the user gathers nnzC
-		//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
-		// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
-		// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
-		// to complete matrix C.
-
-		/**
-		 * Allocate row pointers of m+1 elements
-		 * 
-		 * @param handle	a valid {@link cusparseHandle}
-		 * @param C			Output matrix
-		 * @param rowsC			number of rows in C
-		 * @throws DMLRuntimeException ?
-		 */
-		private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
-			cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-            cudaDeviceSynchronize();
-			// Do not increment the cudaCount of allocations on GPU
-			C.rowPtr = allocate(null, getIntSizeOf((long)rowsC+1), 0);
-		}
-		
-		/**
-		 * Determine the total number of nonzero elements for the cusparseDgeam operation.
-		 * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
-		 * 
-		 * @param handle	a valid {@link cusparseHandle}
-		 * @param A			Sparse Matrix A on GPU
-		 * @param B			Sparse Matrix B on GPU
-		 * @param C			Output Sparse Matrix C on GPU
-		 * @param m			Rows in C
-		 * @param n			Columns in C
-		 * @throws DMLRuntimeException ?
-		 */
-		private static void step2GatherNNZGeam(cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
-			int[] CnnzArray = { -1 };
-			cusparseXcsrgeamNnz(handle, m, n,  
-					A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, 
-					B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, 
-					C.descr, C.rowPtr, Pointer.to(CnnzArray));
-            cudaDeviceSynchronize();
-			if (CnnzArray[0] != -1){
-				C.nnz = CnnzArray[0];
-			}
-			else {
-		        int baseArray[] = { 0 };
-		        cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
-	            cudaMemcpy(Pointer.to(baseArray), C.rowPtr,								   getIntSizeOf(1), cudaMemcpyDeviceToHost);
-	            C.nnz = CnnzArray[0] - baseArray[0];
-			}
-		}
-
-		/**
-		 *	Determine the total number of nonzero elements for the cusparseDgemm operation.
-		 *
-		 * @param handle	a valid {@link cusparseHandle}
-		 * @param A			Sparse Matrix A on GPU
-		 * @param transA	op - whether A is transposed
-		 * @param B			Sparse Matrix B on GPU
-		 * @param transB	op - whether B is transposed
-		 * @param C			Output Sparse Matrix C on GPU
-		 * @param m			Number of rows of sparse matrix op ( A ) and C
-		 * @param n			Number of columns of sparse matrix op ( B ) and C
-		 * @param k			Number of columns/rows of sparse matrix op ( A ) / op ( B )
-		 * @throws DMLRuntimeException ?
-		 */
-		private static void step2GatherNNZGemm(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
-			int[] CnnzArray = { -1 };
-			if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { 
-				throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); 
-			}
-			cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, 
-					A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, 
-					B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, 
-					C.descr, C.rowPtr, Pointer.to(CnnzArray));
-            cudaDeviceSynchronize();
-			if (CnnzArray[0] != -1){
-				C.nnz = CnnzArray[0];
-			}
-			else {
-		        int baseArray[] = { 0 };
-		        cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
-	            cudaMemcpy(Pointer.to(baseArray), C.rowPtr,								   getIntSizeOf(1), cudaMemcpyDeviceToHost);
-	            C.nnz = CnnzArray[0] - baseArray[0];
-			}
-		}
-
-		/**
-		 * Allocate val and index pointers.
-		 * 
-		 * @param handle	a valid {@link cusparseHandle}
-		 * @param C			Output sparse matrix on GPU
-		 * @throws DMLRuntimeException ?
-		 */
-		private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
-			// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-			C.val = allocate(null, getDoubleSizeOf(C.nnz), 0);
-			C.colInd = allocate(null, getIntSizeOf(C.nnz), 1);
-		}
-
-		// ==============================================================================================
-
-
-		/**
-		 * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
-		 * C = a op(A) + b op(B)
-		 * @param handle 	a valid {@link cusparseHandle}
-		 * @param A			Sparse Matrix A on GPU
-		 * @param B			Sparse Matrix B on GPU
-		 * @param m			Rows in A
-		 * @param n			Columns in B
-		 * @return CSR (compressed sparse row) pointer
-		 * @throws DMLRuntimeException if DMLRuntimeException occurs
-		 */
-		public static CSRPointer allocateForDgeam(cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) 
-				throws DMLRuntimeException{
-			if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { 
-				throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); 
-			}
-			CSRPointer C = new CSRPointer();
-			step1AllocateRowPointers(handle, C, m);
-			step2GatherNNZGeam(handle, A, B, C, m, n);
-			step3AllocateValNInd(handle, C);
-			return C;
-		}
-		
-		/**
-		 * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
-		 * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
-		 * @param handle	a valid {@link cusparseHandle}
-		 * @param A			Sparse Matrix A on GPU
-		 * @param transA	'T' if A is to be transposed, 'N' otherwise
-		 * @param B			Sparse Matrix B on GPU
-		 * @param transB	'T' if B is to be transposed, 'N' otherwise
-		 * @param m			Rows in A
-		 * @param n			Columns in B
-		 * @param k			Columns in A / Rows in B
-		 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-		 * @throws DMLRuntimeException if DMLRuntimeException occurs
-		 */
-		public static CSRPointer allocateForMatrixMultiply(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k) 
-				throws DMLRuntimeException{
-			// Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-			// https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
-			CSRPointer C = new CSRPointer();
-			step1AllocateRowPointers(handle, C, m);
-			step2GatherNNZGemm(handle, A, transA, B, transB, C, m, n, k);
-			step3AllocateValNInd(handle, C);
-			return C;
-		}
-		
-		/**
-		 * Copies this CSR matrix on the GPU to a dense column-major matrix
-		 * on the GPU. This is a temporary matrix for operations such as 
-		 * cusparseDcsrmv.
-		 * Since the allocated matrix is temporary, bookkeeping is not updated.
-		 * The caller is responsible for calling "free" on the returned Pointer object
-		 * @param cusparseHandle	a valid {@link cusparseHandle}
-		 * @param cublasHandle 		a valid {@link cublasHandle}
-		 * @param rows		number of rows in this CSR matrix
-		 * @param cols		number of columns in this CSR matrix
-		 * @return			A {@link Pointer} to the allocated dense matrix (in column-major format)
-		 * @throws DMLRuntimeException if DMLRuntimeException occurs
-		 */
-		public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
-			long size = ((long)rows) * getDoubleSizeOf((long)cols);
-			Pointer A = JCudaObject.allocate(size);
-			// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
-			if (val != null && rowPtr != null && colInd != null && nnz > 0) {
-				// Note: cusparseDcsr2dense method cannot handle empty blocks
-				cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
-                cudaDeviceSynchronize();
-			} else {
-				LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null");
-			}
-			return A;
-		}
-		
-		/**
-		 * Calls cudaFree lazily on the allocated {@link Pointer} instances
-		 */
-		public void deallocate() {
-			deallocate(false);
-		}
-
-		/**
-		 * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
-		 * @param eager whether to do eager or lazy cudaFrees
-		 */
-		public void deallocate(boolean eager){
-			if (nnz > 0) {
-				cudaFreeHelper(val, eager);
-				cudaFreeHelper(rowPtr, eager);
-				cudaFreeHelper(colInd, eager);
-			}
-		}
-	};
-	
-	private static long getDoubleSizeOf(long numElems) {
-		return numElems * ((long)jcuda.Sizeof.DOUBLE);
-	}
-	
-	private static long getIntSizeOf(long numElems) {
-		return numElems * ((long)jcuda.Sizeof.INT);
-	}
-	
-	public synchronized boolean isAllocated() {
-		return (jcudaDenseMatrixPtr != null || jcudaSparseMatrixPtr != null);
-	}
-	
-	/** Pointer to dense matrix */
-	public Pointer jcudaDenseMatrixPtr = null;
-	/** Pointer to sparse matrix */
-	public CSRPointer jcudaSparseMatrixPtr = null;
-
-	public long numBytes;							/** Number of bytes occupied by this block on GPU */
-
-	/**
-	 * Initializes this JCudaObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
-	 * @param m
-	 */
-	JCudaObject(MatrixObject m) {
-		super(m);
-	}
-
-	/**
-	 * Allocates a sparse and empty {@link JCudaObject}.
-	 * This is the result of operations that produce an empty sparse matrix.
-	 * 
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public void allocateSparseAndEmpty() throws DMLRuntimeException{
-		setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(0, mat.getNumRows()));
-		setDeviceModify(0);
-	}
-
-
-	/**
-	 * Allocates a dense matrix of size obtained from the attached matrix metadata
-	 * and fills it up with a single value
-	 * 
-	 * @param v value to fill up the dense matrix
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public void allocateAndFillDense(double v) throws DMLRuntimeException {
-		long rows = mat.getNumRows();
-		long cols = mat.getNumColumns();
-		int numElems = toIntExact(rows * cols);
-		long size = getDoubleSizeOf(numElems);
-		setDenseMatrixCudaPointer(allocate(size));
-		setDeviceModify(size);
-		// The "fill" kernel is called, which treats the matrix "jcudaDenseMatrixPtr" like a vector and fills it with the value "v"
-		LibMatrixCUDA.kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), jcudaDenseMatrixPtr, v, numElems);
-	}
-
-	/**
-	 * Whether this {@link JCudaObject} is sparse and empty.
-	 * Being allocated is a prerequisite to being sparse and empty.
-	 * 
-	 * @return true if sparse and empty
-	 */
-	public boolean isSparseAndEmpty() {
-		boolean isSparseAndAllocated = isAllocated()&& LibMatrixCUDA.isInSparseFormat(mat);
-		boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && jcudaSparseMatrixPtr.nnz == 0;
-		return isEmptyAndSparseAndAllocated;
-	}
-
-	@Override
-	public synchronized boolean acquireDeviceRead() throws DMLRuntimeException {
-		boolean transferred = false;
-		if(!isAllocated()) {
-			copyFromHostToDevice();
-			transferred = true;
-		} else {
-			numLocks.addAndGet(1);
-		}
-		if(!isAllocated())
-			throw new DMLRuntimeException("Expected device data to be allocated");
-		return transferred;
-	}
-	
-	@Override
-	public synchronized boolean acquireDeviceModifyDense() throws DMLRuntimeException {
-		boolean allocated = false;
-		if(!isAllocated()) {
-			mat.setDirty(true);
-			// Dense block, size = numRows * numCols
-			allocateDenseMatrixOnDevice();
-			allocated = true;
-			synchronized(evictionLock) {
-				JCudaContext.allocatedPointers.add(this);
-			}
-		}
-		isDeviceCopyModified = true;
-		if(!isAllocated()) 
-			throw new DMLRuntimeException("Expected device data to be allocated");
-		return allocated;
-	}
-	
-	@Override
-	public synchronized boolean acquireDeviceModifySparse() throws DMLRuntimeException {
-		boolean allocated = false;
-		isInSparseFormat = true;
-		if(!isAllocated()) {
-			mat.setDirty(true);
-			allocateSparseMatrixOnDevice();
-			allocated = true;
-			synchronized(evictionLock) {
-				JCudaContext.allocatedPointers.add(this);
-			}
-		}
-		isDeviceCopyModified = true;
-		if(!isAllocated()) 
-			throw new DMLRuntimeException("Expected device data to be allocated");
-		return allocated;
-	}
-	
-	@Override
-	public synchronized boolean acquireHostRead() throws CacheException {
-		boolean copied = false;
-		if(isAllocated()) {
-			try {
-				if(isDeviceCopyModified) {
-					copyFromDeviceToHost();
-					copied = true;
-				}
-			} catch (DMLRuntimeException e) {
-				throw new CacheException(e);
-			}
-		}
-		else {
-			throw new CacheException("Cannot perform acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
-		}
-		return copied;
-	}
-	
-	/**
-	 * updates the locks depending on the eviction policy selected
-	 * @throws CacheException if there is no locked GPU Object
-	 */
-	private void updateReleaseLocks() throws CacheException {
-		if(numLocks.addAndGet(-1) < 0) {
-            throw new CacheException("Redundant release of GPU object");
-		}
-		if(evictionPolicy == EvictionPolicy.LRU) {
-            timestamp.set(System.nanoTime());
-		}
-		else if(evictionPolicy == EvictionPolicy.LFU) {
-            timestamp.addAndGet(1);
-		}
-		else if(evictionPolicy == EvictionPolicy.MIN_EVICT) {
-            // Do Nothing
-		}
-		else {
-            throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name());
-		}
-	}
-	
-	/**
-	 * releases input allocated on GPU
-	 * @throws CacheException if data is not allocated
-	 */
-	public synchronized void releaseInput() throws CacheException {
-		updateReleaseLocks();
-		if(!isAllocated())
-			throw new CacheException("Attempting to release an input before allocating it");
-	}
-
-	/**
-	@Override
-	void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException {
-		if(!isAllocated()) {
-			long start = System.nanoTime();
-			if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
-				setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
-				numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
-				JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-				isInSparseFormat = true;
-				//throw new DMLRuntimeException("Sparse format not implemented");
-			} else if(numElemToAllocate == -1) {
-				// Called for dense input
-				setDenseMatrixCudaPointer(new Pointer());
-				numBytes = mat.getNumRows()*getDoubleSizeOf(mat.getNumColumns());
-				cudaMalloc(jcudaDenseMatrixPtr, numBytes);
-				JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-			}
-			else {
-				// Called for dense output
-				setDenseMatrixCudaPointer(new Pointer());
-				numBytes = getDoubleSizeOf(numElemToAllocate);
-				if(numElemToAllocate <= 0 || numBytes <= 0)
-					throw new DMLRuntimeException("Cannot allocate dense matrix object with " + numElemToAllocate + " elements and size " + numBytes);
-				cudaMalloc(jcudaDenseMatrixPtr,  numBytes);
-				JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-			}
-
-			GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
-			GPUStatistics.cudaAllocCount.addAndGet(1);
-
-		}
-	}
-	 */
-
-	@Override
-	void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
-		assert !isAllocated() : "Internal error - trying to allocate a dense matrix on a JCudaObject that is already allocated";
-		long rows = mat.getNumRows();
-		long cols = mat.getNumColumns();
-		assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
-		assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix";
-        long size = getDoubleSizeOf(rows * cols);
-		Pointer tmp = allocate(size);
-		setDenseMatrixCudaPointer(tmp);
-		setDeviceModify(size);
-	}
-
-	@Override
-	void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
-		assert !isAllocated() : "Internal error - trying to allocate a sparse matrix on a JCudaObject that is already allocated";
-		long rows = mat.getNumRows();
-		long nnz = mat.getNnz();
-		assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
-		assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
-		CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows);
-		setSparseMatrixCudaPointer(tmp);
-		long size = CSRPointer.estimateSize(nnz, rows);
-		setDeviceModify(size);
-	}
-
-	/**
-	 * releases output allocated on GPU
-	 * @throws CacheException if data is not allocated
-	 */
-    @Override
-	public synchronized void releaseOutput() throws CacheException {
-		updateReleaseLocks();
-		isDeviceCopyModified = true;
-		if(!isAllocated())
-			throw new CacheException("Attempting to release an output before allocating it");
-	}
-
-	@Override
-	public void setDeviceModify(long numBytes) {
-		this.numLocks.addAndGet(1);
-		this.numBytes = numBytes;
-		((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(-numBytes);
-	}
-
-	@Override
-	void deallocateMemoryOnDevice(boolean eager) {
-		if(jcudaDenseMatrixPtr != null) {
-			cudaFreeHelper(null, jcudaDenseMatrixPtr, eager);
-			((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-		}
-		if (jcudaSparseMatrixPtr != null) {
-			jcudaSparseMatrixPtr.deallocate(eager);
-			((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-		}
-		jcudaDenseMatrixPtr = null;
-		jcudaSparseMatrixPtr = null;
-		if(tensorDescriptor != null) {
-			cudnnDestroyTensorDescriptor(tensorDescriptor);
-			tensorDescriptor = null;
-		}
-		numLocks.set(0);
-	}
-	
-	/** 
-	 * Thin wrapper over {@link #evict(long)}
-	 * @param size size to check
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	static void ensureFreeSpace(long size) throws DMLRuntimeException {
-		ensureFreeSpace(null, size);
-	}
-
-	/**
-	 * Thin wrapper over {@link #evict(long)}
-	 * @param instructionName instructionName name of the instruction for which performance measurements are made
-	 * @param size size to check
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	static void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
-		if(size >= getAvailableMemory()) {
-			evict(instructionName, size);
-		}
-	}
-	
-	@Override
-	void copyFromHostToDevice() 
-		throws DMLRuntimeException 
-	{
-		printCaller();
-		long start=0;
-		if (DMLScript.STATISTICS) start = System.nanoTime();
-		
-		MatrixBlock tmp = mat.acquireRead();
-		if(tmp.isInSparseFormat()) {
-			
-			int rowPtr[] = null;
-			int colInd[] = null;
-			double[] values = null;
-			
-			tmp.recomputeNonZeros();
-			long nnz = tmp.getNonZeros();
-			mat.getMatrixCharacteristics().setNonZeros(nnz);
-			
-			SparseBlock block = tmp.getSparseBlock();
-			boolean copyToDevice = true;
-			if(block == null && tmp.getNonZeros() == 0) {
-//				// Allocate empty block --> not necessary
-//				// To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
-//				rowPtr = new int[0];
-//				colInd = new int[0];
-//				values = new double[0];
-				copyToDevice = false;
-			}
-			else if(block == null && tmp.getNonZeros() != 0) {
-				throw new DMLRuntimeException("Expected CP sparse block to be not null.");
-			}
-			else {
-				// CSR is the preferred format for cuSparse GEMM
-				// Converts MCSR and COO to CSR
-				SparseBlockCSR csrBlock = null;
-				long t0=0;
-				if (block instanceof SparseBlockCSR){ 
-					csrBlock = (SparseBlockCSR)block;
-				} else if (block instanceof SparseBlockCOO) {
-					// TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
-					if (DMLScript.STATISTICS) t0 = System.nanoTime();
-					SparseBlockCOO cooBlock = (SparseBlockCOO)block;
-					csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
-					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
-				} else if (block instanceof SparseBlockMCSR) {
-					if (DMLScript.STATISTICS) t0 = System.nanoTime();
-					SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
-					csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
-					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
-				} else {
-					throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
-				}
-				rowPtr = csrBlock.rowPointers();
-				colInd = csrBlock.indexes();
-				values = csrBlock.values();	
-			}
-			allocateSparseMatrixOnDevice();
-			synchronized(evictionLock) {
-				JCudaContext.allocatedPointers.add(this);
-			}
-			if(copyToDevice) {
-				CSRPointer.copyToDevice(jcudaSparseMatrixPtr, tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
-			}
-		}
-		else {
-			double[] data = tmp.getDenseBlock();
-			
-			if( data == null && tmp.getSparseBlock() != null )
-				throw new DMLRuntimeException("Incorrect sparsity calculation");
-			else if( data==null && tmp.getNonZeros() != 0 )
-				throw new DMLRuntimeException("MatrixBlock is not allocated");
-			else if( tmp.getNonZeros() == 0 )
-				data = new double[tmp.getNumRows()*tmp.getNumColumns()];
-			
-			// Copy dense block
-			allocateDenseMatrixOnDevice();
-			synchronized(evictionLock) {
-				JCudaContext.allocatedPointers.add(this);
-			}
-			cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
-		}
-		
-		mat.release();
-
-		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1);
-	}
-	
-	public static int toIntExact(long l) throws DMLRuntimeException {
-	    if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
-	        throw new DMLRuntimeException("Cannot be cast to int:" + l);
-	    }
-	    return (int) l;
-	}
-
-	@Override
-	protected void copyFromDeviceToHost() throws DMLRuntimeException {
-		if (jcudaDenseMatrixPtr != null && jcudaSparseMatrixPtr != null){
-			throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointers are both allocated");
-		}
-
-		if(jcudaDenseMatrixPtr != null) {
-			printCaller();
-			long start=0;
-			if (DMLScript.STATISTICS) start = System.nanoTime();
-			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
-			tmp.allocateDenseBlock();
-			double [] data = tmp.getDenseBlock();
-			
-			cudaMemcpy(Pointer.to(data), jcudaDenseMatrixPtr, getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);
-			
-			tmp.recomputeNonZeros();
-			mat.acquireModify(tmp);
-			mat.release();
-
-			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
-		}
-		else if (jcudaSparseMatrixPtr != null){
-			printCaller();
-			if(!LibMatrixCUDA.isInSparseFormat(mat))
-				throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");
-
-			if(this.isSparseAndEmpty()){
-				MatrixBlock tmp = new MatrixBlock();	// Empty Block
-				mat.acquireModify(tmp);
-				mat.release();
-			} else {
-				long start=0;
-				if (DMLScript.STATISTICS) start = System.nanoTime();
-
-				int rows = toIntExact(mat.getNumRows());
-				int cols = toIntExact(mat.getNumColumns());
-				int nnz = toIntExact(jcudaSparseMatrixPtr.nnz);
-				int[] rowPtr = new int[rows + 1];
-				int[] colInd = new int[nnz];
-				double[] values = new double[nnz];
-				CSRPointer.copyToHost(jcudaSparseMatrixPtr, rows, nnz, rowPtr, colInd, values);
-
-				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
-				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
-				mat.acquireModify(tmp);
-				mat.release();
-				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
-			}
-		}
-		else {
-			throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
-		}
-		isDeviceCopyModified = false;
-	}
-
-	@Override
-	protected long getSizeOnDevice() throws DMLRuntimeException {
-		long GPUSize = 0;
-		long rlen = mat.getNumRows();
-		long clen = mat.getNumColumns();
-		long nnz = mat.getNnz();
-
-		if(LibMatrixCUDA.isInSparseFormat(mat)) {
-			GPUSize = CSRPointer.estimateSize(nnz, rlen);
-		}
-		else {
-			GPUSize = getDoubleSizeOf(rlen * clen);
-		}
-		return GPUSize;
-	}
-	
-	private String getClassAndMethod(StackTraceElement st) {
-		String [] str = st.getClassName().split("\\.");
-		return str[str.length - 1] + "." + st.getMethodName();
-	}
-	
-	/**
-	 * Convenience debugging method.
-	 * Checks {@link JCudaContext#DEBUG} flag before printing to System.out
-	 */
-	private void printCaller() {
-		if(JCudaContext.DEBUG) {
-			StackTraceElement[] st = Thread.currentThread().getStackTrace();
-			String ret = getClassAndMethod(st[1]);
-			for (int i = 2; i < st.length && i < 7; i++) {
-				ret += "->" + getClassAndMethod(st[i]);
-			}
-			System.out.println("CALL_STACK:" + ret);
-		}
-	}
-	
-	/**
-	 * Convenience method to directly examine the Sparse matrix on GPU
-	 * 
-	 * @return CSR (compressed sparse row) pointer
-	 */
-	public CSRPointer getSparseMatrixCudaPointer() {
-		return jcudaSparseMatrixPtr;
-	}
-	
-	/**
-	 * Convenience method to directly set the sparse matrix on GPU
-	 * Make sure to call {@link #setDeviceModify(long)} after this to set appropriate state, if you are not sure what you are doing.
-	 * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
-	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
-	 */
-	public synchronized void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) {
-		this.jcudaSparseMatrixPtr = sparseMatrixPtr;
-		this.isInSparseFormat = true;
-		if(jcudaDenseMatrixPtr != null) {
-			cudaFreeHelper(jcudaDenseMatrixPtr);
-			jcudaDenseMatrixPtr = null;
-		}
-	}
-
-	/**
-	 * Convenience method to directly set the dense matrix pointer on GPU
-	 * Make sure to call {@link #setDeviceModify(long)} after this to set appropriate state, if you are not sure what you are doing.
-	 * 
-	 * @param densePtr dense pointer
-	 */
-	public synchronized void setDenseMatrixCudaPointer(Pointer densePtr){
-		this.jcudaDenseMatrixPtr = densePtr;
-		this.isInSparseFormat = false;
-		if(jcudaSparseMatrixPtr != null) {
-			jcudaSparseMatrixPtr.deallocate();
-			jcudaSparseMatrixPtr = null;
-		}
-	}
-	
-	/**
-	 * Converts this JCudaObject from dense to sparse format.
-	 * 
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public void denseToSparse() throws DMLRuntimeException {
-		long t0=0;
-		if (DMLScript.STATISTICS) t0 = System.nanoTime();
-		cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
-		if(cusparseHandle == null)
-			throw new DMLRuntimeException("Expected cusparse to be initialized");
-		int rows = toIntExact(mat.getNumRows());
-		int cols = toIntExact(mat.getNumColumns());
-		
-		if(jcudaDenseMatrixPtr == null || !isAllocated())
-			throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
-
-		convertDensePtrFromRowMajorToColumnMajor();
-		setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(cusparseHandle, rows, cols, jcudaDenseMatrixPtr));
-		// TODO: What if mat.getNnz() is -1 ?
-		numBytes = CSRPointer.estimateSize(mat.getNnz(), rows);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
-	}
-
-	/**
-	 * Transposes a dense matrix on the GPU by calling the cublasDgeam operation
-	 * @param densePtr	Pointer to dense matrix on the GPU
-	 * @param m			rows in output matrix
-	 * @param n			columns in output matrix
-	 * @param lda		rows in input matrix
-	 * @param ldc		columns in output matrix
-	 * @return			transposed matrix
-	 * @throws DMLRuntimeException if operation failed
-	 */
-	public static Pointer transpose(Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
-		Pointer alpha = LibMatrixCUDA.pointerTo(1.0);
-		Pointer beta = LibMatrixCUDA.pointerTo(0.0);
-		Pointer A = densePtr;
-		Pointer C = JCudaObject.allocate(((long)m)*getDoubleSizeOf(n));
-
-		// Transpose the matrix to get a dense matrix
-		JCublas2.cublasDgeam(LibMatrixCUDA.cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc);
-		return C;
-	}
-
-	/**
-	 * Convenience method. Converts Row Major Dense Matrix --> Column Major Dense Matrix
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException {
-		int m = toIntExact(mat.getNumRows());
-		int n = toIntExact(mat.getNumColumns());
-		int lda = n;
-		int ldc = m;
-		if(!isAllocated()) {
-			throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
-		}
-
-		Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
-		cudaFreeHelper(jcudaDenseMatrixPtr);
-		setDenseMatrixCudaPointer(tmp);
-	}
-
-	private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException {
-		int n = toIntExact(mat.getNumRows());
-		int m = toIntExact(mat.getNumColumns());
-		int lda = n;
-	    int ldc = m;
-		if(!isAllocated()) {
-			throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
-		}
-
-		Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
-		cudaFreeHelper(jcudaDenseMatrixPtr);
-		setDenseMatrixCudaPointer(tmp);
-	}
-	
-	/**
-	 * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
-	 * 
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public void sparseToDense() throws DMLRuntimeException {
-		sparseToDense(null);
-	}
-
-	/**
-	 * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
-	 * Also records per instruction invocation of sparseToDense.
-	 * @param instructionName	Name of the instruction for which statistics are recorded in {@link GPUStatistics}
-	 * @throws DMLRuntimeException ?
-	 */
-	public void sparseToDense(String instructionName) throws DMLRuntimeException {
-		long start=0, end=0;
-		if (DMLScript.STATISTICS) start = System.nanoTime();
-		if(jcudaSparseMatrixPtr == null || !isAllocated())
-			throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
-
-		sparseToColumnMajorDense();
-		convertDensePtrFromColMajorToRowMajor();
-		if (DMLScript.STATISTICS) end = System.nanoTime();
-		if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
-	}
-	
-	/**
-	 * More efficient method to convert sparse to dense but returns dense in column major format
-	 * 
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public void sparseToColumnMajorDense() throws DMLRuntimeException {
-		if(jcudaSparseMatrixPtr == null || !isAllocated())
-			throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
-		
-		cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
-		if(cusparseHandle == null)
-			throw new DMLRuntimeException("Expected cusparse to be initialized");
-		int rows = toIntExact(mat.getNumRows());
-		int cols = toIntExact(mat.getNumColumns());
-		setDenseMatrixCudaPointer(jcudaSparseMatrixPtr.toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
-		numBytes = ((long)rows)*getDoubleSizeOf(cols);
-	}
-	
-	/**
-	 * Convenience method to convert a column-major dense matrix on the GPU to a CSR sparse matrix
-	 * Since the allocated matrix is temporary, bookkeeping is not updated.
-	 * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT
-	 * Caller is responsible for deallocating memory on GPU.
-	 * 
-	 * @param cusparseHandle handle to cusparse library
-	 * @param rows number of rows
-	 * @param cols number of columns
-	 * @param densePtr [in] dense matrix pointer on the GPU in column major format
-	 * @return CSR (compressed sparse row) pointer
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static CSRPointer columnMajorDenseToRowMajorSparse(cusparseHandle cusparseHandle, int rows, int cols, Pointer densePtr) throws DMLRuntimeException {
-		cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
-		Pointer nnzPerRowPtr = null;
-		Pointer nnzTotalDevHostPtr = null;
-		
-		ensureFreeSpace(getIntSizeOf(rows + 1));
-		nnzPerRowPtr = allocate(getIntSizeOf(rows));
-		nnzTotalDevHostPtr = allocate(getIntSizeOf(1));
-		
-		// Output is in dense vector format, convert it to CSR
-		cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
-		cudaDeviceSynchronize();
-		int[] nnzC = {-1};
-
-		long t2=0;
-		if (DMLScript.STATISTICS) t2 = System.nanoTime();
-		cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
-		if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
-		
-		if (nnzC[0] == -1){
-			throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector multiply on the GPU");
-		}
-		
-		CSRPointer C = CSRPointer.allocateEmpty(nnzC[0], rows);		
-		cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
-		cudaDeviceSynchronize();
-
-		cudaFreeHelper(nnzPerRowPtr);
-		cudaFreeHelper(nnzTotalDevHostPtr);
-		
-		return C;
-	}
-
-
-	/** Map of free blocks allocated on GPU. Maps size_of_block -> list of free pointers on GPU */
-	static LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<Long, LinkedList<Pointer>>();
-	/** To record size of allocated blocks */
-	static HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<Pointer, Long>();
-
-
-	/**
-	 * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
-	 * @param size size of data (in bytes) to allocate
-	 * @return jcuda pointer
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static Pointer allocate(long size) throws DMLRuntimeException {
-		return allocate(null, size, 1);
-	}
-
-	/**
-	 * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
-	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if you don't want to record
-	 * @param size size of data (in bytes) to allocate
-	 * @return jcuda pointer
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
-		return allocate(instructionName, size, 1);
-	}
-
-	/**
-	 * Allocates temporary space on the device.
-	 * Does not update bookkeeping.
-	 * The caller is responsible for freeing up after usage.
-	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if you don't want to record
-	 * @param size   			Size of data (in bytes) to allocate
-	 * @param statsCount	amount to increment the cudaAllocCount by
-	 * @return jcuda Pointer
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException{
-		long t0=0, t1=0, end=0;
-		synchronized (JCudaContext.syncObj) {
-			Pointer A;
-			if (freeCUDASpaceMap.containsKey(size)) {
-				if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-				LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
-				A = freeList.pop();
-				if (freeList.isEmpty())
-					freeCUDASpaceMap.remove(size);
-				if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
-			} else {
-				if (DMLScript.STATISTICS) t0 = System.nanoTime();
-				ensureFreeSpace(instructionName, size);
-				A = new Pointer();
-				cudaMalloc(A, size);
-				((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(size);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
-				if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
-			}
-			// Set all elements to 0 since newly allocated space will contain garbage
-			if (DMLScript.STATISTICS) t1 = System.nanoTime();
-			cudaMemset(A, 0, size);
-			if (DMLScript.STATISTICS) end = System.nanoTime();
-			if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
-			cudaBlockSizeMap.put(A, size);
-			return A;
-		}
-	}
-
-	/**
-	 * Does lazy cudaFree calls
-	 * @param toFree {@link Pointer} instance to be freed
-	 */
-	public static void cudaFreeHelper(final Pointer toFree) {
-		cudaFreeHelper(null, toFree, false);
-	}
-
-	/**
-	 * does lazy/eager cudaFree calls
-	 * @param toFree {@link Pointer} instance to be freed
-	 * @param eager true if to be done eagerly
-	 */
-	public static void cudaFreeHelper(final Pointer toFree, boolean eager) {
-		cudaFreeHelper(null, toFree, eager);
-	}
-
-	/**
-	 * Does lazy cudaFree calls
-	 * @param instructionName name of the instruction for which to record per instruction free time, null if you do not want to record
-	 * @param toFree {@link Pointer} instance to be freed
-	 */
-	public static void cudaFreeHelper(String instructionName, final Pointer toFree) {
-		cudaFreeHelper(instructionName, toFree, false);
-	}
-
-	/**
-	 * Does lazy or eager cudaFree calls
-	 * @param instructionName name of the instruction for which to record per instruction free time, null if you do not want to record
-	 * @param toFree {@link Pointer} instance to be freed
-	 * @param eager true if to be done eagerly
-	 */
-	public static void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager){
-		long t0 = 0;
-		assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it is trying to free up";
-		long size = cudaBlockSizeMap.get(toFree);
-		if (eager) {
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(-size);
-			cudaFree(toFree);
-			cudaBlockSizeMap.remove(toFree);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
-			if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
-		} else {
-			LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
-			if (freeList == null) {
-				freeList = new LinkedList<Pointer>();
-				freeCUDASpaceMap.put(size, freeList);
-			}
-			freeList.add(toFree);
-		}
-	}
-
-
-	/**
-	 * Copies the double array from GPU memory to host memory and returns it as a string.
-	 * @param A Pointer to memory on device (GPU), assumed to point to a double array
-	 * @param rows rows in matrix A
-     * @param cols columns in matrix A
-	 * @return the debug string
-	 * @throws DMLRuntimeException  if DMLRuntimeException occurs
-	 */
-	public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
-		StringBuffer sb = new StringBuffer();
-        int len = toIntExact(rows * cols);
-		double[] tmp = new double[len];
-		cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
-        int k = 0;
-		for (int i=0; i<rows; i++){
-            for (int j=0; j<cols; j++){
-			   sb.append(tmp[k]).append(' ');
-               k++;
-            }
-            sb.append('\n');
-		}
-		return sb.toString();
-	}
-}
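
The comment block above ("From CuSparse Manual, ...") describes the two-step sizing protocol that step1AllocateRowPointers, step2GatherNNZGeam/Gemm and step3AllocateValNInd implement; the final cusparse[S|D|C|Z]csrgeam call that actually fills matrix C is left to the callers and does not appear in this file. Below is a minimal sketch of that last step for C = alpha*A + beta*B, written against the pre-refactoring JCudaObject.CSRPointer deleted above; the JCusparse.cusparseDcsrgeam argument order is assumed from the cuSPARSE csrgeam documentation, not taken from this commit.

import static jcuda.runtime.JCuda.cudaDeviceSynchronize;

import jcuda.Pointer;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

public class CsrgeamSketch {
	// C = alpha*A + beta*B for two m x n CSR matrices already on the device.
	static CSRPointer addSparse(cusparseHandle handle, CSRPointer A, CSRPointer B,
			int m, int n, double alpha, double beta) throws DMLRuntimeException {
		// Steps 1-3: size C.rowPtr, gather nnz(C), allocate C.val and C.colInd.
		CSRPointer C = CSRPointer.allocateForDgeam(handle, A, B, m, n);
		// Scalars are read from host pointers because step1AllocateRowPointers
		// set CUSPARSE_POINTER_MODE_HOST on the handle.
		Pointer alphaPtr = LibMatrixCUDA.pointerTo(alpha);
		Pointer betaPtr = LibMatrixCUDA.pointerTo(beta);
		// Final step: compute the values and column indices of C.
		// The nnz casts are safe: allocateForDgeam rejects nnz >= Integer.MAX_VALUE.
		JCusparse.cusparseDcsrgeam(handle, m, n,
				alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd,
				betaPtr, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd,
				C.descr, C.val, C.rowPtr, C.colInd);
		cudaDeviceSynchronize();
		return C;
	}
}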