Posted to commits@systemml.apache.org by na...@apache.org on 2017/04/21 23:23:17 UTC

[2/5] incubator-systemml git commit: Refactored GPU{Context, Object} to make it friendlier for parfor

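In short: this patch threads an explicit GPUContext through LibMatrixCUDA (and the
GPU instructions that call into it) in place of the old static, process-wide
cudnnHandle/cublasHandle/cusparseHandle fields, so that each parfor worker thread
can drive its own context and device. Below is a minimal sketch of the new calling
convention, assembled only from signatures visible in the diff; the call site
itself (class and variable names) is illustrative and not part of the commit:

    import jcuda.Pointer;
    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
    import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
    import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

    class Conv2dCallSiteSketch {
        // Before: LibMatrixCUDA.conv2d(instName, image, filter, out, ...) read
        // static handles, and device buffers came from the cast
        // ((JCudaObject) image.getGPUObject()).jcudaDenseMatrixPtr.
        static void run(GPUContext gCtx, String instName, MatrixObject image,
                MatrixObject filter, MatrixObject out,
                int N, int C, int H, int W, int K, int R, int S,
                int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
                throws DMLRuntimeException {
            // After: the worker's own context is passed in explicitly;
            // cuDNN/cuBLAS/cuSPARSE handles are resolved from gCtx internally.
            LibMatrixCUDA.conv2d(gCtx, instName, image, filter, out,
                    N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
            // Device buffers now hang off the per-context GPUObject.
            Pointer dense = image.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        }
    }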
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
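
The same ownership rule covers scratch memory: temporary workspaces are now
allocated and freed through the owning context (gCtx.allocate(...) and
gCtx.cudaFreeHelper(...)) rather than through the removed static JCudaObject
helpers. A hedged sketch of the try/finally pattern used in the convolution paths
below (the wrapper method and the elided cuDNN invocation are illustrative only):

    import jcuda.Pointer;
    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;

    class WorkspaceSketch {
        static void withWorkspace(GPUContext gCtx, String instName, long sizeInBytes)
                throws DMLRuntimeException {
            Pointer workSpace = new Pointer();
            try {
                if (sizeInBytes != 0)
                    workSpace = gCtx.allocate(sizeInBytes); // was JCudaObject.allocate(size)
                // ... perform the cuDNN call here with the handle obtained from gCtx ...
            } finally {
                // free through the same context that allocated the buffer
                if (sizeInBytes != 0)
                    gCtx.cudaFreeHelper(instName, workSpace);
            }
        }
    }
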
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index c363ab1..3c32137 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -19,23 +19,49 @@
 
 package org.apache.sysml.runtime.matrix.data;
 
-import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasFillMode;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcublas.cublasOperation;
-import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnBatchNormMode;
-import jcuda.jcudnn.cudnnConvolutionDescriptor;
-import jcuda.jcudnn.cudnnConvolutionFwdPreference;
-import jcuda.jcudnn.cudnnFilterDescriptor;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcudnn.cudnnPoolingDescriptor;
-import jcuda.jcudnn.cudnnStatus;
-import jcuda.jcudnn.cudnnTensorDescriptor;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
+import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
+import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
+import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -72,10 +98,9 @@ import org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
@@ -86,127 +111,123 @@ import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
-import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
-import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
-import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
-import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.allocate;
-import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.cudaFreeHelper;
+import jcuda.CudaException;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasFillMode;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcudnn.cudnnActivationDescriptor;
+import jcuda.jcudnn.cudnnBatchNormMode;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnConvolutionFwdPreference;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnPoolingDescriptor;
+import jcuda.jcudnn.cudnnStatus;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
 
-//FIXME move could to respective instructions, this is not a block library
+/**
+ * All CUDA kernels and library calls are redirected through this class
+ * @see GPUContext
+ * @see GPUObject
+ */
 public class LibMatrixCUDA {
 
-	// Assume Compute Capability 3.0
+	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
+
+	// Assume Compute Capability 3.0
 	// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
 	// MAX_THREADS is 1024 For compute capability > 3.0
 	private static int _MAX_THREADS = -1;
 	private static int _MAX_BLOCKS  = -1;
 	private static int _WARP_SIZE 	= -1;
 
+	//********************************************************************/
+	//***************************** UTILS ********************************/
+	//********************************************************************/
+
+	/*
+	static GPUContext gCtx() throws DMLRuntimeException {
+		return GPUContext.gCtx;
+	}
+	*/
+
 	/**
 	 * Utility function to get maximum number of threads supported by the active CUDA device.
-	 * This is put into a singleton style method because the GPUContext is not fully initialized when
-	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
-	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @param gCtx a valid {@link GPUContext}
 	 * @return max threads
 	 * @throws DMLRuntimeException if exception occurs
 	 */
-	static int getMaxThreads() throws DMLRuntimeException{
+	static int getMaxThreads(GPUContext gCtx) throws DMLRuntimeException{
 		if (_MAX_THREADS == -1){
-			_MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
+			_MAX_THREADS = gCtx.getMaxThreadsPerBlock();
 		}
 		return _MAX_THREADS;
 	}
 
 	/**
 	 * Utility function to get maximum number of blocks supported by the active CUDA device.
-	 * This is put into a singleton style method because the GPUContext is not fully initialized when
-	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
-	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @param gCtx a valid {@link GPUContext}
 	 * @return max blocks
 	 * @throws DMLRuntimeException if exception occurs
 	 */
-	static int getMaxBlocks() throws DMLRuntimeException{
+	static int getMaxBlocks(GPUContext gCtx) throws DMLRuntimeException{
 		if (_MAX_BLOCKS == -1){
-			_MAX_BLOCKS = JCudaContext.getMaxBlocks();
+			_MAX_BLOCKS = gCtx.getMaxBlocks();
 		}
 		return _MAX_BLOCKS;
 	}
 
 	/**
 	 * Utility function to get the warp size supported by the active CUDA device.
-	 * This is put into a singleton style method because the GPUContext is not fully initialized when
-	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
-	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @param gCtx a valid {@link GPUContext}
 	 * @return warp size
 	 * @throws DMLRuntimeException if exception occurs
 	 */
-	static int getWarpSize() throws DMLRuntimeException {
+	static int getWarpSize(GPUContext gCtx) throws DMLRuntimeException {
 		if (_WARP_SIZE == -1) {
-			_WARP_SIZE = JCudaContext.getWarpSize();
+			_WARP_SIZE = gCtx.getWarpSize();
 		}
 		return _WARP_SIZE;
 	}
 
-	public static boolean isInSparseFormat(MatrixObject mo) {
-		if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
-			return mo.getGPUObject().isInSparseFormat();
+	public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) {
+		if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated())
+			return mo.getGPUObject(gCtx).isSparse();
 		return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
 	}
 
 
+	private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
+		return gCtx.getCusparseHandle();
+	}
+
+	private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
+		return gCtx.getCublasHandle();
+	}
+
+	private static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
+		return gCtx.getCudnnHandle();
+	}
+
+	private static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
+		return gCtx.getKernels();
+	}
+
+
 	//********************************************************************/
-	//***************** DEEP LEARNING Operators **************************/
+	//************************ End of UTILS ******************************/
 	//********************************************************************/
 
+	//********************************************************************/
+	//***************** DEEP LEARNING Operators **************************/
+	//********************************************************************/
 
-	public static cudnnHandle cudnnHandle;
-	public static cublasHandle cublasHandle;
-	public static cusparseHandle cusparseHandle;
-	public static JCudaKernels kernels; // Used to launch custom kernels
 
-	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
 
 	private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 	
@@ -234,7 +255,8 @@ public class LibMatrixCUDA {
 	}
 	
 	/**
-	 * Convenience method to get tensor descriptor from underlying JCudaObject
+	 * Convenience method to get tensor descriptor from underlying GPUObject
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param mat matrix object
 	 * @param N number of images
 	 * @param C number of channels
@@ -243,39 +265,57 @@ public class LibMatrixCUDA {
 	 * @return cudnn tensor descriptor
 	 * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
 	 */
-	private static cudnnTensorDescriptor allocateTensorDescriptor(MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
+	private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
 		if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
 			throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N 
 					+ " || " + mat.getNumColumns() + " != " + (C*H*W));
 		}
-		return ((JCudaObject)mat.getGPUObject()).allocateTensorDescriptor(N, C, H, W);
+		return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
+	}
+
+	/**
+	 * Convenience method to get tensor descriptor
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @return cudnn tensor descriptor
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
+		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(tensorDescriptor);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		return tensorDescriptor;
 	}
 	
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+	 * @param gCtx a valid {@link GPUContext}
 	 * @param image input matrix object
 	 * @param isForCuDNN true if the dense pointer is to be used by a CuDNN kernel
 	 * @return jcuda pointer
 	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
 	 */
-	private static Pointer getDensePointer(MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
+	private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
 		if(isForCuDNN && image.getNumRows()*image.getNumColumns() > numDoublesIn2GB) {
 			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
 		}
-		return getDensePointer(image, instName);
+		return getDensePointer(gCtx, image, instName);
 	}
 	
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+	 * @param gCtx a valid {@link GPUContext}
 	 * @param image input matrix object
 	 * @return jcuda pointer
 	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
 	 */
-	private static Pointer getDensePointer(MatrixObject image, String instName) throws DMLRuntimeException {
-		if(isInSparseFormat(image)) {
-			((JCudaObject)image.getGPUObject()).sparseToDense(instName);
+	private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, String instName) throws DMLRuntimeException {
+		if(isInSparseFormat(gCtx, image)) {
+			image.getGPUObject(gCtx).sparseToDense(instName);
 		}
-		return ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+		return image.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
 	}
 	
 	/**
@@ -289,34 +329,67 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
 
-	public static void conv2dBiasAdd(String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+	public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
 			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
 					throws DMLRuntimeException {
-		conv2d(instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		biasAdd(instName, outputBlock, bias, outputBlock);
+		/*
+		int rows = (int) outputBlock.getNumRows();
+		int cols = (int) outputBlock.getNumColumns();
+		long size = (long) rows * cols * Sizeof.DOUBLE;
+
+		Pointer imagePointer = getDensePointer(image, instName);
+		Pointer biasPointer = getDensePointer(bias, instName);
+		Pointer outputPointer = getDensePointer(outputBlock, instName);
+		Pointer filterPointer = getDensePointer(filter, instName);
+
+		Pointer tmp = allocate(size);
+
+		conv2d(instName, imagePointer, filterPointer, tmp, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+		cudaDeviceSynchronize();
+
+		long k1 = bias.getNumRows();
+		if(bias.getNumColumns() != 1 || cols % k1 != 0) {
+			throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + k1 + " X " + bias.getNumColumns() + "]");
+		}
+		// biasAdd(instName, outputBlock, bias, outputBlock);
+		biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1);
+
+		cudaFreeHelper(tmp);
+		*/
+		LOG.trace("GPU : conv2dBiasAdd, GPUContext=" + gCtx);
+		conv2d(gCtx, instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+		//cudaDeviceSynchronize();
+		biasAdd(gCtx, instName, outputBlock, bias, outputBlock);
 	}
 	
-	public static void conv2d(String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+	public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
 														int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
 					throws DMLRuntimeException {
-		cudnnFilterDescriptor filterDesc = null;
+		Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
+		Pointer filterPointer = getDensePointer(gCtx, filter, true, instName);
+		Pointer dstPointer = getDensePointer(gCtx, outputBlock, true, instName);
+
+		conv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+	}
+
+	public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
+														 int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+					throws DMLRuntimeException {
+		LOG.trace("GPU : conv2d, GPUContext=" + gCtx);
+		cudnnFilterDescriptor filterDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 		Pointer workSpace = null;
 		long sizeInBytes = 0;
 		try {
-			long t1=0, t2=0;
+			long t1 = 0, t2 = 0;
 			// Allocate descriptors
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
-			cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(outputBlock, N, K, P, Q);
+			cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
+			cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
 			filterDesc = allocateFilterDescriptor(K, C, R, S);
 
-			Pointer imagePointer = getDensePointer(image, true, instName);
-			Pointer filterPointer = getDensePointer(filter, true, instName);
-			Pointer dstPointer = getDensePointer(outputBlock, true, instName);
-
-			int padding [] = { pad_h, pad_w };
-			int strides [] = { stride_h, stride_w };
+			int padding[] = {pad_h, pad_w};
+			int strides[] = {stride_h, stride_w};
 			convDesc = allocateConvolutionDescriptor(padding, strides);
 
 			// Select the best algorithm depending on the data and supported CUDA
@@ -324,53 +397,54 @@ public class LibMatrixCUDA {
 			int algo = -1;
 			workSpace = new Pointer();
 
-			if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
+			if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
 				algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-			}
-			else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
-				int [] algos = {
+			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
+				int[] algos = {
 								jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
 								jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
 								jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
 				};
 				// TODO: Look into FFT, Winograd, etc
 				// Also ensure that GPU has enough memory to allocate memory
-				long sizeInBytesArray[] = { 0 };
-				algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
+				long sizeInBytesArray[] = {0};
+				algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
 								CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
-				cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
-				if(sizeInBytesArray[0] != 0)
-					workSpace = allocate(sizeInBytesArray[0]);
+				cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
+				if (sizeInBytesArray[0] != 0)
+					workSpace = gCtx.allocate(sizeInBytesArray[0]);
 				sizeInBytes = sizeInBytesArray[0];
-			}
-			else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
+			} else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
 				throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
-			}
-			else {
+			} else {
 				throw new DMLRuntimeException("Unsupported preference criteria for convolution");
 			}
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionForward(cudnnHandle, one(),
-							srcTensorDesc, imagePointer,
-							filterDesc, filterPointer,
+			int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
+							srcTensorDesc, image,
+							filterDesc, filter,
 							convDesc, algo, workSpace, sizeInBytes, zero(),
-							dstTensorDesc, dstPointer);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+							dstTensorDesc, output);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
+			if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
 			}
-		}
-		finally {
-			long t3=0;
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		} finally {
+			long t3 = 0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-			if(filterDesc != null)
+			if (filterDesc != null)
 				cudnnDestroyFilterDescriptor(filterDesc);
-			if(convDesc != null)
+			if (convDesc != null)
 				cudnnDestroyConvolutionDescriptor(convDesc);
-			if(workSpace != null && sizeInBytes != 0)
-				cudaFreeHelper(instName, workSpace);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+			if (workSpace != null && sizeInBytes != 0)
+				gCtx.cudaFreeHelper(instName, workSpace);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
 		}
 	}
 
@@ -381,7 +455,7 @@ public class LibMatrixCUDA {
 		return convDesc;
 	}
 
-	public static  Pointer pointerTo(double value) {
+	private static Pointer pointerTo(double value) {
 		return Pointer.to(new double[] { value });
 	}
 
@@ -411,22 +485,24 @@ public class LibMatrixCUDA {
 
 	/**
 	 * This method computes the backpropagation errors for previous layer of relu operation
+	 * @param gCtx a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param input input image
 	 * @param dout  next layer error propagation
 	 * @param outputBlock output
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void reluBackward(String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
-		long rows = input.getNumRows();
+	public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
+		LOG.trace("GPU : reluBackward, GPUContext=" + gCtx);
+		long rows = input.getNumRows();
 		long cols = input.getNumColumns();
-		Pointer imagePointer = getDensePointer(input, instName);
-		Pointer doutPointer = getDensePointer(dout, instName);
-		Pointer outputPointer = getDensePointer(outputBlock, instName);
+		Pointer imagePointer = getDensePointer(gCtx, input, instName);
+		Pointer doutPointer = getDensePointer(gCtx, dout, instName);
+		Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
 
 		long t1=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		kernels.launchKernel("relu_backward",
+		getCudaKernels(gCtx).launchKernel("relu_backward",
 						ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
 						imagePointer, doutPointer, outputPointer, (int)rows, (int)cols);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
@@ -438,18 +514,20 @@ public class LibMatrixCUDA {
 	 * ones = matrix(1, rows=1, cols=Hout*Wout)
 	 * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 	 * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param input input image
 	 * @param bias bias
 	 * @param outputBlock output
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void biasMultiply(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
-		if(isInSparseFormat(input)) {
-			((JCudaObject)input.getGPUObject()).sparseToDense(instName);
+	public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
+		LOG.trace("GPU : biasMultiply, GPUContext=" + gCtx);
+		if(isInSparseFormat(gCtx, input)) {
+			input.getGPUObject(gCtx).sparseToDense(instName);
 		}
-		if(isInSparseFormat(bias)) {
-			((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
+		if(isInSparseFormat(gCtx, bias)) {
+			bias.getGPUObject(gCtx).sparseToDense(instName);
 		}
 		long rows = input.getNumRows();
 		long cols = input.getNumColumns();
@@ -458,12 +536,12 @@ public class LibMatrixCUDA {
 		if(bias.getNumColumns() != 1 || cols % K != 0) {
 			throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
 		}
-		Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
-		Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
-		Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+		Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
 		long t1 = 0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		kernels.launchKernel("bias_multiply",
+		getCudaKernels(gCtx).launchKernel("bias_multiply",
 						ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
 						imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
@@ -475,32 +553,52 @@ public class LibMatrixCUDA {
 	 * ones = matrix(1, rows=1, cols=Hout*Wout)
 	 * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 	 * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param input input image
 	 * @param bias bias
 	 * @param outputBlock output
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void biasAdd(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
-		long rows = input.getNumRows();
-		long cols = input.getNumColumns();
-		long K = bias.getNumRows();
-		long PQ = cols / K;
+	public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
+		Pointer imagePointer = getDensePointer(gCtx, input, instName);
+		Pointer biasPointer = getDensePointer(gCtx, bias, instName);
+		Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
+		int rows = (int) input.getNumRows();
+		int cols = (int) input.getNumColumns();
+		int K = (int) bias.getNumRows();
 		if(bias.getNumColumns() != 1 || cols % K != 0) {
 			throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
 		}
-		Pointer imagePointer = getDensePointer(input, instName);
-		Pointer biasPointer = getDensePointer(bias, instName);
-		Pointer outputPointer = getDensePointer(outputBlock, instName);
+		biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
+	}
+
+	/**
+	 * Performs the operation corresponding to the DML script:
+	 * ones = matrix(1, rows=1, cols=Hout*Wout)
+	 * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
+	 * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param image input image
+	 * @param bias bias
+	 * @param output output
+	 * @param rows rows in input image
+	 * @param cols cols in input image
+	 * @param k rows in bias
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
+		LOG.trace("GPU : biasAdd, GPUContext=" + gCtx);
+		int PQ = cols / k;
 		long t1 = 0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		kernels.launchKernel("bias_add",
-						ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
-						imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ);
+		getCudaKernels(gCtx).launchKernel("bias_add",
+						ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
+						image, bias, output, rows, cols, PQ);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
-
 	}
-	
+
 	private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
 		if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
 			throw new DMLRuntimeException("Incorrect dimensions for scale");
@@ -518,7 +616,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * Performs the forward BatchNormalization layer computation for inference
-	 * 
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName name of the instruction
 	 * @param image input image
 	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
@@ -529,10 +627,11 @@ public class LibMatrixCUDA {
 	 * @param epsilon epsilon value used in the batch normalization formula
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	public static void batchNormalizationForwardInference(String instName, MatrixObject image, 
+	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
 			MatrixObject ret, double epsilon) throws DMLRuntimeException {
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationForwardInference, GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 		
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
@@ -540,19 +639,19 @@ public class LibMatrixCUDA {
 		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
 		
 		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
 		
 		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(image, true, instName);
-		Pointer retPtr = getDensePointer(ret, true, instName);
-		Pointer biasPtr = getDensePointer(bias, true, instName);
-		Pointer scalePtr = getDensePointer(scale, true, instName);
-		Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
-		Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
+		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+		Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
+		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
+		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
 		
-		checkStatus(cudnnBatchNormalizationForwardInference(cudnnHandle, mode, one(), zero(),
+		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
 			scaleTensorDesc, scalePtr, biasPtr,
 			runningMeanPtr, runningVarPtr, epsilon));
@@ -560,7 +659,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * Performs the forward BatchNormalization layer computation for training
-	 * 
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName name of the instruction
 	 * @param image input image
 	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
@@ -574,10 +673,11 @@ public class LibMatrixCUDA {
 	 * @param exponentialAverageFactor factor used in the moving average computation
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	public static void batchNormalizationForwardTraining(String instName, MatrixObject image, 
+	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
 			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationForwardTraining, GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 		
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
@@ -585,26 +685,26 @@ public class LibMatrixCUDA {
 		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
 		
 		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
 		
 		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(image, true, instName);
-		Pointer retPtr = getDensePointer(ret, true, instName);
-		Pointer biasPtr = getDensePointer(bias, true, instName);
-		Pointer scalePtr = getDensePointer(scale, true, instName);
-		Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
-		Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
+		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+		Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
+		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
+		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
 		
 		// To allow for copy-on-write
-		Pointer retRunningMeanPtr = getDensePointer(retRunningMean, true, instName);
-		Pointer retRunningVarPtr = getDensePointer(retRunningVar, true, instName);
+		Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
+		Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
 		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
 		
 		// ignoring resultSaveMean and resultSaveVariance as it requires state management
-		checkStatus(cudnnBatchNormalizationForwardTraining(cudnnHandle, mode, one(), zero(),
+		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
 			scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
 			retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
@@ -612,7 +712,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * Convenient utility for batch normalization that returns a NCHW descriptor
-	 * 
+	 * @param gCtx a valid {@link GPUContext}
 	 * @param N number of images
 	 * @param C number of channels
 	 * @param CHW channels*height*width
@@ -621,7 +721,7 @@ public class LibMatrixCUDA {
 	 * @return one of the NCHW descriptor
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	private static cudnnTensorDescriptor allocateNCHWDescriptors(int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
+	private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
 		cudnnTensorDescriptor ret  = null; // Return any one
 		if(CHW > ((long)Integer.MAX_VALUE)*C) {
 			throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
@@ -629,9 +729,9 @@ public class LibMatrixCUDA {
 		cudnnTensorDescriptor knownNCHWdescriptor = null;
 		int H = -1; int W = -1;
 		for(int i = 0; i < input.length; i++) {
-			knownNCHWdescriptor = ((JCudaObject)input[i].getGPUObject()).getTensorDescriptor();
+			knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
 			if(knownNCHWdescriptor != null) {
-				int [] shape = ((JCudaObject)input[i].getGPUObject()).getTensorShape();
+				int [] shape = input[i].getGPUObject(gCtx).getTensorShape();
 				if(shape[0] != N || shape[1] != C) {
 					throw new DMLRuntimeException("Incorrect N and C:" + shape[0]  + " != " + N + " || " + shape[1]  + " != " +  C);
 				}
@@ -643,10 +743,10 @@ public class LibMatrixCUDA {
 		if(knownNCHWdescriptor != null) {
 			// We precisely know N, C, H, W
 			for(int i = 0; i < input.length; i++) {
-				ret = allocateTensorDescriptor(input[i], N, C, H, W);
+				ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
 			}
 			for(int i = 0; i < output.length; i++) {
-				ret = allocateTensorDescriptor(output[i], N, C, H, W);
+				ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
 			}
 		}
 		else {
@@ -667,7 +767,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * This method computes the backpropagation errors for image, scale and bias of batch normalization layer
-	 * 
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName name of the instruction
 	 * @param image input image
 	 * @param dout input errors of shape C, H, W
@@ -678,30 +778,31 @@ public class LibMatrixCUDA {
 	 * @param epsilon epsilon value used in the batch normalization formula
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	public static void batchNormalizationBackward(String instName, MatrixObject image, MatrixObject dout, 
+	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
 			MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
 			double epsilon) throws DMLRuntimeException {
-		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+		LOG.trace("GPU : batchNormalizationBackward, GPUContext=" + gCtx);
+		int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 		
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
 		long CHW = image.getNumColumns();
 		
 		// Allocate descriptors
-		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image, dout},  new MatrixObject[] {ret});
-		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
 		
 		// Get underlying dense pointer
-		Pointer imagePtr = getDensePointer(image, true, instName);
-		Pointer doutPtr = getDensePointer(dout, true, instName);
-		Pointer scalePtr = getDensePointer(scale, true, instName);
-		Pointer retPtr = getDensePointer(ret, true, instName);
-		Pointer retScalePtr = getDensePointer(retScale, true, instName);
-		Pointer retBiasPtr = getDensePointer(retBias, true, instName);
+		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+		Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
+		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+		Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName);
+		Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName);
 		
 		// ignoring resultSaveMean and resultSaveVariance as it requires state management
-		checkStatus(cudnnBatchNormalizationBackward(cudnnHandle, mode,  one(), zero(), one(), zero(),
+		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), zero(), one(), zero(),
 				nCHWDescriptor,  imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
 				scaleTensorDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer()));
 	}
@@ -709,7 +810,7 @@ public class LibMatrixCUDA {
 
 	/**
 	 * This method computes the backpropagation errors for filter of convolution operation
-	 * 
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image input image
 	 * @param dout errors from next layer
@@ -729,56 +830,60 @@ public class LibMatrixCUDA {
 	 * @param Q output activation width
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void conv2dBackwardFilter(String instName, MatrixObject image, MatrixObject dout,
+	public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
 																					MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 																					int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 																					int Q) throws DMLRuntimeException {
-		cudnnFilterDescriptor dwDesc = null;
+		LOG.trace("GPU : conv2dBackwardFilter, GPUContext=" + gCtx);
+		cudnnFilterDescriptor dwDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 
 		Pointer workSpace = null;
 		long sizeInBytes = 0;
 		try {
 
-			long t1=0, t2=0;
+			long t1 = 0, t2 = 0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 			// Allocate descriptors
-			cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
-			cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(dout, N, K, P, Q);
+			cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+			cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
 			dwDesc = allocateFilterDescriptor(K, C, R, S);
 
 			// Allocate data
-			Pointer imagePointer = getDensePointer(image, true, instName);
-			Pointer doutPointer = getDensePointer(dout, true, instName);
-			Pointer dwPointer = getDensePointer(outputBlock, true, instName);
-			int padding [] = { pad_h, pad_w };
-			int strides [] = { stride_h, stride_w };
+			Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
+			Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
+			Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
+			int padding[] = {pad_h, pad_w};
+			int strides[] = {stride_h, stride_w};
 			convDesc = allocateConvolutionDescriptor(padding, strides);
-			long sizeInBytesArray[] = { 0 };
+			long sizeInBytesArray[] = {0};
 
 			// TODO: Select the best algorithm depending on the data and supported CUDA
 			int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
 
 			workSpace = new Pointer();
-			cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
+			cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
 							xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
-			if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionBackwardFilter(cudnnHandle, one(), xTensorDesc, imagePointer,
+			int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
 							doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
-			if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
 
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+			if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
 				throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
 			}
-		}
-		finally {
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2dBackwardFilter in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+		} finally {
 			long t3=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 
 			if(workSpace != null && sizeInBytes != 0)
-				cudaFreeHelper(instName, workSpace);
+				gCtx.cudaFreeHelper(instName, workSpace);
 			if(dwDesc != null)
 				cudnnDestroyFilterDescriptor(dwDesc);
 
@@ -792,6 +897,7 @@ public class LibMatrixCUDA {
 
 	/**
 	 * This method computes the backpropagation errors for previous layer of convolution operation
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param filter filter used in conv2d
 	 * @param dout errors from next layer
@@ -811,11 +917,12 @@ public class LibMatrixCUDA {
 	 * @param Q output activation width
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void conv2dBackwardData(String instName, MatrixObject filter, MatrixObject dout,
+	public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
 																				MatrixObject output, int N, int C, int H, int W, int K, int R,
 																				int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 																				int Q) throws DMLRuntimeException {
-		cudnnFilterDescriptor wDesc = null;
+		LOG.trace("GPU : conv2dBackwardData, GPUContext=" + gCtx);
+		cudnnFilterDescriptor wDesc = null;
 		cudnnConvolutionDescriptor convDesc = null;
 
 		Pointer workSpace = null;
@@ -825,13 +932,13 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 			// Allocate descriptors
 			wDesc = allocateFilterDescriptor(K, C, R, S);
-			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, K, P, Q);
-			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(output, N, C, H, W);
+			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
+			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
 
 			// Allocate data
-			Pointer w = getDensePointer(filter, true, instName);
-			Pointer dy = getDensePointer(dout, true, instName);
-			Pointer dx = getDensePointer(output, true, instName);
+			Pointer w = getDensePointer(gCtx, filter, true, instName);
+			Pointer dy = getDensePointer(gCtx, dout, true, instName);
+			Pointer dx = getDensePointer(gCtx, output, true, instName);
 			
 			int padding [] = { pad_h, pad_w };
 			int strides [] = { stride_h, stride_w };
@@ -841,25 +948,27 @@ public class LibMatrixCUDA {
 			// TODO: Select the best algorithm depending on the data and supported CUDA
 			int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
 			workSpace = new Pointer();
-			cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
+			cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
 							wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnConvolutionBackwardData(cudnnHandle, one(), wDesc, w,
+			int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
 							dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
 
 			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
 				throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
 			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in conv2dBackwardData in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
 		}
 		finally {
 			long t3=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 
 			if(workSpace != null && sizeInBytes != 0)
-				cudaFreeHelper(instName, workSpace);
+				gCtx.cudaFreeHelper(instName, workSpace);
 			if(wDesc != null)
 				cudnnDestroyFilterDescriptor(wDesc);
 			if(convDesc != null)
@@ -871,6 +980,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image image as matrix object
 	 * @param outputBlock output matrix
@@ -889,37 +999,40 @@ public class LibMatrixCUDA {
 	 * @param Q				(W - S + 1 + 2*pad_w)/stride_w
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void maxpooling(String instName, MatrixObject image,
+	public static void maxpooling(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
-		Pointer x = getDensePointer(image, true, instName);
-		cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
-		performMaxpooling(instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+		Pointer x = getDensePointer(gCtx, image, true, instName);
+		cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+		performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 	}
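For reference, the output shape a maxpooling call produces follows the P/Q formulas given in the javadoc above. A standalone sketch of that arithmetic; class and method names are illustrative only:

    public class PoolShape {
        // P = (H - R + 1 + 2*pad_h)/stride_h, per the javadoc above (integer division)
        static int poolOutDim(int in, int window, int pad, int stride) {
            return (in - window + 1 + 2 * pad) / stride;
        }
        public static void main(String[] args) {
            int P = poolOutDim(28, 2, 0, 2); // 28x28 input, 2x2 window, stride 2
            int Q = poolOutDim(28, 2, 0, 2);
            System.out.println("P=" + P + ", Q=" + Q); // P=13, Q=13
        }
    }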
 	
-	public static void performMaxpooling(String instName, Pointer x, cudnnTensorDescriptor xDesc,
+	public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
-		Pointer y = getDensePointer(outputBlock, true, instName);
+		LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+		Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
 		cudnnPoolingDescriptor poolingDesc = null;
 
 		try {
 			long t1=0,t2=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 			// Allocate descriptors
-			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(outputBlock, N, C, P, Q);
+			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, P, Q);
 			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
 			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
 				throw new DMLRuntimeException("Could not execute cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
 			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in performMaxpooling in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
 		}
 		finally {
 			long t3=0;
@@ -932,6 +1045,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image image as matrix object
 	 * @param outputBlock output matrix
@@ -950,22 +1064,24 @@ public class LibMatrixCUDA {
 	 * @param Q				(W - S + 1 + 2*pad_w)/stride_w
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void reluMaxpooling(String instName, MatrixObject image,
+	public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
-		cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
+		LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
+		cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
 		long size  = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
-		Pointer tmp = allocate(size);
-		performCuDNNReLU(instName, image, tmp, srcTensorDesc);
-		cudaDeviceSynchronize(); // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync
-		performMaxpooling(instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		cudaFreeHelper(tmp);
+		Pointer tmp = gCtx.allocate(size);
+		performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
+		//cudaDeviceSynchronize(); // The cudnn call in performCuDNNReLU seemed to run asynchronously; this would add the necessary sync
+		performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+		gCtx.cudaFreeHelper(tmp);
 	}
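reluMaxpooling fuses y = maxpool(max(x, 0)) through a temporary buffer. A CPU reference of the math the fused GPU path computes, on a tiny 4x4 input with a 2x2 window and stride 2; purely illustrative, not part of LibMatrixCUDA:

    import java.util.Arrays;

    public class ReluMaxpoolRef {
        public static void main(String[] args) {
            double[][] x = {
                { 1, -2,  3, -4},
                {-5,  6, -7,  8},
                { 9, -1,  2, -3},
                {-4,  5, -6,  7}
            };
            int H = 4, W = 4, R = 2, S = 2, stride = 2;
            int P = H / stride, Q = W / stride;
            double[][] y = new double[P][Q];
            for (int p = 0; p < P; p++)
                for (int q = 0; q < Q; q++) {
                    double m = 0; // ReLU floors every value at 0
                    for (int i = 0; i < R; i++)
                        for (int j = 0; j < S; j++)
                            m = Math.max(m, Math.max(0, x[p * stride + i][q * stride + j]));
                    y[p][q] = m;
                }
            System.out.println(Arrays.deepToString(y)); // [[6.0, 8.0], [9.0, 7.0]]
        }
    }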
 
 	/**
 	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 	 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image image as matrix object
 	 * @param dout			delta matrix, output of previous layer
@@ -985,38 +1101,39 @@ public class LibMatrixCUDA {
 	 * @param Q				(W - S + 1 + 2*pad_w)/stride_w
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void maxpoolingBackward(String instName, MatrixObject image, MatrixObject dout,
+	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
 																				MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 																				int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 																				int Q) throws DMLRuntimeException {
-		Pointer y = null;
+		LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+		Pointer y = null;
 		cudnnPoolingDescriptor poolingDesc = null;
 
 		try {
 			long t1=0, t2=0, t3=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 			// Allocate descriptors
-			cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
-			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(dout, N, C, P, Q);
-			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(outputBlock, N, C, H, W);
-			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, C, P, Q);
+			cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+			cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
+			cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
+			cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
 
 			poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
 
 			// Calling PoolForward first, y is one of the inputs for poolBackward
 			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
 			long numBytes = ((long)N)*C*P*Q*Sizeof.DOUBLE;
-			y = allocate(numBytes);
+			y = gCtx.allocate(numBytes);
 
 			// Allocate data
-			Pointer x = getDensePointer(image, true, instName);
-			Pointer dx = getDensePointer(outputBlock, true, instName);
-			Pointer dy = getDensePointer(dout, true, instName);
+			Pointer x = getDensePointer(gCtx, image, true, instName);
+			Pointer dx = getDensePointer(gCtx, outputBlock, true, instName);
+			Pointer dy = getDensePointer(gCtx, dout, true, instName);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+			int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
 			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -1024,19 +1141,21 @@ public class LibMatrixCUDA {
 			}
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-			status = cudnnPoolingBackward(cudnnHandle, poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
+			status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
 
 			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
 				throw new DMLRuntimeException("Could not execute cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
 			}
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in maxpoolingBackward in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
 		}
 		finally {
 			long t4=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
 
 			if(y != null)
-				cudaFreeHelper(instName, y);
+				gCtx.cudaFreeHelper(instName, y);
 			if(poolingDesc != null)
 				cudnnDestroyPoolingDescriptor(poolingDesc);
 
@@ -1044,21 +1163,24 @@ public class LibMatrixCUDA {
 		}
 	}
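Because maxpoolingBackward re-runs the forward pass to obtain y, it allocates an N*C*P*Q*Sizeof.DOUBLE scratch buffer, which is exactly what the TODO above wants to eliminate. A quick illustration of that cost with hypothetical sizes:

    public class PoolBackwardTempCost {
        public static void main(String[] args) {
            long N = 32, C = 3, P = 112, Q = 112;   // hypothetical batch and shape
            long numBytes = N * C * P * Q * 8;      // Sizeof.DOUBLE == 8
            System.out.println(numBytes + " bytes (~" + (numBytes >> 20) + " MiB)"); // ~9 MiB
        }
    }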
 	
-	private static void performCuDNNReLU(String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
+	private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
 		long t0=0;
 		try {
-			cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
+			LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
+			cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
 			
-			Pointer srcData = getDensePointer(in, true, instName);
+			Pointer srcData = getDensePointer(gCtx, in, true, instName);
 			cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
 			cudnnCreateActivationDescriptor(activationDescriptor);
 			double dummy = -1;
 			cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			cudnnActivationForward(cudnnHandle, activationDescriptor,
+			cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
 							one(), srcTensorDesc, srcData,
 							zero(), dstTensorDesc, dstData);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
+		} catch (CudaException e) {
+			throw new DMLRuntimeException("Error in performCuDNNReLU in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
 		}
 		finally {
 			long t1=0;
@@ -1071,30 +1193,34 @@ public class LibMatrixCUDA {
 	/**
 	 * Performs the relu operation on the GPU.
 	 * @param ec currently active {@link ExecutionContext}
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param in input matrix
 	 * @param outputName	name of the output matrix
 	 * @throws DMLRuntimeException	if an error occurs
 	 */
-	public static void relu(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
+	public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
+		if (ec.getGPUContext() != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same as the one used to run this LibMatrixCUDA function");
 		long N = in.getNumRows();
 		long CHW = in.getNumColumns();
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);	// Allocated the dense output matrix
 		long t0=0;
-		cudnnTensorDescriptor srcTensorDesc = ((JCudaObject)in.getGPUObject()).getTensorDescriptor();
+		cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
 		if(N*CHW >= numDoublesIn2GB ||  srcTensorDesc == null) {
-			// Invokes relu(double* A,  double* ret, int rlen, int clen)
+			LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
+			// Invokes relu(double* A,  double* ret, int rlen, int clen)
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			Pointer dstData = getDensePointer(output, instName);
-			Pointer srcData = getDensePointer(in, instName); // TODO: FIXME: Add sparse kernel support for relu
-			kernels.launchKernel("relu",
+			Pointer dstData = getDensePointer(gCtx, output, instName);
+			Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+			getCudaKernels(gCtx).launchKernel("relu",
 							ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int)CHW),
 							srcData, dstData, (int)N, (int) CHW);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
 		}
 		else {
-			performCuDNNReLU(instName, in, getDensePointer(output, true, instName), srcTensorDesc);
+			performCuDNNReLU(gCtx, instName, in, getDensePointer(gCtx, output, true, instName), srcTensorDesc);
 		}
 	}
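relu picks between the hand-written kernel and cudnnActivationForward based on tensor size and descriptor availability. A standalone sketch of that dispatch rule, assuming numDoublesIn2GB is the number of doubles that fit in 2GB; the actual constant is defined elsewhere in LibMatrixCUDA:

    public class ReluDispatch {
        static final long NUM_DOUBLES_IN_2GB = (1L << 31) / 8; // 268,435,456 (assumed)
        static boolean useCustomKernel(long n, long chw, boolean hasTensorDesc) {
            // Large tensors, or ones without a cached cuDNN tensor descriptor,
            // go to the hand-written "relu" kernel; the rest use cuDNN.
            return n * chw >= NUM_DOUBLES_IN_2GB || !hasTensorDesc;
        }
        public static void main(String[] args) {
            System.out.println(useCustomKernel(1024, 3 * 224 * 224, true));  // false: cuDNN path
            System.out.println(useCustomKernel(20000, 3 * 224 * 224, true)); // true: custom kernel
        }
    }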
 
@@ -1114,17 +1240,21 @@ public class LibMatrixCUDA {
 	 * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...)
 	 *
 	 * @param ec execution context
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param left input matrix; in a tsmm expression like A %*% A' or A' %*% A, only whether this matrix is transposed matters, hence the name 'left'
 	 * @param outputName output matrix name
 	 * @param isLeftTransposed if true, left transposed
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void matmultTSMM(ExecutionContext ec, String instName, MatrixObject left, String outputName,
+	public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName,
 																 boolean isLeftTransposed) throws DMLRuntimeException {
-		if(isInSparseFormat(left)) {
+		LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
+		if (ec.getGPUContext() != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same as the one used to run this LibMatrixCUDA function");
+		if(isInSparseFormat(gCtx, left)) {
 			// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
-			matmult(ec, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
+			matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
 			return;
 		}
 
@@ -1145,22 +1275,22 @@ public class LibMatrixCUDA {
 		int lda = (int) (isLeftTransposed ? m : k);
 		int ldc = m;
 
-		if(!left.getGPUObject().isAllocated())
-			throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated());
-		if(!output.getGPUObject().isAllocated())
-			throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated());
+		if(!left.getGPUObject(gCtx).isAllocated())
+			throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
+		if(!output.getGPUObject(gCtx).isAllocated())
+			throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
 
-		Pointer A = getDensePointer(left, instName);
-		Pointer C = getDensePointer(output, instName);
+		Pointer A = getDensePointer(gCtx, left, instName);
+		Pointer C = getDensePointer(gCtx, output, instName);
 
 		long t0=0, t1=0;
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCublas2.cublasDsyrk(cublasHandle, cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
+		JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		copyUpperToLowerTriangle(instName, output);
+		copyUpperToLowerTriangle(gCtx, instName, output);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1);
 	}
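matmultTSMM can get away with a single cublasDsyrk call because the tsmm result is symmetric: (X %*% t(X))[i][j] equals (X %*% t(X))[j][i], so only one triangle needs computing before copyUpperToLowerTriangle mirrors it. A minimal CPU check of that property:

    public class TsmmSymmetry {
        public static void main(String[] args) {
            double[][] x = {{1, 2, 3}, {4, 5, 6}}; // 2x3 input
            int m = 2, k = 3;
            double[][] c = new double[m][m];
            for (int i = 0; i < m; i++)
                for (int j = 0; j < m; j++)
                    for (int l = 0; l < k; l++)
                        c[i][j] += x[i][l] * x[j][l]; // (X %*% t(X))[i][j]
            System.out.println(c[0][1] == c[1][0]);   // true: symmetric
        }
    }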
 
@@ -1168,22 +1298,23 @@ public class LibMatrixCUDA {
 	 * Used for all versions of TSMM where the result is known to be symmetric.
 	 * Hence, we compute only the upper triangular matrix and copy this partial
 	 * result down to lower triangular matrix once.
-	 *
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName instruction name
 	 * @param ret upper triangular matrix
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	private static void copyUpperToLowerTriangle(String instName, MatrixObject ret) throws DMLRuntimeException {
-		if(isInSparseFormat(ret)) {
+	private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
+		LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
+		if(isInSparseFormat(gCtx, ret)) {
 			throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
 		}
 		if(ret.getNumRows() != ret.getNumColumns()) {
 			throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
 		}
 		int dim = (int) ret.getNumRows();
-		kernels.launchKernel("copy_u2l_dense",
+		getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
 						ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
-						getDensePointer(ret, instName), dim, dim*dim);
+						getDensePointer(gCtx, ret, instName), dim, dim*dim);
 	}
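A CPU reference for what the copy_u2l_dense kernel launched above does: mirror the upper triangle into the lower triangle of a flat dim x dim buffer. The real kernel runs one thread per element; this loop is only a sketch of the semantics:

    public class CopyU2L {
        static void copyUpperToLower(double[] a, int dim) {
            for (int i = 0; i < dim; i++)
                for (int j = i + 1; j < dim; j++)
                    a[j * dim + i] = a[i * dim + j]; // lower[j][i] = upper[i][j]
        }
        public static void main(String[] args) {
            double[] a = {1, 2, 3,
                          0, 4, 5,
                          0, 0, 6}; // upper triangle only
            copyUpperToLower(a, 3);
            System.out.println(java.util.Arrays.toString(a));
            // [1.0, 2.0, 3.0, 2.0, 4.0, 5.0, 3.0, 5.0, 6.0]
        }
    }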
 
 
@@ -1201,40 +1332,43 @@ public class LibMatrixCUDA {
 	 * Examines sparsity and shapes and routes call to appropriate method
 	 * from cuBLAS or cuSparse
 	 * C = op(A) x op(B)
-	 * @param ec									Current {@link ExecutionContext} instance
-	 * @param instName name of the invoking instruction to record{@link Statistics}.
-	 * @param left1								Matrix A
-	 * @param right1							Matrix B
-	 * @param outputName					Name of the output matrix C (in code generated after LOP layer)
-	 * @param isLeftTransposed1		op for A, transposed or not
-	 * @param isRightTransposed1	op for B, tranposed or not
+	 * @param ec                    Current {@link ExecutionContext} instance
+	 * @param gCtx                  a valid {@link GPUContext}
+	 * @param instName              name of the invoking instruction to record {@link Statistics}.
+	 * @param left                  Matrix A
+	 * @param right                 Matrix B
+	 * @param outputName            Name of the output matrix C (in code generated after LOP layer)
+	 * @param isLeftTransposed      op for A, transposed or not
+	 * @param isRightTransposed     op for B, transposed or not
 	 * @return	output of matrix multiply
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static MatrixObject matmult(ExecutionContext ec, String instName, MatrixObject left1, MatrixObject right1, String outputName,
-																		 boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
-
-		if(!left1.getGPUObject().isAllocated() || !right1.getGPUObject().isAllocated())
-			throw new DMLRuntimeException("One of input is not allocated:" + left1.getGPUObject().isAllocated() + " " + right1.getGPUObject().isAllocated());
+	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
+																		 boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+		if (ec.getGPUContext() != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same as the one used to run this LibMatrixCUDA function");
+		LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
+		if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated())
+			throw new DMLRuntimeException("One of the inputs is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
 
-		boolean bothDense = !left1.getGPUObject().isInSparseFormat() && !right1.getGPUObject().isInSparseFormat();
-		boolean bothSparse = left1.getGPUObject().isInSparseFormat() && right1.getGPUObject().isInSparseFormat();
+		boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse();
+		boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
 
 		MatrixObject output = ec.getMatrixObject(outputName);
 
 		if (bothDense) {		// Dense C = Dense A * Dense B
 			// For both dense, do cuBLAS
 			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);	// Allocated the dense output matrix
-			denseDenseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+			denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
 		}
 		else if (bothSparse){	// Sparse C = Sparse A * Sparse B
 			ec.allocateGPUMatrixObject(outputName);
-			bothSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+			bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
 		}
 		else {	// Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
 			// Convert the dense to sparse and use the cusparseDcsrgemm routine
 			ec.allocateGPUMatrixObject(outputName);
-			eitherSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+			eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
 		}
 
 		return output;
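The three branches above route purely on operand sparsity. The same decision, as a standalone sketch; the enum and names are illustrative, not part of LibMatrixCUDA:

    public class MatmultRouting {
        enum Path { DENSE_DENSE_CUBLAS, SPARSE_SPARSE_CUSPARSE, MIXED_CUSPARSE }
        static Path route(boolean leftSparse, boolean rightSparse) {
            if (!leftSparse && !rightSparse) return Path.DENSE_DENSE_CUBLAS;
            if (leftSparse && rightSparse)   return Path.SPARSE_SPARSE_CUSPARSE;
            return Path.MIXED_CUSPARSE; // one side is converted as needed
        }
        public static void main(String[] args) {
            System.out.println(route(false, false)); // DENSE_DENSE_CUBLAS
            System.out.println(route(true, false));  // MIXED_CUSPARSE
        }
    }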
@@ -1243,20 +1377,18 @@ public class LibMatrixCUDA {
 	/**
 	 * One of the matrices is sparse, the other dense
 	 * C = op(A) x op(B)
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param output				allocated output object for C on host to which GPU output will be attached
-	 * @param left					Matrix A on host
-	 * @param right					Matrix B on host
-	 * @param isLeftTransposed		op for A, tranposed or not
-	 * @param isRightTransposed		op for B, transposed or not
+	 * @param gCtx              a valid {@link GPUContext}
+	 * @param instName          the invoking instruction's name for record {@link Statistics}.
+	 * @param output            allocated output object for C on host to which GPU output will be attached
+	 * @param left              Matrix A on host
+	 * @param right             Matrix B on host
+	 * @param isLeftTransposed  op for A, transposed or not
+	 * @param isRightTransposed op for B, transposed or not
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	protected static void eitherSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
+	private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
 																						boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
 
-		int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-		int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-
 		int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
 		int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
 		int k = (int) (isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
@@ -1268,12 +1400,12 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("Incorrect dimensions");
 
 
-		if (left.getGPUObject().isInSparseFormat()) {
+		if (left.getGPUObject(gCtx).isSparse()) {
 			// Left sparse, right dense
-			sparseDenseMatmult(instName, output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+			sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
 		} else {
 			// Left dense, right sparse
-			denseSparseMatmult(instName, output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+			denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
 		}
 	}
 
@@ -1281,73 +1413,72 @@ public class LibMatrixCUDA {
 	 * C = op(A) * op(B) where A is dense and B is sparse
 	 * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
 	 * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param output ?
-	 * @param right ?
-	 * @param left ?
-	 * @param isLeftTransposed ?
-	 * @param isRightTransposed ?
-	 * @param transA ?
-	 * @param transB ?
+	 * @param left {@link MatrixObject} of A
+	 * @param right {@link MatrixObject} of B
+	 * @param output {@link MatrixObject} of the output matrix C
+	 * @param isLeftTransposed whether matrix A needs to be transposed
+	 * @param isRightTransposed whether matrix B needs to be transposed
 	 * @param m ?
 	 * @param n ?
 	 * @param k ?
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	protected static void denseSparseMatmult(String instName, MatrixObject output, MatrixObject right, MatrixObject left,
-																					 boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
+	private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
+											boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
 					throws DMLRuntimeException {
 		// right sparse, left dense
-		CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
-		Pointer ADense = getDensePointer(left, instName);
+		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+		Pointer ADense = getDensePointer(gCtx, left, instName);
 		if (B.isUltraSparse(k, n)){
-			LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Sparse-Sparse)");
-			// Convert left to CSR and do cuSparse matmul
+			LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
+
+			// Convert left to CSR and do cuSparse matmul
 			int rowsA = (int)left.getNumRows();
 			int colsA = (int)left.getNumColumns();
 
-
 			long t0=0,t1=0, t2=0;
 			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer AT = JCudaObject.transpose(ADense, rowsA, colsA, colsA, rowsA);
+			Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			CSRPointer A = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT);
+			CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
 
 			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
 			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
-			sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
+			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
 			A.deallocate();
-			cudaFreeHelper(AT);
+			gCtx.cudaFreeHelper(AT);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
 		} else {
-			LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Dense-Dense)");
+			LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
 			// Convert right to dense and do a cuBlas matmul
 			// BDenseTransposed is a column major matrix
 			// Note the arguments to denseDenseMatmult to accommodate for this.
 			long t0=0, t1=0;
 			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)right.getNumRows(), (int)right.getNumColumns());
+			Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
 			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
 			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			boolean allocated = output.getGPUObject().acquireDeviceModifyDense();	// To allocate the dense matrix
+			boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
 			if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-			Pointer C = getDensePointer(output, instName);
-			denseDenseMatmult(instName, C,
+			Pointer C = getDensePointer(gCtx, output, instName);
+			denseDenseMatmult(gCtx, instName, C,
 							(int) left.getNumRows(), (int) left.getNumColumns(),
 							(int) right.getNumColumns(), (int) right.getNumRows(),
 							isLeftTransposed, !isRightTransposed,
 							ADense, BDenseTransposed);
 
-			cudaFreeHelper(instName, BDenseTransposed);
+			gCtx.cudaFreeHelper(instName, BDenseTransposed);
 		}
 	}
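The isUltraSparse check above exists because the fallback densifies B at a cost of k*n doubles, which is only acceptable for modest shapes; for an ultrasparse B it is far cheaper to convert the dense A to CSR instead. A quick illustration with hypothetical dimensions:

    public class DensifyCost {
        public static void main(String[] args) {
            long k = 100_000, n = 100_000;  // hypothetical dimensions of B
            long bytes = k * n * 8;         // dense column-major buffer of doubles
            System.out.println(bytes / (1L << 30) + " GiB"); // ~74 GiB: convert A instead
        }
    }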
 
@@ -1355,81 +1486,79 @@ public class LibMatrixCUDA {
 	 * C = op(A) * op(B) where A is sparse and B is dense
 	 * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
 	 * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
+	 * @param gCtx   a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param output ?
-	 * @param left ?
-	 * @param right ?
-	 * @param isLeftTransposed ?
-	 * @param isRightTransposed ?
-	 * @param transA ?
-	 * @param transB ?
+	 * @param output the output matrix object
+	 * @param left matrix A
+	 * @param right matrix B
+	 * @param isLeftTransposed if A needs to be transposed
+	 * @param isRightTransposed if B needs to be transposed
 	 * @param m ?
 	 * @param n ?
 	 * @param k ?
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	protected static void sparseDenseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-																					 boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
+	private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
+																					 boolean isLeftTransposed, boole

<TRUNCATED>