You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/04/21 23:23:19 UTC

[4/5] incubator-systemml git commit: Refactored GPU{Contex, Object} to make it friendlier for parfor

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index d2309b0..708f291 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,65 +18,584 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.LRUCacheMap;
 
-//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
 
-public abstract class GPUContext {
+/**
+ * Represents a context per GPU accessible through the same JVM
+ * Each context holds cublas, cusparse, cudnn... handles which are separate for each GPU
+ */
+public class GPUContext {
 
-	protected static GPUContext currContext;
-	public static volatile Boolean isGPUContextCreated = false;
+	protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName());
 
-	protected GPUContext() {}
+  /** Eviction policies for {@link GPUContext#evict(long)} */
+	public enum EvictionPolicy {
+		LRU, LFU, MIN_EVICT
+	}
 
-	/**
-	 * Gets device memory available for SystemML operations
-	 * 
-	 * @return available memory
-	 */
-	public abstract long getAvailableMemory();
+	/** currently employed eviction policy */
+	public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+
+	/** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */
+	private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>();
+
+	/** To record size of allocated blocks */
+	private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+
+  /** active device assigned to this GPUContext instance */
+  private final int deviceNum;
+
+  /** list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
+   * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
+   * If a {@link GPUObject} has more than one lock on it, it cannot be freed
+   * If it has zero locks on it, it can be freed, but it is preferrable to keep it around
+   * so that an extraneous host to dev transfer can be avoided */
+  private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+
+  /** cudnnHandle specific to the active GPU for this GPUContext */
+  private cudnnHandle cudnnHandle;
+
+  /** cublasHandle specific to the active GPU for this GPUContext */
+  private cublasHandle cublasHandle;
+
+  /** cusparseHandle specific to the active GPU for this GPUContext */
+  private cusparseHandle cusparseHandle;
+
+  /** to launch custom CUDA kernel, specific to the active GPU for this GPUContext */
+  private JCudaKernels kernels;
 
 	/**
-	 * Ensures that all the CUDA cards on the current system are
-	 * of the minimum required compute capability.
-	 * (The minimum required compute capability is hard coded in {@link JCudaContext}.
-	 * 
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public abstract void ensureComputeCapability() throws DMLRuntimeException;
-	
-	/**
-	 * Singleton Factory method for creation of {@link GPUContext}
-	 * @return GPU context
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 * The minimum CUDA Compute capability needed for SystemML.
+	 * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
+	 * If SystemML needs to run on an older card, this logic can be revisited.
 	 */
-	public static GPUContext getGPUContext() throws DMLRuntimeException {
-		if(currContext == null && DMLScript.USE_ACCELERATOR) {
-			synchronized(isGPUContextCreated) {
-				currContext = new JCudaContext();
-				currContext.ensureComputeCapability();
-				OptimizerUtils.GPU_MEMORY_BUDGET = currContext.getAvailableMemory();
-				isGPUContextCreated = true;
-			}
-		}
-		return currContext;
-	}
-	
-	public static GPUObject createGPUObject(MatrixObject mo) {
-		if(DMLScript.USE_ACCELERATOR) {
-			synchronized(isGPUContextCreated) {
-				if(currContext == null)
-					throw new RuntimeException("GPUContext is not created");
-				if(currContext instanceof JCudaContext)
-					return new JCudaObject(mo);
-			}
-		}
-		throw new RuntimeException("Cannot create createGPUObject when USE_ACCELERATOR is off");
-	}
-	public abstract void destroy() throws DMLRuntimeException;
-	
-	
+	final int MAJOR_REQUIRED = 3;
+	final int MINOR_REQUIRED = 0;
+
+  // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
+  public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+
+  protected GPUContext(int deviceNum) throws DMLRuntimeException {
+    this.deviceNum = deviceNum;
+    cudaSetDevice(deviceNum);
+
+    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+    long free[] = {0};
+    long total[] = {0};
+    cudaMemGetInfo(free, total);
+
+    long start = System.nanoTime();
+    cudnnHandle = new cudnnHandle();
+    cudnnCreate(cudnnHandle);
+    cublasHandle = new cublasHandle();
+    cublasCreate(cublasHandle);
+    // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
+    // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+    // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+    cusparseHandle = new cusparseHandle();
+    cusparseCreate(cusparseHandle);
+    kernels = new JCudaKernels(deviceNum);
+
+    GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+    LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this);
+
+  }
+
+  public int getDeviceNum() {
+    return deviceNum;
+  }
+
+  /**
+   * Sets the device for the calling thread.
+   * This method must be called after {@link GPUContextPool#getFromPool()}
+   * is called.
+   * If in a multi-threaded env like parfor, this method must be called when in the
+   * appropriate thread
+   */
+  public void initializeThread() {
+    cudaSetDevice(deviceNum);
+  }
+
+  @SuppressWarnings("unused")
+  public static int cudaGetDevice() {
+    int[] device = new int[1];
+    JCuda.cudaGetDevice(device);
+    return device[0];
+  }
+
+  /**
+   * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+   *
+   * @param size size of data (in bytes) to allocate
+   * @return jcuda pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(long size) throws DMLRuntimeException {
+    return allocate(null, size, 1);
+  }
+
+  /**
+   * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+   *
+   * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+   * @param size            size of data (in bytes) to allocate
+   * @return jcuda pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
+    return allocate(instructionName, size, 1);
+  }
+
+  /**
+   * Allocates temporary space on the device.
+   * Does not update bookkeeping.
+   * The caller is responsible for freeing up after usage.
+   *
+   * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+   * @param size            Size of data (in bytes) to allocate
+   * @param statsCount      amount to increment the cudaAllocCount by
+   * @return jcuda Pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
+    long t0 = 0, t1 = 0, end = 0;
+    Pointer A;
+    if (freeCUDASpaceMap.containsKey(size)) {
+      LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+      A = freeList.pop();
+      if (freeList.isEmpty())
+        freeCUDASpaceMap.remove(size);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+    } else {
+      LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
+      if (DMLScript.STATISTICS) t0 = System.nanoTime();
+      ensureFreeSpace(instructionName, size);
+      A = new Pointer();
+      cudaMalloc(A, size);
+      if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+      if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
+    }
+    // Set all elements to 0 since newly allocated space will contain garbage
+    if (DMLScript.STATISTICS) t1 = System.nanoTime();
+    LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
+    cudaMemset(A, 0, size);
+    if (DMLScript.STATISTICS) end = System.nanoTime();
+    if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+      GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+    cudaBlockSizeMap.put(A, size);
+    return A;
+
+  }
+
+  /**
+   * Does lazy cudaFree calls
+   *
+   * @param toFree {@link Pointer} instance to be freed
+   */
+  public void cudaFreeHelper(final Pointer toFree) {
+    cudaFreeHelper(null, toFree, false);
+  }
+
+  /**
+   * does lazy/eager cudaFree calls
+   *
+   * @param toFree {@link Pointer} instance to be freed
+   * @param eager  true if to be done eagerly
+   */
+  public void cudaFreeHelper(final Pointer toFree, boolean eager) {
+    cudaFreeHelper(null, toFree, eager);
+  }
+
+  /**
+   * Does lazy cudaFree calls
+   *
+   * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+   * @param toFree          {@link Pointer} instance to be freed
+   */
+  public void cudaFreeHelper(String instructionName, final Pointer toFree) {
+    cudaFreeHelper(instructionName, toFree, false);
+  }
+
+  /**
+   * Does cudaFree calls, lazily
+   *
+   * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+   * @param toFree          {@link Pointer} instance to be freed
+   * @param eager           true if to be done eagerly
+   */
+  public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
+    long t0 = 0;
+    assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
+    long size = cudaBlockSizeMap.get(toFree);
+    if (eager) {
+      LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this);
+      if (DMLScript.STATISTICS) t0 = System.nanoTime();
+      cudaFree(toFree);
+      cudaBlockSizeMap.remove(toFree);
+      if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
+      if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
+    } else {
+      LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
+      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+      if (freeList == null) {
+        freeList = new LinkedList<Pointer>();
+        freeCUDASpaceMap.put(size, freeList);
+      }
+      freeList.add(toFree);
+    }
+  }
+
+  /**
+   * Thin wrapper over {@link GPUContext#evict(long)}
+   *
+   * @param size size to check
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  void ensureFreeSpace(long size) throws DMLRuntimeException {
+    ensureFreeSpace(null, size);
+  }
+
+  /**
+   * Thin wrapper over {@link GPUContext#evict(long)}
+   *
+   * @param instructionName instructionName name of the instruction for which performance measurements are made
+   * @param size            size to check
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
+    if (size >= getAvailableMemory()) {
+      evict(instructionName, size);
+    }
+  }
+
+  /**
+   * Convenience wrapper over {@link GPUContext#evict(String, long)}
+   *
+   * @param GPUSize Desired size to be freed up on the GPU
+   * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
+   */
+  protected void evict(final long GPUSize) throws DMLRuntimeException {
+    evict(null, GPUSize);
+  }
+
+  /**
+   * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up
+   * or it fails.
+   * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix
+   * blocks with zero locks on them is freed up.
+   * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
+   * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up
+   * blocks on which there are zero locks until the required size has been freed up.
+   * // TODO: update it with hybrid policy
+   *
+   * @param instructionName name of the instruction for which performance measurements are made
+   * @param neededSize      desired size to be freed up on the GPU
+   * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
+   */
+  protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
+    LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
+    GPUStatistics.cudaEvictionCount.addAndGet(1);
+    // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
+    // to free up space
+    LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
+    while (lruCacheMap.size() > 0) {
+      if (neededSize <= getAvailableMemory())
+        break;
+      Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
+      LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
+      Long size = toFreeListPair.getKey();
+      Pointer toFree = toFreeList.pop();
+      if (toFreeList.isEmpty())
+        lruCacheMap.remove(size);
+      cudaFreeHelper(instructionName, toFree, true);
+    }
+
+    if (neededSize <= getAvailableMemory())
+      return;
+
+    if (allocatedGPUObjects.size() == 0) {
+      throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+    }
+
+    Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
+      @Override
+      public int compare(GPUObject p1, GPUObject p2) {
+        long p1Val = p1.readLocks.get();
+        long p2Val = p2.readLocks.get();
+
+        if (p1Val > 0 && p2Val > 0) {
+          // Both are locked, so don't sort
+          return 0;
+        } else if (p1Val > 0 || p2Val > 0) {
+          // Put the unlocked one to RHS
+          return Long.compare(p2Val, p1Val);
+        } else {
+          // Both are unlocked
+
+          if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
+            long p1Size = 0;
+            long p2Size = 0;
+            try {
+              p1Size = p1.getSizeOnDevice() - neededSize;
+              p2Size = p2.getSizeOnDevice() - neededSize;
+            } catch (DMLRuntimeException e) {
+              throw new RuntimeException(e);
+            }
+
+            if (p1Size >= 0 && p2Size >= 0) {
+              return Long.compare(p2Size, p1Size);
+            } else {
+              return Long.compare(p1Size, p2Size);
+            }
+          } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
+            return Long.compare(p2.timestamp.get(), p1.timestamp.get());
+          } else {
+            throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
+          }
+        }
+      }
+    });
+
+    while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
+      GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
+      if (toBeRemoved.readLocks.get() > 0) {
+        throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+      }
+      if (toBeRemoved.dirty) {
+        toBeRemoved.copyFromDeviceToHost();
+      }
+
+      toBeRemoved.clearData(true);
+    }
+  }
+
+  /**
+   * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block
+   *
+   * @param o the block
+   * @return true if present, false otherwise
+   */
+  public boolean isBlockRecorded(GPUObject o) {
+    return allocatedGPUObjects.contains(o);
+  }
+
+  /**
+   * @param o {@link GPUObject} instance to record
+   * @see GPUContext#allocatedGPUObjects
+   * Records the usage of a matrix block
+   */
+  public void recordBlockUsage(GPUObject o) {
+    allocatedGPUObjects.add(o);
+  }
+
+  /**
+   * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects
+   * @see GPUContext#allocatedGPUObjects
+   * Records that a block is not used anymore
+   */
+  public void removeRecordedUsage(GPUObject o) {
+    allocatedGPUObjects.remove(o);
+  }
+
+  /**
+   * Gets the available memory on GPU that SystemML can use
+   *
+   * @return the available memory in bytes
+   */
+  public long getAvailableMemory() {
+    long free[] = {0};
+    long total[] = {0};
+    cudaMemGetInfo(free, total);
+    return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+  }
+
+  /**
+   * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed
+   *
+   * @throws DMLRuntimeException if the compute capability is less than what is required
+   */
+  public void ensureComputeCapability() throws DMLRuntimeException {
+    int[] devices = {-1};
+    cudaGetDeviceCount(devices);
+    if (devices[0] == -1) {
+      throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
+    }
+    boolean isComputeCapable = true;
+    for (int i = 0; i < devices[0]; i++) {
+      cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
+      int major = properties.major;
+      int minor = properties.minor;
+      if (major < MAJOR_REQUIRED) {
+        isComputeCapable = false;
+      } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
+        isComputeCapable = false;
+      }
+    }
+    if (!isComputeCapable) {
+      throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
+    }
+  }
+
+  public GPUObject createGPUObject(MatrixObject mo) {
+    return new GPUObject(this, mo);
+  }
+
+  /**
+   * Gets the device properties for the active GPU (set with cudaSetDevice())
+   *
+   * @return the device properties
+   */
+  public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
+    return GPUContextPool.getGPUProperties(deviceNum);
+  }
+
+  /**
+   * Gets the maximum number of threads per block for "active" GPU
+   *
+   * @return the maximum number of threads per block
+   */
+  public int getMaxThreadsPerBlock() throws DMLRuntimeException {
+    cudaDeviceProp deviceProps = getGPUProperties();
+    return deviceProps.maxThreadsPerBlock;
+  }
+
+  /**
+   * Gets the maximum number of blocks supported by the active cuda device
+   *
+   * @return the maximum number of blocks supported
+   */
+  public int getMaxBlocks() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.maxGridSize[0];
+  }
+
+  /**
+   * Gets the shared memory per block supported by the active cuda device
+   *
+   * @return the shared memory per block
+   */
+  public long getMaxSharedMemory() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.sharedMemPerBlock;
+  }
+
+  /**
+   * Gets the warp size supported by the active cuda device
+   *
+   * @return the warp size
+   */
+  public int getWarpSize() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.warpSize;
+  }
+
+  public cudnnHandle getCudnnHandle() {
+    return cudnnHandle;
+  }
+
+  public cublasHandle getCublasHandle() {
+    return cublasHandle;
+  }
+
+  public cusparseHandle getCusparseHandle() {
+    return cusparseHandle;
+  }
+
+  public JCudaKernels getKernels() {
+    return kernels;
+  }
+
+  /**
+   * Destroys this GPUContext object
+   * This method MUST BE called so that the GPU is available to be used again
+   *
+   * @throws DMLRuntimeException if error
+   */
+  public void destroy() throws DMLRuntimeException {
+    LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+    clearMemory();
+    cudnnDestroy(cudnnHandle);
+    cublasDestroy(cublasHandle);
+    cusparseDestroy(cusparseHandle);
+    cudnnHandle = null;
+    cublasHandle = null;
+    cusparseHandle = null;
+
+  }
+
+  /**
+   * Clears all memory used by this {@link GPUContext}
+   * Be careful to ensure that no memory is currently being used before invoking this
+   * @throws DMLRuntimeException
+   */
+  public void clearMemory() throws DMLRuntimeException {
+    while (allocatedGPUObjects.isEmpty()) {
+      GPUObject o = allocatedGPUObjects.get(0);
+      o.clearData();
+    }
+    for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+      for (Pointer p : l) {
+        cudaFreeHelper(p, true);
+      }
+    }
+    cudaBlockSizeMap.clear();
+    freeCUDASpaceMap.clear();
+    allocatedGPUObjects.clear();
+  }
+
+  @Override
+  public String toString() {
+    return "GPUContext{" +
+            "deviceNum=" + deviceNum +
+            '}';
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
new file mode 100644
index 0000000..6452651
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
+
+import java.util.LinkedList;
+import java.util.Queue;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.driver.JCudaDriver;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.JCudnn;
+import jcuda.jcusparse.JCusparse;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
+
+public class GPUContextPool {
+
+  protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName());
+
+  /** Maximum number of gpus to use, -1 for all */
+  public static int PER_PROCESS_MAX_GPUS = -1;
+
+  /** Whether cuda has been initialized */
+  static boolean initialized = false;
+
+  /** The total number of cuda devices on this machine */
+  static int deviceCount = -1;
+
+  /** Stores the cached deviceProperties */
+  static cudaDeviceProp[] deviceProperties;
+
+  /** Set of free GPUContexts */
+  static Queue<GPUContext> freePool = new LinkedList<>();
+
+  /**
+   * Static initialization of the number of devices
+   * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error
+   * Initializes the CUDA driver
+   * All these need be done once, and not per GPU
+   */
+  public synchronized static void initializeGPU() throws DMLRuntimeException {
+    GPUContext.LOG.info("Initializing CUDA");
+    long start = System.nanoTime();
+    JCuda.setExceptionsEnabled(true);
+    JCudnn.setExceptionsEnabled(true);
+    JCublas2.setExceptionsEnabled(true);
+    JCusparse.setExceptionsEnabled(true);
+    JCudaDriver.setExceptionsEnabled(true);
+    cuInit(0); // Initialize the driver
+
+    int deviceCountArray[] = {0};
+    cuDeviceGetCount(deviceCountArray);        // Obtain the number of devices
+    deviceCount = deviceCountArray[0];
+    deviceProperties = new cudaDeviceProp[deviceCount];
+
+    if (PER_PROCESS_MAX_GPUS > 0)
+       deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
+
+    // Initialize the list of devices
+    for (int i = 0; i < deviceCount; i++) {
+      cudaDeviceProp properties = new cudaDeviceProp();
+      cudaGetDeviceProperties(properties, i);
+      deviceProperties[i] = properties;
+    }
+
+    // Initialize the pool of GPUContexts
+    for (int i=0; i<deviceCount; i++){
+      GPUContext gCtx = new GPUContext(i);
+      freePool.add(gCtx);
+    }
+
+    GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
+    //int[] device = {-1};
+    //cudaGetDevice(device);
+    //cudaDeviceProp prop = getGPUProperties(device[0]);
+    //int maxBlocks = prop.maxGridSize[0];
+    //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+    //long sharedMemPerBlock = prop.sharedMemPerBlock;
+    //LOG.debug("Active CUDA device number : " + device[0]);
+    //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+    initialized = true;
+    GPUStatistics.cudaInitTime = System.nanoTime() - start;
+  }
+
+  /**
+   * Gets an initialized GPUContext from a pool of GPUContexts, each linked to a GPU
+   * @return null if not more GPUContexts in pool, a valid GPUContext otherwise
+   */
+  public static synchronized GPUContext getFromPool() throws DMLRuntimeException {
+    if (!initialized) initializeGPU();
+    GPUContext gCtx = freePool.poll();
+    LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - FreePool[" + freePool.size() + "]");
+    return gCtx;
+  }
+
+  /**
+   * Get the number of free GPUContexts
+   * @return number of free GPUContexts
+   */
+  public static synchronized int getAvailableCount() {
+    return freePool.size();
+  }
+
+  /**
+   * Gets the device properties
+   * @param device the device number (on a machine with more than 1 GPU)
+   * @return the device properties
+   * @throws DMLRuntimeException if there is problem initializing the GPUContexts
+   */
+  static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException {
+    // do once - initialization of GPU
+    if (!initialized) initializeGPU();
+    return deviceProperties[device];
+  }
+
+  public static int getDeviceCount() throws DMLRuntimeException {
+    if (!initialized) initializeGPU();
+    return deviceCount;
+  }
+
+  /**
+   * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s
+   * @param gCtx the GPUContext instance to return. If null, nothing happens
+   * @throws DMLRuntimeException if error
+   */
+  public static synchronized void returnToPool(GPUContext gCtx) throws DMLRuntimeException {
+    if (gCtx == null)
+      return;
+    freePool.add(gCtx);
+    LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes - FreePool[" + freePool.size() + "]");
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index c116475..3a1fafa 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -18,206 +18,799 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import jcuda.Pointer;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
+import static jcuda.jcusparse.JCusparse.cusparseDnnz;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
+import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.LRUCacheMap;
 
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
+import jcuda.Pointer;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseDirection;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+/**
+ * Handle to a matrix block on the GPU
+ */
+public class GPUObject {
+
+	private static final Log LOG = LogFactory.getLog(GPUObject.class.getName());
+
+	/** GPUContext that owns this GPUObject */
+	private final GPUContext gpuContext;
+
+	/** Pointer to the underlying dense matrix block on GPU */
+	private Pointer jcudaDenseMatrixPtr = null;
+
+    /** Pointer to the underlying sparse matrix block on GPU */
+	private CSRPointer jcudaSparseMatrixPtr = null;
 
-//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
-public abstract class GPUObject 
-{
-	public enum EvictionPolicy {
-        LRU, LFU, MIN_EVICT
-    }
-	public static final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
-	protected boolean isDeviceCopyModified = false;
-	protected AtomicInteger numLocks = new AtomicInteger(0);
+	/** An optional tensor descriptor (and shape) that can be set by a tensor instruction such as convolution,
+	 * maxpooling and exploited by a subsequent non-tensor instruction such as relu
+	 */
+	private cudnnTensorDescriptor tensorDescriptor = null;
+
+	/** the shape of this tensor, if in fact this is a tensor */
+	private int [] tensorShape = null;
+
+	/** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */
+	protected boolean dirty = false;
+
+	/** number of read locks on this object */
+	protected AtomicInteger readLocks = new AtomicInteger(0);
+
+	/** Timestamp, needed by {@link GPUContext#evict(long)} */
 	AtomicLong timestamp = new AtomicLong(0);
-	
-	protected boolean isInSparseFormat = false;
+
+	/** Whether this block is in sparse format */
+	protected boolean isSparse = false;
+
+	/** Enclosing {@link MatrixObject} instance */
 	protected MatrixObject mat = null;
-	
-	protected GPUObject(MatrixObject mat2)  {
-		this.mat = mat2;
+
+	private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+		return getGPUContext().allocate(instName, size);
+	}
+
+	private Pointer allocate(long size) throws DMLRuntimeException {
+		return getGPUContext().allocate(size);
+	}
+
+	private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+		getGPUContext().cudaFreeHelper(toFree);
+	}
+
+	private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+		getGPUContext().cudaFreeHelper(toFree, eager);
+	}
+
+	private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+		getGPUContext().cudaFreeHelper(instName, toFree, eager);
 	}
-	
-	public boolean isInSparseFormat() {
-		return isInSparseFormat;
+
+	private GPUContext getGPUContext() throws DMLRuntimeException {
+		return gpuContext;
 	}
-	
-	public abstract boolean isAllocated();
 
 	/**
-	 * Signal intent that a matrix block will be read (as input) on the GPU
-	 * @return	true if a host memory to device memory transfer happened
-	 * @throws DMLRuntimeException ?
+	 * Transposes a dense matrix on the GPU by calling the cublasDgeam operation
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param densePtr	Pointer to dense matrix on the GPU
+	 * @param m			rows in ouput matrix
+	 * @param n			columns in output matrix
+	 * @param lda		rows in input matrix
+	 * @param ldc		columns in output matrix
+	 * @return			transposed matrix
+	 * @throws DMLRuntimeException if operation failed
 	 */
-	public abstract boolean acquireDeviceRead() throws DMLRuntimeException;
+	public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
+		LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
+		Pointer alpha = Pointer.to(new double[]{1.0});
+		Pointer beta = Pointer.to(new double[]{0.0});
+		Pointer A = densePtr;
+		Pointer C = gCtx.allocate(((long)m)*getDoubleSizeOf(n));
+
+		// Transpose the matrix to get a dense matrix
+		JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc);
+		return C;
+	}
+
 	/**
-	 * To signal intent that a matrix block will be written to on the GPU
-	 * @return	true if memory was allocated on the GPU as a result of this call
+	 * Convenience method to convert a CSR matrix to a dense matrix on the GPU
+	 * Since the allocated matrix is temporary, bookkeeping is not updated.
+	 * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT
+	 * Caller is responsible for deallocating memory on GPU.
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param cusparseHandle handle to cusparse library
+	 * @param densePtr [in] dense matrix pointer on the GPU in row major
+	 * @param rows number of rows
+	 * @param cols number of columns
+	 * @return CSR (compressed sparse row) pointer
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public abstract boolean acquireDeviceModifyDense() throws DMLRuntimeException;
+	public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle, Pointer densePtr, int rows, int cols) throws DMLRuntimeException {
+		cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+		Pointer nnzPerRowPtr = null;
+		Pointer nnzTotalDevHostPtr = null;
+
+		gCtx.ensureFreeSpace(getIntSizeOf(rows + 1));
+		nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
+		nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
+
+		// Output is in dense vector format, convert it to CSR
+		cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
+		//cudaDeviceSynchronize();
+		int[] nnzC = {-1};
+
+		long t2=0;
+		if (DMLScript.STATISTICS) t2 = System.nanoTime();
+		cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+
+		if (nnzC[0] == -1){
+			throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU");
+		}
+
+		LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx);
+
+		CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
+		cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
+		//cudaDeviceSynchronize();
+
+		gCtx.cudaFreeHelper(nnzPerRowPtr);
+		gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+
+		return C;
+	}
+
+	/**
+	 * Gets the double array from GPU memory onto host memory and returns string.
+	 * @param A Pointer to memory on device (GPU), assumed to point to a double array
+	 * @param rows rows in matrix A
+	 * @param cols columns in matrix A
+	 * @return the debug string
+	 * @throws DMLRuntimeException  if DMLRuntimeException occurs
+	 */
+	@SuppressWarnings("unused")
+	public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
+		StringBuffer sb = new StringBuffer();
+		int len = toIntExact(rows * cols);
+		double[] tmp = new double[len];
+		cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
+		int k = 0;
+		for (int i=0; i<rows; i++){
+			for (int j=0; j<cols; j++){
+				sb.append(tmp[k]).append(' ');
+				k++;
+			}
+			sb.append('\n');
+		}
+		return sb.toString();
+	}
+
 	/**
-	 * To signal intent that a sparse matrix block will be written to on the GPU
-	 * @return	true if memory was allocated on the GPU as a result of this call
+	 * Convenience method to directly examine the Sparse matrix on GPU
+	 * @return CSR (compressed sparse row) pointer
+	 */
+	public CSRPointer getSparseMatrixCudaPointer() {
+		return getJcudaSparseMatrixPtr();
+	}
+
+	/**
+	 * Convenience method to directly set the sparse matrix on GPU
+	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
+	 * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
+	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
+	 */
+	public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) throws DMLRuntimeException {
+		this.jcudaSparseMatrixPtr = sparseMatrixPtr;
+		this.isSparse = true;
+		if(getJcudaDenseMatrixPtr() != null) {
+			cudaFreeHelper(getJcudaDenseMatrixPtr());
+			jcudaDenseMatrixPtr = null;
+		}
+	}
+
+	/**
+	 * Convenience method to directly set the dense matrix pointer on GPU
+	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
+	 *
+	 * @param densePtr dense pointer
+	 */
+	public void setDenseMatrixCudaPointer(Pointer densePtr) throws DMLRuntimeException{
+		this.jcudaDenseMatrixPtr = densePtr;
+		this.isSparse = false;
+		if(getJcudaSparseMatrixPtr() != null) {
+			getJcudaSparseMatrixPtr().deallocate();
+			jcudaSparseMatrixPtr = null;
+		}
+	}
+
+	/**
+	 * Converts this GPUObject from dense to sparse format.
+	 *
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public abstract boolean acquireDeviceModifySparse() throws DMLRuntimeException;
-	
+	public void denseToSparse() throws DMLRuntimeException {
+		LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext());
+		long t0=0;
+		if (DMLScript.STATISTICS) t0 = System.nanoTime();
+		cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
+		if(cusparseHandle == null)
+			throw new DMLRuntimeException("Expected cusparse to be initialized");
+		int rows = toIntExact(mat.getNumRows());
+		int cols = toIntExact(mat.getNumColumns());
+
+		if(getJcudaDenseMatrixPtr() == null || !isAllocated())
+			throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
+
+		convertDensePtrFromRowMajorToColumnMajor();
+		setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getJcudaDenseMatrixPtr(), rows, cols));
+		// TODO: What if mat.getNnz() is -1 ?
+		if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
+	}
+
 	/**
-	 * If memory on GPU has been allocated from elsewhere, this method 
-	 * updates the internal bookkeeping
-	 * @param numBytes number of bytes
+	 * Convenience method. Converts Row Major Dense Matrix --> Column Major Dense Matrix
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public abstract void setDeviceModify(long numBytes);
+	private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException {
+		LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
+		int m = toIntExact(mat.getNumRows());
+		int n = toIntExact(mat.getNumColumns());
+		int lda = n;
+		int ldc = m;
+		if(!isAllocated()) {
+			throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
+		}
+
+		Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+		cudaFreeHelper(getJcudaDenseMatrixPtr());
+		setDenseMatrixCudaPointer(tmp);
+	}
+
+	private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException {
+		LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
+
+		int n = toIntExact(mat.getNumRows());
+		int m = toIntExact(mat.getNumColumns());
+		int lda = n;
+		int ldc = m;
+		if(!isAllocated()) {
+			throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
+		}
+
+		Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+		cudaFreeHelper(getJcudaDenseMatrixPtr());
+		setDenseMatrixCudaPointer(tmp);
+	}
 
 	/**
-	 * Signal intent that a block needs to be read on the host
-	 * @return true if copied from device to host
-	 * @throws CacheException ?
+	 * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
+	 *
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public abstract boolean acquireHostRead() throws CacheException;
-
-	public abstract void releaseInput() throws CacheException;
-	public abstract void releaseOutput() throws CacheException;
-	
-	// package-level visibility as these methods are guarded by underlying GPUContext
-
-	abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException;
-	abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException;
-	abstract void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException;
-	abstract long getSizeOnDevice() throws DMLRuntimeException;
-	
-	abstract void copyFromHostToDevice() throws DMLRuntimeException;
-	
+	public void sparseToDense() throws DMLRuntimeException {
+		sparseToDense(null);
+	}
+
 	/**
-	 * Copies a matrix block (dense or sparse) from GPU Memory to Host memory.
-	 * A {@link MatrixBlock} instance is allocated, data from the GPU is copied in,
-	 * the current one in Host memory is deallocated by calling MatrixObject's acquireHostModify(MatrixBlock) (??? does not exist)
-	 * and overwritten with the newly allocated instance.
-	 * TODO : re-examine this to avoid spurious allocations of memory for optimizations
+	 * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
+	 * Also records per instruction invokation of sparseToDense.
+	 * @param instructionName	Name of the instruction for which statistics are recorded in {@link GPUStatistics}
+	 * @throws DMLRuntimeException ?
+	 */
+	public void sparseToDense(String instructionName) throws DMLRuntimeException {
+		LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext());
+		long start=0, end=0;
+		if (DMLScript.STATISTICS) start = System.nanoTime();
+		if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+			throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
+
+		sparseToColumnMajorDense();
+		convertDensePtrFromColMajorToRowMajor();
+		if (DMLScript.STATISTICS) end = System.nanoTime();
+		if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
+	}
+
+	/**
+	 * More efficient method to convert sparse to dense but returns dense in column major format
+	 *
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export()
+	public void sparseToColumnMajorDense() throws DMLRuntimeException {
+		LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext());
+		if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+			throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
+
+		cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
+		if(cusparseHandle == null)
+			throw new DMLRuntimeException("Expected cusparse to be initialized");
+		int rows = toIntExact(mat.getNumRows());
+		int cols = toIntExact(mat.getNumColumns());
+		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
+	}
+
+	/**
+	 * Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
+	 * @param mat2 the matrix block that owns this {@link GPUObject}
+	 */
+	GPUObject(GPUContext gCtx, MatrixObject mat2)  {
+		gpuContext = gCtx;
+		this.mat = mat2;
+	}
 
+	public boolean isSparse() {
+		return isSparse;
+	}
 
 	/**
-	 * Convenience wrapper over {@link GPUObject#evict(String, long)}
-	 * @param GPUSize Desired size to be freed up on the GPU
-	 * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
+	 * Returns a previously allocated tensor shape or null
+	 * @return int array of four elements or null
 	 */
-	protected static void evict(final long GPUSize) throws DMLRuntimeException {
-		evict(null, GPUSize);
+	public int [] getTensorShape() {
+		return tensorShape;
 	}
 
 	/**
-	 * Cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
-	 * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up 
-	 * blocks on which there are zero locks until the required size has been freed up.
-	 * // TODO: update it with hybrid policy
-	 * @param instructionName name of the instruction for which performance measurements are made
-	 * @param GPUSize Desired size to be freed up on the GPU
-	 * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.	 
+	 * Returns a previously allocated tensor descriptor or null
+	 * @return cudnn tensor descriptor
 	 */
-	protected static void evict(String instructionName, final long GPUSize) throws DMLRuntimeException {
-		synchronized (JCudaContext.syncObj) {
-
-			GPUStatistics.cudaEvictionCount.addAndGet(1);
-			// Release the set of free blocks maintained in a JCudaObject.freeCUDASpaceMap
-			// to free up space
-			LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = JCudaObject.freeCUDASpaceMap;
-			while (lruCacheMap.size() > 0) {
-				if (GPUSize <= getAvailableMemory())
-					break;
-				Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
-				LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
-				Long size = toFreeListPair.getKey();
-				Pointer toFree = toFreeList.pop();
-				if (toFreeList.isEmpty())
-					lruCacheMap.remove(size);
-				JCudaObject.cudaFreeHelper(instructionName, toFree, true);
+	public cudnnTensorDescriptor getTensorDescriptor() {
+		return tensorDescriptor;
+	}
+
+	/**
+	 * Returns a previously allocated or allocates and returns a tensor descriptor
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param H height
+	 * @param W width
+	 * @return cudnn tensor descriptor
+	 */
+	public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
+		LOG.trace("GPU : allocateTensorDescriptor with [N="+N+",C="+C+",H="+H+",W="+W+"] on " + this);
+		if(tensorDescriptor == null) {
+			tensorDescriptor = new cudnnTensorDescriptor();
+			cudnnCreateTensorDescriptor(tensorDescriptor);
+			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+			tensorShape = new int[4];
+			tensorShape[0] = N;
+			tensorShape[1] = C;
+			tensorShape[2] = H;
+			tensorShape[3] = W;
+		}
+		return tensorDescriptor;
+	}
+
+	private static long getDoubleSizeOf(long numElems) {
+		return numElems * ((long)jcuda.Sizeof.DOUBLE);
+	}
+
+	private static long getIntSizeOf(long numElems) {
+		return numElems * ((long)jcuda.Sizeof.INT);
+	}
+
+	public boolean isAllocated() {
+		boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null);
+		return eitherAllocated;
+	}
+
+	public boolean isInputAllocated() {
+		try {
+			boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null);
+			boolean isAllocatedOnThisGPUContext = getGPUContext().isBlockRecorded(this);
+			if (eitherAllocated && !isAllocatedOnThisGPUContext) {
+				LOG.warn("GPU : A block was allocated but was not on this GPUContext, GPUContext=" + getGPUContext());
 			}
+			return eitherAllocated && isAllocatedOnThisGPUContext;
+		} catch (DMLRuntimeException e){
+			LOG.info("GPU : System is in an inconsistent state");
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Allocates a sparse and empty {@link GPUObject}
+	 * This is the result of operations that are both non zero matrices.
+	 *
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public void allocateSparseAndEmpty() throws DMLRuntimeException{
+		LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
+		setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
+		addReadLock();
+	}
+
+	/**
+	 * Allocates a dense matrix of size obtained from the attached matrix metadata
+	 * and fills it up with a single value
+	 *
+	 * @param v value to fill up the dense matrix
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public void allocateAndFillDense(double v) throws DMLRuntimeException {
+		LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext());
+		long rows = mat.getNumRows();
+		long cols = mat.getNumColumns();
+		int numElems = toIntExact(rows * cols);
+		long size = getDoubleSizeOf(numElems);
+		setDenseMatrixCudaPointer(allocate(size));
+		addReadLock();
+		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
+		getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
+	}
+
+	/**
+	 * If this {@link GPUObject} is sparse and empty
+	 * Being allocated is a prerequisite to being sparse and empty.
+	 *
+	 * @return true if sparse and empty
+	 * @throws DMLRuntimeException if error
+	 */
+	public boolean isSparseAndEmpty() throws DMLRuntimeException{
+		boolean isSparseAndAllocated = isAllocated()&& LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat);
+		boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0;
+		return isEmptyAndSparseAndAllocated;
+	}
+
+	public boolean acquireDeviceRead() throws DMLRuntimeException {
+		LOG.trace("GPU : acquireDeviceRead on " + this);
+		boolean transferred = false;
+		if(!isAllocated()) {
+			LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
+			copyFromHostToDevice();
+			transferred = true;
+		} else {
+			addReadLock();
+		}
+		if(!isAllocated())
+			throw new DMLRuntimeException("Expected device data to be allocated");
+		return transferred;
+	}
+
+	public boolean acquireDeviceModifyDense() throws DMLRuntimeException {
+		LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
+		boolean allocated = false;
+		if(!isAllocated()) {
+			mat.setDirty(true);
+			LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
+			// Dense block, size = numRows * numCols
+			allocateDenseMatrixOnDevice();
+			allocated = true;
+			getGPUContext().recordBlockUsage(this);
+		}
+		dirty = true;
+		if(!isAllocated())
+			throw new DMLRuntimeException("Expected device data to be allocated");
+		return allocated;
+	}
 
-			if (GPUSize <= getAvailableMemory())
-				return;
+	public boolean acquireDeviceModifySparse() throws DMLRuntimeException {
+		LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
+		boolean allocated = false;
+		isSparse = true;
+		if(!isAllocated()) {
+			LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
+			mat.setDirty(true);
+			allocateSparseMatrixOnDevice();
+			allocated = true;
+			getGPUContext().recordBlockUsage(this);
+
+		}
+		dirty = true;
+		if(!isAllocated())
+			throw new DMLRuntimeException("Expected device data to be allocated");
+		return allocated;
+	}
+
+	public void addReadLock() {
+		readLocks.addAndGet(1);
+	}
 
-			if (JCudaContext.allocatedPointers.size() == 0) {
-				throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+	/**
+	 * if the data is allocated on the GPU and is dirty, it is copied back to the host memory
+	 * @return true if a copy to host happened, false otherwise
+	 * @throws CacheException
+	 */
+	public boolean acquireHostRead() throws CacheException {
+		boolean copied = false;
+		try {
+			LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
+			if (isAllocated() && dirty) {
+				LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + getGPUContext());
+				copyFromDeviceToHost();
+				copied = true;
 			}
+		} catch (DMLRuntimeException e) {
+			throw new CacheException(e);
+		}
+		return copied;
+	}
+
+	/**
+	 * Updates the locks depending on the eviction policy selected
+	 * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+	 */
+	private void updateReleaseLocks() throws DMLRuntimeException {
+		if (readLocks.addAndGet(-1) < 0) {
+			throw new CacheException("Redundant release of GPU object");
+		}
+		LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+		GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
+		switch (evictionPolicy){
+			case LRU : timestamp.set(System.nanoTime()); break;
+			case LFU : timestamp.addAndGet(1); break;
+			case MIN_EVICT : /* Do Nothing */ break;
+			default : throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name());
+		}
+	}
+
+	/**
+	 * Releases input allocated on GPU
+	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+	 */
+	public void releaseInput() throws DMLRuntimeException {
+		updateReleaseLocks();
+		if(!isAllocated())
+			throw new CacheException("Attempting to release an input before allocating it");
+	}
+
+	/**
+	 * releases output allocated on GPU
+	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+	 */
+	public void releaseOutput() throws DMLRuntimeException {
+		updateReleaseLocks();
+		dirty = true;
+		if(!isAllocated())
+			throw new CacheException("Attempting to release an output before allocating it");
+	}
 
-			synchronized (evictionLock) {
-				Collections.sort(JCudaContext.allocatedPointers, new Comparator<GPUObject>() {
-
-					@Override
-					public int compare(GPUObject p1, GPUObject p2) {
-						long p1Val = p1.numLocks.get();
-						long p2Val = p2.numLocks.get();
-
-						if (p1Val > 0 && p2Val > 0) {
-							// Both are locked, so don't sort
-							return 0;
-						} else if (p1Val > 0 || p2Val > 0) {
-							// Put the unlocked one to RHS
-							return Long.compare(p2Val, p1Val);
-						} else {
-							// Both are unlocked
-
-							if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
-								long p1Size = 0;
-								long p2Size = 0;
-								try {
-									p1Size = p1.getSizeOnDevice() - GPUSize;
-									p2Size = p2.getSizeOnDevice() - GPUSize;
-								} catch (DMLRuntimeException e) {
-									throw new RuntimeException(e);
-								}
-
-								if (p1Size >= 0 && p2Size >= 0) {
-									return Long.compare(p2Size, p1Size);
-								} else {
-									return Long.compare(p1Size, p2Size);
-								}
-							} else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
-								return Long.compare(p2.timestamp.get(), p1.timestamp.get());
-							} else {
-								throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
-							}
-						}
-					}
-				});
-
-				while (GPUSize > getAvailableMemory() && JCudaContext.allocatedPointers.size() > 0) {
-					GPUObject toBeRemoved = JCudaContext.allocatedPointers.get(JCudaContext.allocatedPointers.size() - 1);
-					if (toBeRemoved.numLocks.get() > 0) {
-						throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
-					}
-					if (toBeRemoved.isDeviceCopyModified) {
-						toBeRemoved.copyFromDeviceToHost();
-					}
-
-					toBeRemoved.clearData(true);
+	void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
+		LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+		assert !isAllocated() : "Internal error - trying to allocated dense matrix to a GPUObject that is already allocated";
+		long rows = mat.getNumRows();
+		long cols = mat.getNumColumns();
+		assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
+		assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;";
+		long size = getDoubleSizeOf(rows * cols);
+		Pointer tmp = allocate(size);
+		setDenseMatrixCudaPointer(tmp);
+		addReadLock();
+	}
+
+	void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
+		LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+		assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a GPUObject that is already allocated";
+		long rows = mat.getNumRows();
+		long nnz = mat.getNnz();
+		assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
+		assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows);
+		setSparseMatrixCudaPointer(tmp);
+		addReadLock();
+	}
+
+	void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException {
+		LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+		if(getJcudaDenseMatrixPtr() != null) {
+			cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
+		}
+		if (getJcudaSparseMatrixPtr() != null) {
+			getJcudaSparseMatrixPtr().deallocate(eager);
+		}
+		jcudaDenseMatrixPtr = null;
+		jcudaSparseMatrixPtr = null;
+		if(tensorDescriptor != null) {
+			cudnnDestroyTensorDescriptor(tensorDescriptor);
+			tensorDescriptor = null;
+		}
+		readLocks.set(0);
+	}
+
+	protected long getSizeOnDevice() throws DMLRuntimeException {
+		long GPUSize = 0;
+		long rlen = mat.getNumRows();
+		long clen = mat.getNumColumns();
+		long nnz = mat.getNnz();
+
+		if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+			GPUSize = CSRPointer.estimateSize(nnz, rlen);
+		}
+		else {
+			GPUSize = getDoubleSizeOf(rlen * clen);
+		}
+		return GPUSize;
+	}
+
+	void copyFromHostToDevice() throws DMLRuntimeException {
+		LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
+		long start=0;
+		if (DMLScript.STATISTICS) start = System.nanoTime();
+
+		MatrixBlock tmp = mat.acquireRead();
+		if(tmp.isInSparseFormat()) {
+
+			int rowPtr[] = null;
+			int colInd[] = null;
+			double[] values = null;
+
+			tmp.recomputeNonZeros();
+			long nnz = tmp.getNonZeros();
+			mat.getMatrixCharacteristics().setNonZeros(nnz);
+
+			SparseBlock block = tmp.getSparseBlock();
+			boolean copyToDevice = true;
+			if(block == null && tmp.getNonZeros() == 0) {
+//				// Allocate empty block --> not necessary
+//				// To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
+//				rowPtr = new int[0];
+//				colInd = new int[0];
+//				values = new double[0];
+				copyToDevice = false;
+			}
+			else if(block == null && tmp.getNonZeros() != 0) {
+				throw new DMLRuntimeException("Expected CP sparse block to be not null.");
+			}
+			else {
+				// CSR is the preferred format for cuSparse GEMM
+				// Converts MCSR and COO to CSR
+				SparseBlockCSR csrBlock = null;
+				long t0=0;
+				if (block instanceof SparseBlockCSR){
+					csrBlock = (SparseBlockCSR)block;
+				} else if (block instanceof SparseBlockCOO) {
+					// TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
+					if (DMLScript.STATISTICS) t0 = System.nanoTime();
+					SparseBlockCOO cooBlock = (SparseBlockCOO)block;
+					csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
+					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+				} else if (block instanceof SparseBlockMCSR) {
+					if (DMLScript.STATISTICS) t0 = System.nanoTime();
+					SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
+					csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
+					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+				} else {
+					throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
 				}
+				rowPtr = csrBlock.rowPointers();
+				colInd = csrBlock.indexes();
+				values = csrBlock.values();
+			}
+			allocateSparseMatrixOnDevice();
+			getGPUContext().recordBlockUsage(this);
+
+			if(copyToDevice) {
+				CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
 			}
 		}
+		else {
+			double[] data = tmp.getDenseBlock();
+
+			if( data == null && tmp.getSparseBlock() != null )
+				throw new DMLRuntimeException("Incorrect sparsity calculation");
+			else if( data==null && tmp.getNonZeros() != 0 )
+				throw new DMLRuntimeException("MatrixBlock is not allocated");
+			else if( tmp.getNonZeros() == 0 )
+				data = new double[tmp.getNumRows()*tmp.getNumColumns()];
+
+			// Copy dense block
+			allocateDenseMatrixOnDevice();
+			getGPUContext().recordBlockUsage(this);
+
+			cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
+		}
+
+		mat.release();
+
+		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
+		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1);
+	}
+
+	public static int toIntExact(long l) throws DMLRuntimeException {
+		if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+			throw new DMLRuntimeException("Cannot be cast to int:" + l);
+		}
+		return (int) l;
 	}
 
+	protected void copyFromDeviceToHost() throws DMLRuntimeException {
+		LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
+		if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null){
+			throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
+		}
+
+		if(getJcudaDenseMatrixPtr() != null) {
+			long start=0;
+			if (DMLScript.STATISTICS) start = System.nanoTime();
+			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
+			tmp.allocateDenseBlock();
+			double [] data = tmp.getDenseBlock();
+
+			cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);
+
+			tmp.recomputeNonZeros();
+			mat.acquireModify(tmp);
+			mat.release();
+
+			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
+			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+		}
+		else if (getJcudaSparseMatrixPtr() != null){
+			if(!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat))
+				throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");
+
+			if(this.isSparseAndEmpty()){
+				MatrixBlock tmp = new MatrixBlock();	// Empty Block
+				mat.acquireModify(tmp);
+				mat.release();
+			} else {
+				long start=0;
+				if (DMLScript.STATISTICS) start = System.nanoTime();
+
+				int rows = toIntExact(mat.getNumRows());
+				int cols = toIntExact(mat.getNumColumns());
+				int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+				int[] rowPtr = new int[rows + 1];
+				int[] colInd = new int[nnz];
+				double[] values = new double[nnz];
+				CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values);
+
+				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
+				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
+				mat.acquireModify(tmp);
+				mat.release();
+				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+			}
+		}
+		else {
+			throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
+		}
+		dirty = false;
+	}
+
+
 	/**
 	 * lazily clears the data associated with this {@link GPUObject} instance
 	 * @throws CacheException ?
 	 */
-	public void clearData() throws CacheException {
+	public void clearData() throws DMLRuntimeException {
 		clearData(false);
 	}
 
@@ -226,36 +819,38 @@ public abstract class GPUObject
 	 * @param eager whether to be done synchronously or asynchronously
 	 * @throws CacheException ?
 	 */
-	public void clearData(boolean eager) throws CacheException {
-		synchronized(evictionLock) {
-			JCudaContext.allocatedPointers.remove(this);
-		}
-		try {
-			deallocateMemoryOnDevice(eager);
-		} catch (DMLRuntimeException e) {
-			throw new CacheException(e);
-		}
+	public void clearData(boolean eager) throws DMLRuntimeException {
+		getGPUContext().removeRecordedUsage(this);
+		deallocateMemoryOnDevice(eager);
+
+	}
+
+	/** Pointer to dense matrix */
+	public Pointer getJcudaDenseMatrixPtr() {
+		return jcudaDenseMatrixPtr;
+	}
+
+	/** Pointer to sparse matrix */
+	public CSRPointer getJcudaSparseMatrixPtr() {
+		return jcudaSparseMatrixPtr;
 	}
-	
-	static Boolean evictionLock = new Boolean(true);
-	
-	protected static long getAvailableMemory() {
-		return GPUContext.currContext.getAvailableMemory();
-	}
-	
-//	// Copying from device -> host occurs here
-//	// Called by MatrixObject's exportData
-//	public void exportData() throws CacheException {
-//		boolean isDeviceCopyModified = mat.getGPUObject() != null && mat.getGPUObject().isDeviceCopyModified;
-//		boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || 
-//				(mat.getMatrixBlock().getDenseBlock() == null && mat.getMatrixBlock().getSparseBlock() == null);
-//		
-//		if(mat.getGPUObject() != null && (isDeviceCopyModified || isHostCopyUnavailable)) {
-//			try {
-//				mat.getGPUObject().copyFromDeviceToHost();
-//			} catch (DMLRuntimeException e) {
-//				throw new CacheException(e);
-//			}
-//		}
-//	}
+
+	/** Whether this block is dirty on the GPU */
+	public boolean isDirty() {
+		return dirty;
+	}
+
+	@Override
+	public String toString() {
+		final StringBuilder sb = new StringBuilder("GPUObject{");
+		sb.append(", tensorShape=").append(Arrays.toString(tensorShape));
+		sb.append(", dirty=").append(dirty);
+		sb.append(", readLocks=").append(readLocks);
+		sb.append(", sparse? ").append(isSparse);
+		sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
+		sb.append('}');
+		return sb.toString();
+	}
+
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
deleted file mode 100644
index bb73f4b..0000000
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import jcuda.driver.JCudaDriver;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.JCudnn;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.conf.ConfigurationManager;
-import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.utils.GPUStatistics;
-
-import java.util.ArrayList;
-import java.util.concurrent.atomic.AtomicLong;
-
-import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.driver.JCudaDriver.cuInit;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.runtime.JCuda.*;
-import static jcuda.runtime.cudaError.cudaSuccess;
-
-
-public class JCudaContext extends GPUContext {
-
-	/** Synchronization object to make sure no allocations happen when something is being evicted from memory */
-	public static final Object syncObj = new Object();
-	private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
-
-	/** Global list of allocated {@link GPUObject} instances. This list must be accessed in a synchronized way */
-	public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>();
-
-	// The minimum CUDA Compute capability needed for SystemML.
-	// After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
-	// If SystemML needs to run on an older card, this logic can be revisited.
-	final int MAJOR_REQUIRED = 3;
-	final int MINOR_REQUIRED = 0;
-
-	/** The total number of cuda devices on this machine */
-	public static int deviceCount = -1;
-
-	/** enable this to print debug information before code pertaining to the GPU is executed  */
-	public static boolean DEBUG = false;
-
-	/** total bytes available on currently active cude device, please be careful with its bookkeeping */
-	AtomicLong deviceMemBytes = new AtomicLong(0);
-
-	/** Stores the cached deviceProperties */
-	private static cudaDeviceProp[] deviceProperties;
-
-	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
-	public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-	// Whether to invoke cudaMemGetInfo for available memory or rely on internal bookkeeping for memory info.
-	public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
-	static {
-		long start = System.nanoTime();
-		JCuda.setExceptionsEnabled(true);
-		JCudnn.setExceptionsEnabled(true);
-		JCublas2.setExceptionsEnabled(true);
-		JCusparse.setExceptionsEnabled(true);
-		JCudaDriver.setExceptionsEnabled(true);
-		cuInit(0); // Initialize the driver
-
-		int deviceCountArray[] = { 0 };
-		cuDeviceGetCount(deviceCountArray);				// Obtain the number of devices
-		deviceCount = deviceCountArray[0];
-		deviceProperties = new cudaDeviceProp[deviceCount];
-
-		LOG.info("Total number of GPUs on the machine: " + deviceCount);
-		int maxBlocks = getMaxBlocks();
-		int maxThreadsPerBlock = getMaxThreadsPerBlock();
-		long sharedMemPerBlock = getMaxSharedMemory();
-		int[] device = {-1};
-		cudaGetDevice(device);
-		LOG.info("Active CUDA device number : " + device[0]);
-		LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
-
-		GPUStatistics.cudaInitTime = System.nanoTime() - start;
-	}
-
-	@Override
-	public long getAvailableMemory() {
-		if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
-			long free[] = {0};
-			long total[] = {0};
-			if (cudaMemGetInfo(free, total) == cudaSuccess) {
-				//long totalNumBytes = total[0];
-				deviceMemBytes.set(free[0]);
-			} else {
-				throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-			}
-		}
-		return (long) (deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR);
-	}
-
-	@Override
-	public void ensureComputeCapability() throws DMLRuntimeException {
-		int[] devices =  {-1};
-		cudaGetDeviceCount(devices);
-		if (devices[0] == -1){
-			throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
-		}
-		boolean isComputeCapable = true;
-		for (int i=0; i<devices[0]; i++) {
-			cudaDeviceProp properties = getGPUProperties(i);
-			int major = properties.major;
-			int minor = properties.minor;
-			if (major < MAJOR_REQUIRED) {
-				isComputeCapable = false;
-			} else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
-				isComputeCapable = false;
-			}
-		}
-		if (!isComputeCapable) {
-			throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
-		}
-	}
-
-	/**
-	 * Gets the device properties for the active GPU (set with cudaSetDevice())
-	 * @return the device properties
-	 */
-	public static cudaDeviceProp getGPUProperties() {
-		int[] device = {-1};
-		cudaGetDevice(device);	// Get currently active device
-		return getGPUProperties(device[0]);
-	}
-
-	/**
-	 * Gets the device properties
-	 * @param device the device number (on a machine with more than 1 GPU)
-	 * @return the device properties
-	 */
-	public static cudaDeviceProp getGPUProperties(int device){
-		if (deviceProperties[device] == null) {
-			cudaDeviceProp properties = new cudaDeviceProp();
-			cudaGetDeviceProperties(properties, device);
-			deviceProperties[device] = properties;
-		}
-		return deviceProperties[device];
-	}
-
-
-	/**
-	 * Gets the maximum number of threads per block for "active" GPU
-	 * @return the maximum number of threads per block
-	 */
-	public static int getMaxThreadsPerBlock() {
-		cudaDeviceProp deviceProps = getGPUProperties();
-		return deviceProps.maxThreadsPerBlock;
-	}
-
-	/**
-	 * Gets the maximum number of blocks supported by the active cuda device
-	 * @return the maximum number of blocks supported
-	 */
-	public static int getMaxBlocks() {
-		cudaDeviceProp deviceProp = getGPUProperties();
-		return deviceProp.maxGridSize[0];
-	}
-
-	/**
-	 * Gets the shared memory per block supported by the active cuda device
-	 * @return the shared memory per block
-	 */
-	public static long getMaxSharedMemory() {
-		cudaDeviceProp deviceProp = getGPUProperties();
-		return deviceProp.sharedMemPerBlock;
-	}
-
-	/**
-	 * Gets the warp size supported by the active cuda device
-	 * @return the warp size
-	 */
-	public static int getWarpSize() {
-		cudaDeviceProp deviceProp = getGPUProperties();
-		return deviceProp.warpSize;
-	}
-
-	/**
-	 * Gets the available memory and then adds value to it
-	 * @param v the value to add
-	 * @return the current available memory before adding value to it
-	 */
-	public long getAndAddAvailableMemory(long v){
-		return deviceMemBytes.getAndAdd(v);
-	}
-
-	public JCudaContext() throws DMLRuntimeException {
-		if(isGPUContextCreated) {
-			// Wait until it is deleted. This case happens during multi-threaded testing.
-			// This also allows for multi-threaded execute calls
-			long startTime = System.currentTimeMillis();
-			do {
-				try {
-					Thread.sleep(100);
-				} catch (InterruptedException e) {}
-			} while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000);
-			synchronized(isGPUContextCreated) {
-				if(GPUContext.currContext != null) {
-					throw new RuntimeException("Cannot create multiple JCudaContext. Waited for 10 min to close previous GPUContext");
-				}
-			}
-		}
-		synchronized (isGPUContextCreated){
-			GPUContext.currContext = this;
-		}
-
-		long free [] = { 0 };
-		long total [] = { 0 };
-		long totalNumBytes = 0;
-		if(cudaMemGetInfo(free, total) == cudaSuccess) {
-			totalNumBytes = total[0];
-			deviceMemBytes.set(free[0]);
-		}
-		else {
-			throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-		}
-		LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
-		LOG.info("Available GPU memory: " + (deviceMemBytes.get()*(1e-6)) + " MB");
-
-		long start = System.nanoTime();
-		LibMatrixCUDA.cudnnHandle = new cudnnHandle();
-		cudnnCreate(LibMatrixCUDA.cudnnHandle);
-		LibMatrixCUDA.cublasHandle = new cublasHandle();
-		cublasCreate(LibMatrixCUDA.cublasHandle);
-		// For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
-		// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
-		// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
-		LibMatrixCUDA.cusparseHandle = new cusparseHandle();
-		cusparseCreate(LibMatrixCUDA.cusparseHandle);
-		try {
-			LibMatrixCUDA.kernels = new JCudaKernels();
-		} catch (DMLRuntimeException e) {
-			System.err.println("ERROR - Unable to initialize JCudaKernels. System in an inconsistent state");
-			LibMatrixCUDA.kernels = null;
-		}
-		GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
-	}
-
-	@Override
-	public void destroy() throws DMLRuntimeException {
-		if(currContext != null) {
-			synchronized(isGPUContextCreated) {
-				cudnnDestroy(LibMatrixCUDA.cudnnHandle);
-				cublasDestroy(LibMatrixCUDA.cublasHandle);
-				cusparseDestroy(LibMatrixCUDA.cusparseHandle);
-				currContext = null;
-				isGPUContextCreated = false;
-			}
-		}
-		else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) {
-			throw new DMLRuntimeException("Error while destroying the GPUContext");
-		}
-	}
-
-}