You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/04/21 23:23:19 UTC
[4/5] incubator-systemml git commit: Refactored GPU{Contex,
Object} to make it friendlier for parfor
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index d2309b0..708f291 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,65 +18,584 @@
*/
package org.apache.sysml.runtime.instructions.gpu.context;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.LRUCacheMap;
-//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
-public abstract class GPUContext {
+/**
+ * Represents a context per GPU accessible through the same JVM
+ * Each context holds cublas, cusparse, cudnn... handles which are separate for each GPU
+ */
+public class GPUContext {
- protected static GPUContext currContext;
- public static volatile Boolean isGPUContextCreated = false;
+ protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName());
- protected GPUContext() {}
+ /** Eviction policies for {@link GPUContext#evict(long)} */
+ public enum EvictionPolicy {
+ LRU, LFU, MIN_EVICT
+ }
- /**
- * Gets device memory available for SystemML operations
- *
- * @return available memory
- */
- public abstract long getAvailableMemory();
+ /** currently employed eviction policy */
+ public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+
+ /** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */
+ private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>();
+
+ /** To record size of allocated blocks */
+ private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+
+ /** active device assigned to this GPUContext instance */
+ private final int deviceNum;
+
+ /** list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
+ * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
+ * If a {@link GPUObject} has more than one lock on it, it cannot be freed
+ * If it has zero locks on it, it can be freed, but it is preferrable to keep it around
+ * so that an extraneous host to dev transfer can be avoided */
+ private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+
+ /** cudnnHandle specific to the active GPU for this GPUContext */
+ private cudnnHandle cudnnHandle;
+
+ /** cublasHandle specific to the active GPU for this GPUContext */
+ private cublasHandle cublasHandle;
+
+ /** cusparseHandle specific to the active GPU for this GPUContext */
+ private cusparseHandle cusparseHandle;
+
+ /** to launch custom CUDA kernel, specific to the active GPU for this GPUContext */
+ private JCudaKernels kernels;
/**
- * Ensures that all the CUDA cards on the current system are
- * of the minimum required compute capability.
- * (The minimum required compute capability is hard coded in {@link JCudaContext}.
- *
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public abstract void ensureComputeCapability() throws DMLRuntimeException;
-
- /**
- * Singleton Factory method for creation of {@link GPUContext}
- * @return GPU context
- * @throws DMLRuntimeException if DMLRuntimeException occurs
+ * The minimum CUDA Compute capability needed for SystemML.
+ * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
+ * If SystemML needs to run on an older card, this logic can be revisited.
*/
- public static GPUContext getGPUContext() throws DMLRuntimeException {
- if(currContext == null && DMLScript.USE_ACCELERATOR) {
- synchronized(isGPUContextCreated) {
- currContext = new JCudaContext();
- currContext.ensureComputeCapability();
- OptimizerUtils.GPU_MEMORY_BUDGET = currContext.getAvailableMemory();
- isGPUContextCreated = true;
- }
- }
- return currContext;
- }
-
- public static GPUObject createGPUObject(MatrixObject mo) {
- if(DMLScript.USE_ACCELERATOR) {
- synchronized(isGPUContextCreated) {
- if(currContext == null)
- throw new RuntimeException("GPUContext is not created");
- if(currContext instanceof JCudaContext)
- return new JCudaObject(mo);
- }
- }
- throw new RuntimeException("Cannot create createGPUObject when USE_ACCELERATOR is off");
- }
- public abstract void destroy() throws DMLRuntimeException;
-
-
+ final int MAJOR_REQUIRED = 3;
+ final int MINOR_REQUIRED = 0;
+
+ // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
+ public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+
+ protected GPUContext(int deviceNum) throws DMLRuntimeException {
+ this.deviceNum = deviceNum;
+ cudaSetDevice(deviceNum);
+
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+ long free[] = {0};
+ long total[] = {0};
+ cudaMemGetInfo(free, total);
+
+ long start = System.nanoTime();
+ cudnnHandle = new cudnnHandle();
+ cudnnCreate(cudnnHandle);
+ cublasHandle = new cublasHandle();
+ cublasCreate(cublasHandle);
+ // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
+ // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+ // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+ cusparseHandle = new cusparseHandle();
+ cusparseCreate(cusparseHandle);
+ kernels = new JCudaKernels(deviceNum);
+
+ GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+ LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this);
+
+ }
+
+ public int getDeviceNum() {
+ return deviceNum;
+ }
+
+ /**
+ * Sets the device for the calling thread.
+ * This method must be called after {@link GPUContextPool#getFromPool()}
+ * is called.
+ * If in a multi-threaded env like parfor, this method must be called when in the
+ * appropriate thread
+ */
+ public void initializeThread() {
+ cudaSetDevice(deviceNum);
+ }
+
+ @SuppressWarnings("unused")
+ public static int cudaGetDevice() {
+ int[] device = new int[1];
+ JCuda.cudaGetDevice(device);
+ return device[0];
+ }
+
+ /**
+ * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+ *
+ * @param size size of data (in bytes) to allocate
+ * @return jcuda pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(long size) throws DMLRuntimeException {
+ return allocate(null, size, 1);
+ }
+
+ /**
+ * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+ *
+ * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+ * @param size size of data (in bytes) to allocate
+ * @return jcuda pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
+ return allocate(instructionName, size, 1);
+ }
+
+ /**
+ * Allocates temporary space on the device.
+ * Does not update bookkeeping.
+ * The caller is responsible for freeing up after usage.
+ *
+ * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+ * @param size Size of data (in bytes) to allocate
+ * @param statsCount amount to increment the cudaAllocCount by
+ * @return jcuda Pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
+ long t0 = 0, t1 = 0, end = 0;
+ Pointer A;
+ if (freeCUDASpaceMap.containsKey(size)) {
+ LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+ A = freeList.pop();
+ if (freeList.isEmpty())
+ freeCUDASpaceMap.remove(size);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+ } else {
+ LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ ensureFreeSpace(instructionName, size);
+ A = new Pointer();
+ cudaMalloc(A, size);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
+ }
+ // Set all elements to 0 since newly allocated space will contain garbage
+ if (DMLScript.STATISTICS) t1 = System.nanoTime();
+ LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
+ cudaMemset(A, 0, size);
+ if (DMLScript.STATISTICS) end = System.nanoTime();
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+ cudaBlockSizeMap.put(A, size);
+ return A;
+
+ }
+
+ /**
+ * Does lazy cudaFree calls
+ *
+ * @param toFree {@link Pointer} instance to be freed
+ */
+ public void cudaFreeHelper(final Pointer toFree) {
+ cudaFreeHelper(null, toFree, false);
+ }
+
+ /**
+ * does lazy/eager cudaFree calls
+ *
+ * @param toFree {@link Pointer} instance to be freed
+ * @param eager true if to be done eagerly
+ */
+ public void cudaFreeHelper(final Pointer toFree, boolean eager) {
+ cudaFreeHelper(null, toFree, eager);
+ }
+
+ /**
+ * Does lazy cudaFree calls
+ *
+ * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+ * @param toFree {@link Pointer} instance to be freed
+ */
+ public void cudaFreeHelper(String instructionName, final Pointer toFree) {
+ cudaFreeHelper(instructionName, toFree, false);
+ }
+
+ /**
+ * Does cudaFree calls, lazily
+ *
+ * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+ * @param toFree {@link Pointer} instance to be freed
+ * @param eager true if to be done eagerly
+ */
+ public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
+ long t0 = 0;
+ assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
+ long size = cudaBlockSizeMap.get(toFree);
+ if (eager) {
+ LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this);
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ cudaFree(toFree);
+ cudaBlockSizeMap.remove(toFree);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
+ } else {
+ LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
+ LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+ if (freeList == null) {
+ freeList = new LinkedList<Pointer>();
+ freeCUDASpaceMap.put(size, freeList);
+ }
+ freeList.add(toFree);
+ }
+ }
+
+ /**
+ * Thin wrapper over {@link GPUContext#evict(long)}
+ *
+ * @param size size to check
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ void ensureFreeSpace(long size) throws DMLRuntimeException {
+ ensureFreeSpace(null, size);
+ }
+
+ /**
+ * Thin wrapper over {@link GPUContext#evict(long)}
+ *
+ * @param instructionName instructionName name of the instruction for which performance measurements are made
+ * @param size size to check
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
+ if (size >= getAvailableMemory()) {
+ evict(instructionName, size);
+ }
+ }
+
+ /**
+ * Convenience wrapper over {@link GPUContext#evict(String, long)}
+ *
+ * @param GPUSize Desired size to be freed up on the GPU
+ * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
+ */
+ protected void evict(final long GPUSize) throws DMLRuntimeException {
+ evict(null, GPUSize);
+ }
+
+ /**
+ * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up
+ * or it fails.
+ * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix
+ * blocks with zero locks on them is freed up.
+ * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
+ * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up
+ * blocks on which there are zero locks until the required size has been freed up.
+ * // TODO: update it with hybrid policy
+ *
+ * @param instructionName name of the instruction for which performance measurements are made
+ * @param neededSize desired size to be freed up on the GPU
+ * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
+ */
+ protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
+ LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
+ GPUStatistics.cudaEvictionCount.addAndGet(1);
+ // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
+ // to free up space
+ LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
+ while (lruCacheMap.size() > 0) {
+ if (neededSize <= getAvailableMemory())
+ break;
+ Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
+ LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
+ Long size = toFreeListPair.getKey();
+ Pointer toFree = toFreeList.pop();
+ if (toFreeList.isEmpty())
+ lruCacheMap.remove(size);
+ cudaFreeHelper(instructionName, toFree, true);
+ }
+
+ if (neededSize <= getAvailableMemory())
+ return;
+
+ if (allocatedGPUObjects.size() == 0) {
+ throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+ }
+
+ Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
+ @Override
+ public int compare(GPUObject p1, GPUObject p2) {
+ long p1Val = p1.readLocks.get();
+ long p2Val = p2.readLocks.get();
+
+ if (p1Val > 0 && p2Val > 0) {
+ // Both are locked, so don't sort
+ return 0;
+ } else if (p1Val > 0 || p2Val > 0) {
+ // Put the unlocked one to RHS
+ return Long.compare(p2Val, p1Val);
+ } else {
+ // Both are unlocked
+
+ if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
+ long p1Size = 0;
+ long p2Size = 0;
+ try {
+ p1Size = p1.getSizeOnDevice() - neededSize;
+ p2Size = p2.getSizeOnDevice() - neededSize;
+ } catch (DMLRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+
+ if (p1Size >= 0 && p2Size >= 0) {
+ return Long.compare(p2Size, p1Size);
+ } else {
+ return Long.compare(p1Size, p2Size);
+ }
+ } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
+ return Long.compare(p2.timestamp.get(), p1.timestamp.get());
+ } else {
+ throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
+ }
+ }
+ }
+ });
+
+ while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
+ GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
+ if (toBeRemoved.readLocks.get() > 0) {
+ throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+ }
+ if (toBeRemoved.dirty) {
+ toBeRemoved.copyFromDeviceToHost();
+ }
+
+ toBeRemoved.clearData(true);
+ }
+ }
+
+ /**
+ * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block
+ *
+ * @param o the block
+ * @return true if present, false otherwise
+ */
+ public boolean isBlockRecorded(GPUObject o) {
+ return allocatedGPUObjects.contains(o);
+ }
+
+ /**
+ * @param o {@link GPUObject} instance to record
+ * @see GPUContext#allocatedGPUObjects
+ * Records the usage of a matrix block
+ */
+ public void recordBlockUsage(GPUObject o) {
+ allocatedGPUObjects.add(o);
+ }
+
+ /**
+ * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects
+ * @see GPUContext#allocatedGPUObjects
+ * Records that a block is not used anymore
+ */
+ public void removeRecordedUsage(GPUObject o) {
+ allocatedGPUObjects.remove(o);
+ }
+
+ /**
+ * Gets the available memory on GPU that SystemML can use
+ *
+ * @return the available memory in bytes
+ */
+ public long getAvailableMemory() {
+ long free[] = {0};
+ long total[] = {0};
+ cudaMemGetInfo(free, total);
+ return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+ }
+
+ /**
+ * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed
+ *
+ * @throws DMLRuntimeException if the compute capability is less than what is required
+ */
+ public void ensureComputeCapability() throws DMLRuntimeException {
+ int[] devices = {-1};
+ cudaGetDeviceCount(devices);
+ if (devices[0] == -1) {
+ throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
+ }
+ boolean isComputeCapable = true;
+ for (int i = 0; i < devices[0]; i++) {
+ cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
+ int major = properties.major;
+ int minor = properties.minor;
+ if (major < MAJOR_REQUIRED) {
+ isComputeCapable = false;
+ } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
+ isComputeCapable = false;
+ }
+ }
+ if (!isComputeCapable) {
+ throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
+ }
+ }
+
+ public GPUObject createGPUObject(MatrixObject mo) {
+ return new GPUObject(this, mo);
+ }
+
+ /**
+ * Gets the device properties for the active GPU (set with cudaSetDevice())
+ *
+ * @return the device properties
+ */
+ public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
+ return GPUContextPool.getGPUProperties(deviceNum);
+ }
+
+ /**
+ * Gets the maximum number of threads per block for "active" GPU
+ *
+ * @return the maximum number of threads per block
+ */
+ public int getMaxThreadsPerBlock() throws DMLRuntimeException {
+ cudaDeviceProp deviceProps = getGPUProperties();
+ return deviceProps.maxThreadsPerBlock;
+ }
+
+ /**
+ * Gets the maximum number of blocks supported by the active cuda device
+ *
+ * @return the maximum number of blocks supported
+ */
+ public int getMaxBlocks() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.maxGridSize[0];
+ }
+
+ /**
+ * Gets the shared memory per block supported by the active cuda device
+ *
+ * @return the shared memory per block
+ */
+ public long getMaxSharedMemory() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.sharedMemPerBlock;
+ }
+
+ /**
+ * Gets the warp size supported by the active cuda device
+ *
+ * @return the warp size
+ */
+ public int getWarpSize() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.warpSize;
+ }
+
+ public cudnnHandle getCudnnHandle() {
+ return cudnnHandle;
+ }
+
+ public cublasHandle getCublasHandle() {
+ return cublasHandle;
+ }
+
+ public cusparseHandle getCusparseHandle() {
+ return cusparseHandle;
+ }
+
+ public JCudaKernels getKernels() {
+ return kernels;
+ }
+
+ /**
+ * Destroys this GPUContext object
+ * This method MUST BE called so that the GPU is available to be used again
+ *
+ * @throws DMLRuntimeException if error
+ */
+ public void destroy() throws DMLRuntimeException {
+ LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+ clearMemory();
+ cudnnDestroy(cudnnHandle);
+ cublasDestroy(cublasHandle);
+ cusparseDestroy(cusparseHandle);
+ cudnnHandle = null;
+ cublasHandle = null;
+ cusparseHandle = null;
+
+ }
+
+ /**
+ * Clears all memory used by this {@link GPUContext}
+ * Be careful to ensure that no memory is currently being used before invoking this
+ * @throws DMLRuntimeException
+ */
+ public void clearMemory() throws DMLRuntimeException {
+ while (allocatedGPUObjects.isEmpty()) {
+ GPUObject o = allocatedGPUObjects.get(0);
+ o.clearData();
+ }
+ for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+ for (Pointer p : l) {
+ cudaFreeHelper(p, true);
+ }
+ }
+ cudaBlockSizeMap.clear();
+ freeCUDASpaceMap.clear();
+ allocatedGPUObjects.clear();
+ }
+
+ @Override
+ public String toString() {
+ return "GPUContext{" +
+ "deviceNum=" + deviceNum +
+ '}';
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
new file mode 100644
index 0000000..6452651
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
+
+import java.util.LinkedList;
+import java.util.Queue;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.driver.JCudaDriver;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.JCudnn;
+import jcuda.jcusparse.JCusparse;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
+
+public class GPUContextPool {
+
+ protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName());
+
+ /** Maximum number of gpus to use, -1 for all */
+ public static int PER_PROCESS_MAX_GPUS = -1;
+
+ /** Whether cuda has been initialized */
+ static boolean initialized = false;
+
+ /** The total number of cuda devices on this machine */
+ static int deviceCount = -1;
+
+ /** Stores the cached deviceProperties */
+ static cudaDeviceProp[] deviceProperties;
+
+ /** Set of free GPUContexts */
+ static Queue<GPUContext> freePool = new LinkedList<>();
+
+ /**
+ * Static initialization of the number of devices
+ * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error
+ * Initializes the CUDA driver
+ * All these need be done once, and not per GPU
+ */
+ public synchronized static void initializeGPU() throws DMLRuntimeException {
+ GPUContext.LOG.info("Initializing CUDA");
+ long start = System.nanoTime();
+ JCuda.setExceptionsEnabled(true);
+ JCudnn.setExceptionsEnabled(true);
+ JCublas2.setExceptionsEnabled(true);
+ JCusparse.setExceptionsEnabled(true);
+ JCudaDriver.setExceptionsEnabled(true);
+ cuInit(0); // Initialize the driver
+
+ int deviceCountArray[] = {0};
+ cuDeviceGetCount(deviceCountArray); // Obtain the number of devices
+ deviceCount = deviceCountArray[0];
+ deviceProperties = new cudaDeviceProp[deviceCount];
+
+ if (PER_PROCESS_MAX_GPUS > 0)
+ deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
+
+ // Initialize the list of devices
+ for (int i = 0; i < deviceCount; i++) {
+ cudaDeviceProp properties = new cudaDeviceProp();
+ cudaGetDeviceProperties(properties, i);
+ deviceProperties[i] = properties;
+ }
+
+ // Initialize the pool of GPUContexts
+ for (int i=0; i<deviceCount; i++){
+ GPUContext gCtx = new GPUContext(i);
+ freePool.add(gCtx);
+ }
+
+ GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
+ //int[] device = {-1};
+ //cudaGetDevice(device);
+ //cudaDeviceProp prop = getGPUProperties(device[0]);
+ //int maxBlocks = prop.maxGridSize[0];
+ //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+ //long sharedMemPerBlock = prop.sharedMemPerBlock;
+ //LOG.debug("Active CUDA device number : " + device[0]);
+ //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+ initialized = true;
+ GPUStatistics.cudaInitTime = System.nanoTime() - start;
+ }
+
+ /**
+ * Gets an initialized GPUContext from a pool of GPUContexts, each linked to a GPU
+ * @return null if not more GPUContexts in pool, a valid GPUContext otherwise
+ */
+ public static synchronized GPUContext getFromPool() throws DMLRuntimeException {
+ if (!initialized) initializeGPU();
+ GPUContext gCtx = freePool.poll();
+ LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - FreePool[" + freePool.size() + "]");
+ return gCtx;
+ }
+
+ /**
+ * Get the number of free GPUContexts
+ * @return number of free GPUContexts
+ */
+ public static synchronized int getAvailableCount() {
+ return freePool.size();
+ }
+
+ /**
+ * Gets the device properties
+ * @param device the device number (on a machine with more than 1 GPU)
+ * @return the device properties
+ * @throws DMLRuntimeException if there is problem initializing the GPUContexts
+ */
+ static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException {
+ // do once - initialization of GPU
+ if (!initialized) initializeGPU();
+ return deviceProperties[device];
+ }
+
+ public static int getDeviceCount() throws DMLRuntimeException {
+ if (!initialized) initializeGPU();
+ return deviceCount;
+ }
+
+ /**
+ * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s
+ * @param gCtx the GPUContext instance to return. If null, nothing happens
+ * @throws DMLRuntimeException if error
+ */
+ public static synchronized void returnToPool(GPUContext gCtx) throws DMLRuntimeException {
+ if (gCtx == null)
+ return;
+ freePool.add(gCtx);
+ LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes - FreePool[" + freePool.size() + "]");
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index c116475..3a1fafa 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -18,206 +18,799 @@
*/
package org.apache.sysml.runtime.instructions.gpu.context;
-import jcuda.Pointer;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
+import static jcuda.jcusparse.JCusparse.cusparseDnnz;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
+import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.LRUCacheMap;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
+import jcuda.Pointer;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseDirection;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+/**
+ * Handle to a matrix block on the GPU
+ */
+public class GPUObject {
+
+ private static final Log LOG = LogFactory.getLog(GPUObject.class.getName());
+
+ /** GPUContext that owns this GPUObject */
+ private final GPUContext gpuContext;
+
+ /** Pointer to the underlying dense matrix block on GPU */
+ private Pointer jcudaDenseMatrixPtr = null;
+
+ /** Pointer to the underlying sparse matrix block on GPU */
+ private CSRPointer jcudaSparseMatrixPtr = null;
-//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
-public abstract class GPUObject
-{
- public enum EvictionPolicy {
- LRU, LFU, MIN_EVICT
- }
- public static final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
- protected boolean isDeviceCopyModified = false;
- protected AtomicInteger numLocks = new AtomicInteger(0);
+ /** An optional tensor descriptor (and shape) that can be set by a tensor instruction such as convolution,
+ * maxpooling and exploited by a subsequent non-tensor instruction such as relu
+ */
+ private cudnnTensorDescriptor tensorDescriptor = null;
+
+ /** the shape of this tensor, if in fact this is a tensor */
+ private int [] tensorShape = null;
+
+ /** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */
+ protected boolean dirty = false;
+
+ /** number of read locks on this object */
+ protected AtomicInteger readLocks = new AtomicInteger(0);
+
+ /** Timestamp, needed by {@link GPUContext#evict(long)} */
AtomicLong timestamp = new AtomicLong(0);
-
- protected boolean isInSparseFormat = false;
+
+ /** Whether this block is in sparse format */
+ protected boolean isSparse = false;
+
+ /** Enclosing {@link MatrixObject} instance */
protected MatrixObject mat = null;
-
- protected GPUObject(MatrixObject mat2) {
- this.mat = mat2;
+
+ private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+ return getGPUContext().allocate(instName, size);
+ }
+
+ private Pointer allocate(long size) throws DMLRuntimeException {
+ return getGPUContext().allocate(size);
+ }
+
+ private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(toFree);
+ }
+
+ private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(toFree, eager);
+ }
+
+ private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(instName, toFree, eager);
}
-
- public boolean isInSparseFormat() {
- return isInSparseFormat;
+
+ private GPUContext getGPUContext() throws DMLRuntimeException {
+ return gpuContext;
}
-
- public abstract boolean isAllocated();
/**
- * Signal intent that a matrix block will be read (as input) on the GPU
- * @return true if a host memory to device memory transfer happened
- * @throws DMLRuntimeException ?
+ * Transposes a dense matrix on the GPU by calling the cublasDgeam operation
+ * @param gCtx a valid {@link GPUContext}
+ * @param densePtr Pointer to dense matrix on the GPU
+ * @param m rows in ouput matrix
+ * @param n columns in output matrix
+ * @param lda rows in input matrix
+ * @param ldc columns in output matrix
+ * @return transposed matrix
+ * @throws DMLRuntimeException if operation failed
*/
- public abstract boolean acquireDeviceRead() throws DMLRuntimeException;
+ public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
+ LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
+ Pointer alpha = Pointer.to(new double[]{1.0});
+ Pointer beta = Pointer.to(new double[]{0.0});
+ Pointer A = densePtr;
+ Pointer C = gCtx.allocate(((long)m)*getDoubleSizeOf(n));
+
+ // Transpose the matrix to get a dense matrix
+ JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc);
+ return C;
+ }
+
/**
- * To signal intent that a matrix block will be written to on the GPU
- * @return true if memory was allocated on the GPU as a result of this call
+ * Convenience method to convert a CSR matrix to a dense matrix on the GPU
+ * Since the allocated matrix is temporary, bookkeeping is not updated.
+ * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT
+ * Caller is responsible for deallocating memory on GPU.
+ * @param gCtx a valid {@link GPUContext}
+ * @param cusparseHandle handle to cusparse library
+ * @param densePtr [in] dense matrix pointer on the GPU in row major
+ * @param rows number of rows
+ * @param cols number of columns
+ * @return CSR (compressed sparse row) pointer
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public abstract boolean acquireDeviceModifyDense() throws DMLRuntimeException;
+ public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle, Pointer densePtr, int rows, int cols) throws DMLRuntimeException {
+ cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+ Pointer nnzPerRowPtr = null;
+ Pointer nnzTotalDevHostPtr = null;
+
+ gCtx.ensureFreeSpace(getIntSizeOf(rows + 1));
+ nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
+ nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
+
+ // Output is in dense vector format, convert it to CSR
+ cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
+ //cudaDeviceSynchronize();
+ int[] nnzC = {-1};
+
+ long t2=0;
+ if (DMLScript.STATISTICS) t2 = System.nanoTime();
+ cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+
+ if (nnzC[0] == -1){
+ throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU");
+ }
+
+ LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx);
+
+ CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
+ cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
+ //cudaDeviceSynchronize();
+
+ gCtx.cudaFreeHelper(nnzPerRowPtr);
+ gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+
+ return C;
+ }
+
+ /**
+ * Gets the double array from GPU memory onto host memory and returns string.
+ * @param A Pointer to memory on device (GPU), assumed to point to a double array
+ * @param rows rows in matrix A
+ * @param cols columns in matrix A
+ * @return the debug string
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ @SuppressWarnings("unused")
+ public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
+ StringBuffer sb = new StringBuffer();
+ int len = toIntExact(rows * cols);
+ double[] tmp = new double[len];
+ cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
+ int k = 0;
+ for (int i=0; i<rows; i++){
+ for (int j=0; j<cols; j++){
+ sb.append(tmp[k]).append(' ');
+ k++;
+ }
+ sb.append('\n');
+ }
+ return sb.toString();
+ }
+
/**
- * To signal intent that a sparse matrix block will be written to on the GPU
- * @return true if memory was allocated on the GPU as a result of this call
+ * Convenience method to directly examine the Sparse matrix on GPU
+ * @return CSR (compressed sparse row) pointer
+ */
+ public CSRPointer getSparseMatrixCudaPointer() {
+ return getJcudaSparseMatrixPtr();
+ }
+
+ /**
+ * Convenience method to directly set the sparse matrix on GPU
+ * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
+ * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
+ * @param sparseMatrixPtr CSR (compressed sparse row) pointer
+ */
+ public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) throws DMLRuntimeException {
+ this.jcudaSparseMatrixPtr = sparseMatrixPtr;
+ this.isSparse = true;
+ if(getJcudaDenseMatrixPtr() != null) {
+ cudaFreeHelper(getJcudaDenseMatrixPtr());
+ jcudaDenseMatrixPtr = null;
+ }
+ }
+
+ /**
+ * Convenience method to directly set the dense matrix pointer on GPU
+ * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
+ *
+ * @param densePtr dense pointer
+ */
+ public void setDenseMatrixCudaPointer(Pointer densePtr) throws DMLRuntimeException{
+ this.jcudaDenseMatrixPtr = densePtr;
+ this.isSparse = false;
+ if(getJcudaSparseMatrixPtr() != null) {
+ getJcudaSparseMatrixPtr().deallocate();
+ jcudaSparseMatrixPtr = null;
+ }
+ }
+
+ /**
+ * Converts this GPUObject from dense to sparse format.
+ *
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public abstract boolean acquireDeviceModifySparse() throws DMLRuntimeException;
-
+ public void denseToSparse() throws DMLRuntimeException {
+ LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext());
+ long t0=0;
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
+ if(cusparseHandle == null)
+ throw new DMLRuntimeException("Expected cusparse to be initialized");
+ int rows = toIntExact(mat.getNumRows());
+ int cols = toIntExact(mat.getNumColumns());
+
+ if(getJcudaDenseMatrixPtr() == null || !isAllocated())
+ throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
+
+ convertDensePtrFromRowMajorToColumnMajor();
+ setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getJcudaDenseMatrixPtr(), rows, cols));
+ // TODO: What if mat.getNnz() is -1 ?
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
+ }
+
/**
- * If memory on GPU has been allocated from elsewhere, this method
- * updates the internal bookkeeping
- * @param numBytes number of bytes
+ * Convenience method. Converts Row Major Dense Matrix --> Column Major Dense Matrix
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public abstract void setDeviceModify(long numBytes);
+ private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException {
+ LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
+ int m = toIntExact(mat.getNumRows());
+ int n = toIntExact(mat.getNumColumns());
+ int lda = n;
+ int ldc = m;
+ if(!isAllocated()) {
+ throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
+ }
+
+ Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+ cudaFreeHelper(getJcudaDenseMatrixPtr());
+ setDenseMatrixCudaPointer(tmp);
+ }
+
+ private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException {
+ LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
+
+ int n = toIntExact(mat.getNumRows());
+ int m = toIntExact(mat.getNumColumns());
+ int lda = n;
+ int ldc = m;
+ if(!isAllocated()) {
+ throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
+ }
+
+ Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+ cudaFreeHelper(getJcudaDenseMatrixPtr());
+ setDenseMatrixCudaPointer(tmp);
+ }
/**
- * Signal intent that a block needs to be read on the host
- * @return true if copied from device to host
- * @throws CacheException ?
+ * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
+ *
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public abstract boolean acquireHostRead() throws CacheException;
-
- public abstract void releaseInput() throws CacheException;
- public abstract void releaseOutput() throws CacheException;
-
- // package-level visibility as these methods are guarded by underlying GPUContext
-
- abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException;
- abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException;
- abstract void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException;
- abstract long getSizeOnDevice() throws DMLRuntimeException;
-
- abstract void copyFromHostToDevice() throws DMLRuntimeException;
-
+ public void sparseToDense() throws DMLRuntimeException {
+ sparseToDense(null);
+ }
+
/**
- * Copies a matrix block (dense or sparse) from GPU Memory to Host memory.
- * A {@link MatrixBlock} instance is allocated, data from the GPU is copied in,
- * the current one in Host memory is deallocated by calling MatrixObject's acquireHostModify(MatrixBlock) (??? does not exist)
- * and overwritten with the newly allocated instance.
- * TODO : re-examine this to avoid spurious allocations of memory for optimizations
+ * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
+ * Also records per instruction invokation of sparseToDense.
+ * @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics}
+ * @throws DMLRuntimeException ?
+ */
+ public void sparseToDense(String instructionName) throws DMLRuntimeException {
+ LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext());
+ long start=0, end=0;
+ if (DMLScript.STATISTICS) start = System.nanoTime();
+ if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+ throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
+
+ sparseToColumnMajorDense();
+ convertDensePtrFromColMajorToRowMajor();
+ if (DMLScript.STATISTICS) end = System.nanoTime();
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
+ }
+
+ /**
+ * More efficient method to convert sparse to dense but returns dense in column major format
+ *
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export()
+ public void sparseToColumnMajorDense() throws DMLRuntimeException {
+ LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext());
+ if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+ throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
+
+ cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
+ if(cusparseHandle == null)
+ throw new DMLRuntimeException("Expected cusparse to be initialized");
+ int rows = toIntExact(mat.getNumRows());
+ int cols = toIntExact(mat.getNumColumns());
+ setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
+ }
+
+ /**
+ * Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
+ * @param mat2 the matrix block that owns this {@link GPUObject}
+ */
+ GPUObject(GPUContext gCtx, MatrixObject mat2) {
+ gpuContext = gCtx;
+ this.mat = mat2;
+ }
+ public boolean isSparse() {
+ return isSparse;
+ }
/**
- * Convenience wrapper over {@link GPUObject#evict(String, long)}
- * @param GPUSize Desired size to be freed up on the GPU
- * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
+ * Returns a previously allocated tensor shape or null
+ * @return int array of four elements or null
*/
- protected static void evict(final long GPUSize) throws DMLRuntimeException {
- evict(null, GPUSize);
+ public int [] getTensorShape() {
+ return tensorShape;
}
/**
- * Cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
- * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up
- * blocks on which there are zero locks until the required size has been freed up.
- * // TODO: update it with hybrid policy
- * @param instructionName name of the instruction for which performance measurements are made
- * @param GPUSize Desired size to be freed up on the GPU
- * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
+ * Returns a previously allocated tensor descriptor or null
+ * @return cudnn tensor descriptor
*/
- protected static void evict(String instructionName, final long GPUSize) throws DMLRuntimeException {
- synchronized (JCudaContext.syncObj) {
-
- GPUStatistics.cudaEvictionCount.addAndGet(1);
- // Release the set of free blocks maintained in a JCudaObject.freeCUDASpaceMap
- // to free up space
- LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = JCudaObject.freeCUDASpaceMap;
- while (lruCacheMap.size() > 0) {
- if (GPUSize <= getAvailableMemory())
- break;
- Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
- LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
- Long size = toFreeListPair.getKey();
- Pointer toFree = toFreeList.pop();
- if (toFreeList.isEmpty())
- lruCacheMap.remove(size);
- JCudaObject.cudaFreeHelper(instructionName, toFree, true);
+ public cudnnTensorDescriptor getTensorDescriptor() {
+ return tensorDescriptor;
+ }
+
+ /**
+ * Returns a previously allocated or allocates and returns a tensor descriptor
+ * @param N number of images
+ * @param C number of channels
+ * @param H height
+ * @param W width
+ * @return cudnn tensor descriptor
+ */
+ public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
+ LOG.trace("GPU : allocateTensorDescriptor with [N="+N+",C="+C+",H="+H+",W="+W+"] on " + this);
+ if(tensorDescriptor == null) {
+ tensorDescriptor = new cudnnTensorDescriptor();
+ cudnnCreateTensorDescriptor(tensorDescriptor);
+ cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+ tensorShape = new int[4];
+ tensorShape[0] = N;
+ tensorShape[1] = C;
+ tensorShape[2] = H;
+ tensorShape[3] = W;
+ }
+ return tensorDescriptor;
+ }
+
+ private static long getDoubleSizeOf(long numElems) {
+ return numElems * ((long)jcuda.Sizeof.DOUBLE);
+ }
+
+ private static long getIntSizeOf(long numElems) {
+ return numElems * ((long)jcuda.Sizeof.INT);
+ }
+
+ public boolean isAllocated() {
+ boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null);
+ return eitherAllocated;
+ }
+
+ public boolean isInputAllocated() {
+ try {
+ boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null);
+ boolean isAllocatedOnThisGPUContext = getGPUContext().isBlockRecorded(this);
+ if (eitherAllocated && !isAllocatedOnThisGPUContext) {
+ LOG.warn("GPU : A block was allocated but was not on this GPUContext, GPUContext=" + getGPUContext());
}
+ return eitherAllocated && isAllocatedOnThisGPUContext;
+ } catch (DMLRuntimeException e){
+ LOG.info("GPU : System is in an inconsistent state");
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Allocates a sparse and empty {@link GPUObject}
+ * This is the result of operations that are both non zero matrices.
+ *
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public void allocateSparseAndEmpty() throws DMLRuntimeException{
+ LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
+ setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
+ addReadLock();
+ }
+
+ /**
+ * Allocates a dense matrix of size obtained from the attached matrix metadata
+ * and fills it up with a single value
+ *
+ * @param v value to fill up the dense matrix
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public void allocateAndFillDense(double v) throws DMLRuntimeException {
+ LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext());
+ long rows = mat.getNumRows();
+ long cols = mat.getNumColumns();
+ int numElems = toIntExact(rows * cols);
+ long size = getDoubleSizeOf(numElems);
+ setDenseMatrixCudaPointer(allocate(size));
+ addReadLock();
+ // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
+ getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
+ }
+
+ /**
+ * If this {@link GPUObject} is sparse and empty
+ * Being allocated is a prerequisite to being sparse and empty.
+ *
+ * @return true if sparse and empty
+ * @throws DMLRuntimeException if error
+ */
+ public boolean isSparseAndEmpty() throws DMLRuntimeException{
+ boolean isSparseAndAllocated = isAllocated()&& LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat);
+ boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0;
+ return isEmptyAndSparseAndAllocated;
+ }
+
+ public boolean acquireDeviceRead() throws DMLRuntimeException {
+ LOG.trace("GPU : acquireDeviceRead on " + this);
+ boolean transferred = false;
+ if(!isAllocated()) {
+ LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
+ copyFromHostToDevice();
+ transferred = true;
+ } else {
+ addReadLock();
+ }
+ if(!isAllocated())
+ throw new DMLRuntimeException("Expected device data to be allocated");
+ return transferred;
+ }
+
+ public boolean acquireDeviceModifyDense() throws DMLRuntimeException {
+ LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
+ boolean allocated = false;
+ if(!isAllocated()) {
+ mat.setDirty(true);
+ LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
+ // Dense block, size = numRows * numCols
+ allocateDenseMatrixOnDevice();
+ allocated = true;
+ getGPUContext().recordBlockUsage(this);
+ }
+ dirty = true;
+ if(!isAllocated())
+ throw new DMLRuntimeException("Expected device data to be allocated");
+ return allocated;
+ }
- if (GPUSize <= getAvailableMemory())
- return;
+ public boolean acquireDeviceModifySparse() throws DMLRuntimeException {
+ LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
+ boolean allocated = false;
+ isSparse = true;
+ if(!isAllocated()) {
+ LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
+ mat.setDirty(true);
+ allocateSparseMatrixOnDevice();
+ allocated = true;
+ getGPUContext().recordBlockUsage(this);
+
+ }
+ dirty = true;
+ if(!isAllocated())
+ throw new DMLRuntimeException("Expected device data to be allocated");
+ return allocated;
+ }
+
+ public void addReadLock() {
+ readLocks.addAndGet(1);
+ }
- if (JCudaContext.allocatedPointers.size() == 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+ /**
+ * if the data is allocated on the GPU and is dirty, it is copied back to the host memory
+ * @return true if a copy to host happened, false otherwise
+ * @throws CacheException
+ */
+ public boolean acquireHostRead() throws CacheException {
+ boolean copied = false;
+ try {
+ LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
+ if (isAllocated() && dirty) {
+ LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + getGPUContext());
+ copyFromDeviceToHost();
+ copied = true;
}
+ } catch (DMLRuntimeException e) {
+ throw new CacheException(e);
+ }
+ return copied;
+ }
+
+ /**
+ * Updates the locks depending on the eviction policy selected
+ * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+ */
+ private void updateReleaseLocks() throws DMLRuntimeException {
+ if (readLocks.addAndGet(-1) < 0) {
+ throw new CacheException("Redundant release of GPU object");
+ }
+ LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+ GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
+ switch (evictionPolicy){
+ case LRU : timestamp.set(System.nanoTime()); break;
+ case LFU : timestamp.addAndGet(1); break;
+ case MIN_EVICT : /* Do Nothing */ break;
+ default : throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name());
+ }
+ }
+
+ /**
+ * Releases input allocated on GPU
+ * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+ */
+ public void releaseInput() throws DMLRuntimeException {
+ updateReleaseLocks();
+ if(!isAllocated())
+ throw new CacheException("Attempting to release an input before allocating it");
+ }
+
+ /**
+ * releases output allocated on GPU
+ * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
+ */
+ public void releaseOutput() throws DMLRuntimeException {
+ updateReleaseLocks();
+ dirty = true;
+ if(!isAllocated())
+ throw new CacheException("Attempting to release an output before allocating it");
+ }
- synchronized (evictionLock) {
- Collections.sort(JCudaContext.allocatedPointers, new Comparator<GPUObject>() {
-
- @Override
- public int compare(GPUObject p1, GPUObject p2) {
- long p1Val = p1.numLocks.get();
- long p2Val = p2.numLocks.get();
-
- if (p1Val > 0 && p2Val > 0) {
- // Both are locked, so don't sort
- return 0;
- } else if (p1Val > 0 || p2Val > 0) {
- // Put the unlocked one to RHS
- return Long.compare(p2Val, p1Val);
- } else {
- // Both are unlocked
-
- if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
- long p1Size = 0;
- long p2Size = 0;
- try {
- p1Size = p1.getSizeOnDevice() - GPUSize;
- p2Size = p2.getSizeOnDevice() - GPUSize;
- } catch (DMLRuntimeException e) {
- throw new RuntimeException(e);
- }
-
- if (p1Size >= 0 && p2Size >= 0) {
- return Long.compare(p2Size, p1Size);
- } else {
- return Long.compare(p1Size, p2Size);
- }
- } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
- return Long.compare(p2.timestamp.get(), p1.timestamp.get());
- } else {
- throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
- }
- }
- }
- });
-
- while (GPUSize > getAvailableMemory() && JCudaContext.allocatedPointers.size() > 0) {
- GPUObject toBeRemoved = JCudaContext.allocatedPointers.get(JCudaContext.allocatedPointers.size() - 1);
- if (toBeRemoved.numLocks.get() > 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
- }
- if (toBeRemoved.isDeviceCopyModified) {
- toBeRemoved.copyFromDeviceToHost();
- }
-
- toBeRemoved.clearData(true);
+ void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
+ LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+ assert !isAllocated() : "Internal error - trying to allocated dense matrix to a GPUObject that is already allocated";
+ long rows = mat.getNumRows();
+ long cols = mat.getNumColumns();
+ assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
+ assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;";
+ long size = getDoubleSizeOf(rows * cols);
+ Pointer tmp = allocate(size);
+ setDenseMatrixCudaPointer(tmp);
+ addReadLock();
+ }
+
+ void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
+ LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+ assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a GPUObject that is already allocated";
+ long rows = mat.getNumRows();
+ long nnz = mat.getNnz();
+ assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
+ assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+ CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows);
+ setSparseMatrixCudaPointer(tmp);
+ addReadLock();
+ }
+
+ void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException {
+ LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext());
+ if(getJcudaDenseMatrixPtr() != null) {
+ cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
+ }
+ if (getJcudaSparseMatrixPtr() != null) {
+ getJcudaSparseMatrixPtr().deallocate(eager);
+ }
+ jcudaDenseMatrixPtr = null;
+ jcudaSparseMatrixPtr = null;
+ if(tensorDescriptor != null) {
+ cudnnDestroyTensorDescriptor(tensorDescriptor);
+ tensorDescriptor = null;
+ }
+ readLocks.set(0);
+ }
+
+ protected long getSizeOnDevice() throws DMLRuntimeException {
+ long GPUSize = 0;
+ long rlen = mat.getNumRows();
+ long clen = mat.getNumColumns();
+ long nnz = mat.getNnz();
+
+ if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+ GPUSize = CSRPointer.estimateSize(nnz, rlen);
+ }
+ else {
+ GPUSize = getDoubleSizeOf(rlen * clen);
+ }
+ return GPUSize;
+ }
+
+ void copyFromHostToDevice() throws DMLRuntimeException {
+ LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
+ long start=0;
+ if (DMLScript.STATISTICS) start = System.nanoTime();
+
+ MatrixBlock tmp = mat.acquireRead();
+ if(tmp.isInSparseFormat()) {
+
+ int rowPtr[] = null;
+ int colInd[] = null;
+ double[] values = null;
+
+ tmp.recomputeNonZeros();
+ long nnz = tmp.getNonZeros();
+ mat.getMatrixCharacteristics().setNonZeros(nnz);
+
+ SparseBlock block = tmp.getSparseBlock();
+ boolean copyToDevice = true;
+ if(block == null && tmp.getNonZeros() == 0) {
+// // Allocate empty block --> not necessary
+// // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
+// rowPtr = new int[0];
+// colInd = new int[0];
+// values = new double[0];
+ copyToDevice = false;
+ }
+ else if(block == null && tmp.getNonZeros() != 0) {
+ throw new DMLRuntimeException("Expected CP sparse block to be not null.");
+ }
+ else {
+ // CSR is the preferred format for cuSparse GEMM
+ // Converts MCSR and COO to CSR
+ SparseBlockCSR csrBlock = null;
+ long t0=0;
+ if (block instanceof SparseBlockCSR){
+ csrBlock = (SparseBlockCSR)block;
+ } else if (block instanceof SparseBlockCOO) {
+ // TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ SparseBlockCOO cooBlock = (SparseBlockCOO)block;
+ csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+ } else if (block instanceof SparseBlockMCSR) {
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
+ csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+ } else {
+ throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
}
+ rowPtr = csrBlock.rowPointers();
+ colInd = csrBlock.indexes();
+ values = csrBlock.values();
+ }
+ allocateSparseMatrixOnDevice();
+ getGPUContext().recordBlockUsage(this);
+
+ if(copyToDevice) {
+ CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
}
}
+ else {
+ double[] data = tmp.getDenseBlock();
+
+ if( data == null && tmp.getSparseBlock() != null )
+ throw new DMLRuntimeException("Incorrect sparsity calculation");
+ else if( data==null && tmp.getNonZeros() != 0 )
+ throw new DMLRuntimeException("MatrixBlock is not allocated");
+ else if( tmp.getNonZeros() == 0 )
+ data = new double[tmp.getNumRows()*tmp.getNumColumns()];
+
+ // Copy dense block
+ allocateDenseMatrixOnDevice();
+ getGPUContext().recordBlockUsage(this);
+
+ cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
+ }
+
+ mat.release();
+
+ if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1);
+ }
+
+ public static int toIntExact(long l) throws DMLRuntimeException {
+ if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Cannot be cast to int:" + l);
+ }
+ return (int) l;
}
+ protected void copyFromDeviceToHost() throws DMLRuntimeException {
+ LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
+ if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null){
+ throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
+ }
+
+ if(getJcudaDenseMatrixPtr() != null) {
+ long start=0;
+ if (DMLScript.STATISTICS) start = System.nanoTime();
+ MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
+ tmp.allocateDenseBlock();
+ double [] data = tmp.getDenseBlock();
+
+ cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);
+
+ tmp.recomputeNonZeros();
+ mat.acquireModify(tmp);
+ mat.release();
+
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+ }
+ else if (getJcudaSparseMatrixPtr() != null){
+ if(!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat))
+ throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");
+
+ if(this.isSparseAndEmpty()){
+ MatrixBlock tmp = new MatrixBlock(); // Empty Block
+ mat.acquireModify(tmp);
+ mat.release();
+ } else {
+ long start=0;
+ if (DMLScript.STATISTICS) start = System.nanoTime();
+
+ int rows = toIntExact(mat.getNumRows());
+ int cols = toIntExact(mat.getNumColumns());
+ int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+ int[] rowPtr = new int[rows + 1];
+ int[] colInd = new int[nnz];
+ double[] values = new double[nnz];
+ CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values);
+
+ SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
+ MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
+ mat.acquireModify(tmp);
+ mat.release();
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
+ }
+ }
+ else {
+ throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
+ }
+ dirty = false;
+ }
+
+
/**
* lazily clears the data associated with this {@link GPUObject} instance
* @throws CacheException ?
*/
- public void clearData() throws CacheException {
+ public void clearData() throws DMLRuntimeException {
clearData(false);
}
@@ -226,36 +819,38 @@ public abstract class GPUObject
* @param eager whether to be done synchronously or asynchronously
* @throws CacheException ?
*/
- public void clearData(boolean eager) throws CacheException {
- synchronized(evictionLock) {
- JCudaContext.allocatedPointers.remove(this);
- }
- try {
- deallocateMemoryOnDevice(eager);
- } catch (DMLRuntimeException e) {
- throw new CacheException(e);
- }
+ public void clearData(boolean eager) throws DMLRuntimeException {
+ getGPUContext().removeRecordedUsage(this);
+ deallocateMemoryOnDevice(eager);
+
+ }
+
+ /** Pointer to dense matrix */
+ public Pointer getJcudaDenseMatrixPtr() {
+ return jcudaDenseMatrixPtr;
+ }
+
+ /** Pointer to sparse matrix */
+ public CSRPointer getJcudaSparseMatrixPtr() {
+ return jcudaSparseMatrixPtr;
}
-
- static Boolean evictionLock = new Boolean(true);
-
- protected static long getAvailableMemory() {
- return GPUContext.currContext.getAvailableMemory();
- }
-
-// // Copying from device -> host occurs here
-// // Called by MatrixObject's exportData
-// public void exportData() throws CacheException {
-// boolean isDeviceCopyModified = mat.getGPUObject() != null && mat.getGPUObject().isDeviceCopyModified;
-// boolean isHostCopyUnavailable = mat.getMatrixBlock() == null ||
-// (mat.getMatrixBlock().getDenseBlock() == null && mat.getMatrixBlock().getSparseBlock() == null);
-//
-// if(mat.getGPUObject() != null && (isDeviceCopyModified || isHostCopyUnavailable)) {
-// try {
-// mat.getGPUObject().copyFromDeviceToHost();
-// } catch (DMLRuntimeException e) {
-// throw new CacheException(e);
-// }
-// }
-// }
+
+ /** Whether this block is dirty on the GPU */
+ public boolean isDirty() {
+ return dirty;
+ }
+
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("GPUObject{");
+ sb.append(", tensorShape=").append(Arrays.toString(tensorShape));
+ sb.append(", dirty=").append(dirty);
+ sb.append(", readLocks=").append(readLocks);
+ sb.append(", sparse? ").append(isSparse);
+ sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
+ sb.append('}');
+ return sb.toString();
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
deleted file mode 100644
index bb73f4b..0000000
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import jcuda.driver.JCudaDriver;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.JCudnn;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.conf.ConfigurationManager;
-import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.utils.GPUStatistics;
-
-import java.util.ArrayList;
-import java.util.concurrent.atomic.AtomicLong;
-
-import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.driver.JCudaDriver.cuInit;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.runtime.JCuda.*;
-import static jcuda.runtime.cudaError.cudaSuccess;
-
-
-public class JCudaContext extends GPUContext {
-
- /** Synchronization object to make sure no allocations happen when something is being evicted from memory */
- public static final Object syncObj = new Object();
- private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
-
- /** Global list of allocated {@link GPUObject} instances. This list must be accessed in a synchronized way */
- public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>();
-
- // The minimum CUDA Compute capability needed for SystemML.
- // After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
- // If SystemML needs to run on an older card, this logic can be revisited.
- final int MAJOR_REQUIRED = 3;
- final int MINOR_REQUIRED = 0;
-
- /** The total number of cuda devices on this machine */
- public static int deviceCount = -1;
-
- /** enable this to print debug information before code pertaining to the GPU is executed */
- public static boolean DEBUG = false;
-
- /** total bytes available on currently active cude device, please be careful with its bookkeeping */
- AtomicLong deviceMemBytes = new AtomicLong(0);
-
- /** Stores the cached deviceProperties */
- private static cudaDeviceProp[] deviceProperties;
-
- // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
- public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
- // Whether to invoke cudaMemGetInfo for available memory or rely on internal bookkeeping for memory info.
- public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
- static {
- long start = System.nanoTime();
- JCuda.setExceptionsEnabled(true);
- JCudnn.setExceptionsEnabled(true);
- JCublas2.setExceptionsEnabled(true);
- JCusparse.setExceptionsEnabled(true);
- JCudaDriver.setExceptionsEnabled(true);
- cuInit(0); // Initialize the driver
-
- int deviceCountArray[] = { 0 };
- cuDeviceGetCount(deviceCountArray); // Obtain the number of devices
- deviceCount = deviceCountArray[0];
- deviceProperties = new cudaDeviceProp[deviceCount];
-
- LOG.info("Total number of GPUs on the machine: " + deviceCount);
- int maxBlocks = getMaxBlocks();
- int maxThreadsPerBlock = getMaxThreadsPerBlock();
- long sharedMemPerBlock = getMaxSharedMemory();
- int[] device = {-1};
- cudaGetDevice(device);
- LOG.info("Active CUDA device number : " + device[0]);
- LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
-
- GPUStatistics.cudaInitTime = System.nanoTime() - start;
- }
-
- @Override
- public long getAvailableMemory() {
- if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
- long free[] = {0};
- long total[] = {0};
- if (cudaMemGetInfo(free, total) == cudaSuccess) {
- //long totalNumBytes = total[0];
- deviceMemBytes.set(free[0]);
- } else {
- throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
- }
- }
- return (long) (deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR);
- }
-
- @Override
- public void ensureComputeCapability() throws DMLRuntimeException {
- int[] devices = {-1};
- cudaGetDeviceCount(devices);
- if (devices[0] == -1){
- throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
- }
- boolean isComputeCapable = true;
- for (int i=0; i<devices[0]; i++) {
- cudaDeviceProp properties = getGPUProperties(i);
- int major = properties.major;
- int minor = properties.minor;
- if (major < MAJOR_REQUIRED) {
- isComputeCapable = false;
- } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
- isComputeCapable = false;
- }
- }
- if (!isComputeCapable) {
- throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
- }
- }
-
- /**
- * Gets the device properties for the active GPU (set with cudaSetDevice())
- * @return the device properties
- */
- public static cudaDeviceProp getGPUProperties() {
- int[] device = {-1};
- cudaGetDevice(device); // Get currently active device
- return getGPUProperties(device[0]);
- }
-
- /**
- * Gets the device properties
- * @param device the device number (on a machine with more than 1 GPU)
- * @return the device properties
- */
- public static cudaDeviceProp getGPUProperties(int device){
- if (deviceProperties[device] == null) {
- cudaDeviceProp properties = new cudaDeviceProp();
- cudaGetDeviceProperties(properties, device);
- deviceProperties[device] = properties;
- }
- return deviceProperties[device];
- }
-
-
- /**
- * Gets the maximum number of threads per block for "active" GPU
- * @return the maximum number of threads per block
- */
- public static int getMaxThreadsPerBlock() {
- cudaDeviceProp deviceProps = getGPUProperties();
- return deviceProps.maxThreadsPerBlock;
- }
-
- /**
- * Gets the maximum number of blocks supported by the active cuda device
- * @return the maximum number of blocks supported
- */
- public static int getMaxBlocks() {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.maxGridSize[0];
- }
-
- /**
- * Gets the shared memory per block supported by the active cuda device
- * @return the shared memory per block
- */
- public static long getMaxSharedMemory() {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.sharedMemPerBlock;
- }
-
- /**
- * Gets the warp size supported by the active cuda device
- * @return the warp size
- */
- public static int getWarpSize() {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.warpSize;
- }
-
- /**
- * Gets the available memory and then adds value to it
- * @param v the value to add
- * @return the current available memory before adding value to it
- */
- public long getAndAddAvailableMemory(long v){
- return deviceMemBytes.getAndAdd(v);
- }
-
- public JCudaContext() throws DMLRuntimeException {
- if(isGPUContextCreated) {
- // Wait until it is deleted. This case happens during multi-threaded testing.
- // This also allows for multi-threaded execute calls
- long startTime = System.currentTimeMillis();
- do {
- try {
- Thread.sleep(100);
- } catch (InterruptedException e) {}
- } while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000);
- synchronized(isGPUContextCreated) {
- if(GPUContext.currContext != null) {
- throw new RuntimeException("Cannot create multiple JCudaContext. Waited for 10 min to close previous GPUContext");
- }
- }
- }
- synchronized (isGPUContextCreated){
- GPUContext.currContext = this;
- }
-
- long free [] = { 0 };
- long total [] = { 0 };
- long totalNumBytes = 0;
- if(cudaMemGetInfo(free, total) == cudaSuccess) {
- totalNumBytes = total[0];
- deviceMemBytes.set(free[0]);
- }
- else {
- throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
- }
- LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
- LOG.info("Available GPU memory: " + (deviceMemBytes.get()*(1e-6)) + " MB");
-
- long start = System.nanoTime();
- LibMatrixCUDA.cudnnHandle = new cudnnHandle();
- cudnnCreate(LibMatrixCUDA.cudnnHandle);
- LibMatrixCUDA.cublasHandle = new cublasHandle();
- cublasCreate(LibMatrixCUDA.cublasHandle);
- // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
- // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
- // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
- LibMatrixCUDA.cusparseHandle = new cusparseHandle();
- cusparseCreate(LibMatrixCUDA.cusparseHandle);
- try {
- LibMatrixCUDA.kernels = new JCudaKernels();
- } catch (DMLRuntimeException e) {
- System.err.println("ERROR - Unable to initialize JCudaKernels. System in an inconsistent state");
- LibMatrixCUDA.kernels = null;
- }
- GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
- }
-
- @Override
- public void destroy() throws DMLRuntimeException {
- if(currContext != null) {
- synchronized(isGPUContextCreated) {
- cudnnDestroy(LibMatrixCUDA.cudnnHandle);
- cublasDestroy(LibMatrixCUDA.cublasHandle);
- cusparseDestroy(LibMatrixCUDA.cusparseHandle);
- currContext = null;
- isGPUContextCreated = false;
- }
- }
- else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) {
- throw new DMLRuntimeException("Error while destroying the GPUContext");
- }
- }
-
-}