Posted to commits@systemml.apache.org by na...@apache.org on 2017/06/10 19:07:10 UTC
[2/3] systemml git commit: [FIX] Fixed nested parfor for GPUs
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 8da67ea..b3c19ef 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,14 +18,24 @@
*/
package org.apache.sysml.runtime.instructions.gpu.context;
-import jcuda.Pointer;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
import java.util.ArrayList;
import java.util.Collections;
@@ -45,24 +55,14 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.LRUCacheMap;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.JCuda.cudaSetDevice;
-import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusolver.cusolverSpHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
/**
* Represents a context per GPU accessible through the same JVM
@@ -71,606 +71,643 @@ import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
public class GPUContext {
protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName());
+ /**
+ * currently employed eviction policy
+ */
+ public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+ /**
+ * The minimum CUDA Compute capability needed for SystemML.
+ * From compute capability 3.0 onward, 2^31 - 1 blocks and 1024 threads per block are supported.
+ * If SystemML needs to run on an older card, this logic can be revisited.
+ */
+ final int MAJOR_REQUIRED = 3;
+ final int MINOR_REQUIRED = 0;
+ /**
+ * active device assigned to this GPUContext instance
+ */
+ private final int deviceNum;
+ // Invoke cudaMemGetInfo to get available memory information. Useful if the GPU is shared among multiple applications.
+ public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig()
+ .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+ /**
+ * Map of free blocks allocated on the GPU: maps size_of_block -> list of pointers on the GPU
+ */
+ private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>();
+ /**
+ * To record size of allocated blocks
+ */
+ private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+ /**
+ * List of allocated {@link GPUObject} instances on the {@link GPUContext#deviceNum} GPU.
+ * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
+ * If a {@link GPUObject} has more than one lock on it, it cannot be freed.
+ * If it has zero locks on it, it can be freed, but it is preferable to keep it around
+ * so that an extraneous host-to-device transfer can be avoided.
+ */
+ private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+ /**
+ * cudnnHandle for Deep Neural Network operations on the GPU
+ */
+ private cudnnHandle cudnnHandle;
+ /**
+ * cublasHandle for BLAS operations on the GPU
+ */
+ private cublasHandle cublasHandle;
+ /**
+ * cusparseHandle for certain sparse BLAS operations on the GPU
+ */
+ private cusparseHandle cusparseHandle;
+ /**
+ * cusolverDnHandle for invoking solve() function on dense matrices on the GPU
+ */
+ private cusolverDnHandle cusolverDnHandle;
+ /**
+ * cusolverSpHandle for invoking solve() function on sparse matrices on the GPU
+ */
+ private cusolverSpHandle cusolverSpHandle;
+ /**
+ * to launch custom CUDA kernels, specific to the active GPU for this GPUContext
+ */
+ private JCudaKernels kernels;
+
+ protected GPUContext(int deviceNum) throws DMLRuntimeException {
+ this.deviceNum = deviceNum;
+ cudaSetDevice(deviceNum);
+
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+ long free[] = { 0 };
+ long total[] = { 0 };
+ cudaMemGetInfo(free, total);
+
+ long start = System.nanoTime();
+ cudnnHandle = new cudnnHandle();
+ cudnnCreate(cudnnHandle);
+ cublasHandle = new cublasHandle();
+ cublasCreate(cublasHandle);
+ // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
+ // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+ // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+ cusparseHandle = new cusparseHandle();
+ cusparseCreate(cusparseHandle);
+
+ cusolverDnHandle = new cusolverDnHandle();
+ cusolverDnCreate(cusolverDnHandle);
+ cusolverSpHandle = new cusolverSpHandle();
+ cusolverSpCreate(cusolverSpHandle);
+
+ kernels = new JCudaKernels(deviceNum);
+
+ GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+ LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
+ + this);
- /** Eviction policies for {@link GPUContext#evict(long)} */
- public enum EvictionPolicy {
- LRU, LFU, MIN_EVICT
}
- /** currently employed eviction policy */
- public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+ public static int cudaGetDevice() {
+ int[] device = new int[1];
+ JCuda.cudaGetDevice(device);
+ return device[0];
+ }
- /** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */
- private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>();
+ public int getDeviceNum() {
+ return deviceNum;
+ }
- /** To record size of allocated blocks */
- private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+ /**
+ * Sets the device for the calling thread.
+ * This method must be called after
+ * {@link org.apache.sysml.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}.
+ * In a multi-threaded environment like parfor, this method must be called from the
+ * appropriate thread.
+ */
+ public void initializeThread() {
+ cudaSetDevice(deviceNum);
+ }
+
+ /**
+ * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+ *
+ * @param size size of data (in bytes) to allocate
+ * @return jcuda pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(long size) throws DMLRuntimeException {
+ return allocate(null, size, 1);
+ }
+
+ /**
+ * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+ *
+ * @param instructionName name of instruction for which to record per instruction performance statistics, or null to skip recording
+ * @param size size of data (in bytes) to allocate
+ * @return jcuda pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
+ return allocate(instructionName, size, 1);
+ }
- /** active device assigned to this GPUContext instance */
- private final int deviceNum;
+ /**
+ * Allocates temporary space on the device.
+ * Does not update bookkeeping.
+ * The caller is responsible for freeing the allocated memory after usage.
+ *
+ * @param instructionName name of instruction for which to record per instruction performance statistics, or null to skip recording
+ * @param size Size of data (in bytes) to allocate
+ * @param statsCount amount to increment the cudaAllocCount by
+ * @return jcuda Pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
+ long t0 = 0, t1 = 0, end = 0;
+ Pointer A;
+ if (freeCUDASpaceMap.containsKey(size)) {
+ LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size
+ / 1024.0) + " Kbytes from previously allocated block on " + this);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ t0 = System.nanoTime();
+ LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+ A = freeList.pop();
+ if (freeList.isEmpty())
+ freeCUDASpaceMap.remove(size);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics
+ .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+ } else {
+ LOG.trace(
+ "GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size
+ / 1024.0) + " Kbytes on " + this);
+ if (DMLScript.STATISTICS)
+ t0 = System.nanoTime();
+ ensureFreeSpace(instructionName, size);
+ A = new Pointer();
+ cudaMalloc(A, size);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE,
+ System.nanoTime() - t0);
+ }
+ // Set all elements to 0 since newly allocated space will contain garbage
+ if (DMLScript.STATISTICS)
+ t1 = System.nanoTime();
+ LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0)
+ + " Kbytes to zero on " + this);
+ cudaMemset(A, 0, size);
+ if (DMLScript.STATISTICS)
+ end = System.nanoTime();
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+ cudaBlockSizeMap.put(A, size);
+ return A;
- /** list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
- * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
- * If a {@link GPUObject} has more than one lock on it, it cannot be freed
- * If it has zero locks on it, it can be freed, but it is preferrable to keep it around
- * so that an extraneous host to dev transfer can be avoided */
- private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+ }
- /** cudnnHandle for Deep Neural Network operations on the GPU */
- private cudnnHandle cudnnHandle;
+ /**
+ * Does lazy cudaFree calls
+ *
+ * @param toFree {@link Pointer} instance to be freed
+ */
+ public void cudaFreeHelper(final Pointer toFree) {
+ cudaFreeHelper(null, toFree, false);
+ }
- /** cublasHandle for BLAS operations on the GPU */
- private cublasHandle cublasHandle;
+ /**
+ * Does lazy or eager cudaFree calls
+ *
+ * @param toFree {@link Pointer} instance to be freed
+ * @param eager true if the free is to be done eagerly
+ */
+ public void cudaFreeHelper(final Pointer toFree, boolean eager) {
+ cudaFreeHelper(null, toFree, eager);
+ }
- /** cusparseHandle for certain sparse BLAS operations on the GPU */
- private cusparseHandle cusparseHandle;
+ /**
+ * Does lazy cudaFree calls
+ *
+ * @param instructionName name of the instruction for which to record per instruction free time, or null to skip recording
+ * @param toFree {@link Pointer} instance to be freed
+ */
+ public void cudaFreeHelper(String instructionName, final Pointer toFree) {
+ cudaFreeHelper(instructionName, toFree, false);
+ }
- /** cusolverDnHandle for invoking solve() function on dense matrices on the GPU */
- private cusolverDnHandle cusolverDnHandle;
+ /**
+ * Does cudaFree calls, lazily or eagerly depending on the eager flag
+ *
+ * @param instructionName name of the instruction for which to record per instruction free time, or null to skip recording
+ * @param toFree {@link Pointer} instance to be freed
+ * @param eager true if the free is to be done eagerly
+ */
+ public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
+ if (toFree == null) // trying to free a null pointer
+ return;
+ long t0 = 0;
+ assert cudaBlockSizeMap.containsKey(
+ toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it is trying to free up";
+ long size = cudaBlockSizeMap.get(toFree);
+ if (eager) {
+ LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on "
+ + this);
+ if (DMLScript.STATISTICS)
+ t0 = System.nanoTime();
+ cudaFree(toFree);
+ cudaBlockSizeMap.remove(toFree);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaDeAllocCount.addAndGet(1);
+ if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE,
+ System.nanoTime() - t0);
+ } else {
+ LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
+ LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+ if (freeList == null) {
+ freeList = new LinkedList<Pointer>();
+ freeCUDASpaceMap.put(size, freeList);
+ }
+ if (freeList.contains(toFree))
+ throw new RuntimeException("GPU : Internal state corrupted, double free");
+ freeList.add(toFree);
+ }
+ }
- /** cusolverSpHandle for invoking solve() function on sparse matrices on the GPU */
- private cusolverSpHandle cusolverSpHandle;
+ /**
+ * Thin wrapper over {@link GPUContext#evict(long)}
+ *
+ * @param size size to check
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ void ensureFreeSpace(long size) throws DMLRuntimeException {
+ ensureFreeSpace(null, size);
+ }
- /** to launch custom CUDA kernel, specific to the active GPU for this GPUContext */
- private JCudaKernels kernels;
+ /**
+ * Thin wrapper over {@link GPUContext#evict(long)}
+ *
+ * @param instructionName name of the instruction for which performance measurements are made
+ * @param size size to check
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
+ if (size >= getAvailableMemory()) {
+ evict(instructionName, size);
+ }
+ }
/**
- * The minimum CUDA Compute capability needed for SystemML.
- * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
- * If SystemML needs to run on an older card, this logic can be revisited.
+ * Convenience wrapper over {@link GPUContext#evict(String, long)}
+ *
+ * @param GPUSize Desired size to be freed up on the GPU
+ * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
*/
- final int MAJOR_REQUIRED = 3;
- final int MINOR_REQUIRED = 0;
+ protected void evict(final long GPUSize) throws DMLRuntimeException {
+ evict(null, GPUSize);
+ }
+
+ /**
+ * Tries to free up memory on the GPU until either a chunk of the needed size is available
+ * or the attempt fails.
+ * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix
+ * blocks with zero locks on them is freed up.
+ * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
+ * the number of (read) locks obtained on each (in reverse order). It repeatedly frees up
+ * blocks with zero locks on them until the required size has been freed up.
+ * // TODO: update it with hybrid policy
+ *
+ * @param instructionName name of the instruction for which performance measurements are made
+ * @param neededSize desired size to be freed up on the GPU
+ * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
+ */
+ protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
+ LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
+ GPUStatistics.cudaEvictionCount.addAndGet(1);
+ // Release the set of free blocks maintained in GPUContext.freeCUDASpaceMap
+ // to free up space
+ LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
+ while (lruCacheMap.size() > 0) {
+ if (neededSize <= getAvailableMemory())
+ break;
+ Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
+ LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
+ Long size = toFreeListPair.getKey();
+ Pointer toFree = toFreeList.pop();
+ if (toFreeList.isEmpty())
+ lruCacheMap.remove(size);
+ cudaFreeHelper(instructionName, toFree, true);
+ }
+
+ if (neededSize <= getAvailableMemory())
+ return;
+
+ if (allocatedGPUObjects.size() == 0) {
+ throw new DMLRuntimeException(
+ "There is not enough memory on device for this matrix, request (" + neededSize + ")");
+ }
+
+ Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
+ @Override
+ public int compare(GPUObject p1, GPUObject p2) {
+ long p1Val = p1.locks.get();
+ long p2Val = p2.locks.get();
+
+ if (p1Val > 0 && p2Val > 0) {
+ // Both are locked, so don't sort
+ return 0;
+ } else if (p1Val > 0 || p2Val > 0) {
+ // Put the unlocked one to RHS
+ return Long.compare(p2Val, p1Val);
+ } else {
+ // Both are unlocked
+
+ if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
+ long p1Size = 0;
+ long p2Size = 0;
+ try {
+ p1Size = p1.getSizeOnDevice() - neededSize;
+ p2Size = p2.getSizeOnDevice() - neededSize;
+ } catch (DMLRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+
+ if (p1Size >= 0 && p2Size >= 0) {
+ return Long.compare(p2Size, p1Size);
+ } else {
+ return Long.compare(p1Size, p2Size);
+ }
+ } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
+ return Long.compare(p2.timestamp.get(), p1.timestamp.get());
+ } else {
+ throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
+ }
+ }
+ }
+ });
+
+ while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
+ GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
+ if (toBeRemoved.locks.get() > 0) {
+ throw new DMLRuntimeException(
+ "There is not enough memory on device for this matrix, request (" + neededSize + ")");
+ }
+ if (toBeRemoved.dirty) {
+ toBeRemoved.copyFromDeviceToHost();
+ }
+
+ toBeRemoved.clearData(true);
+ }
+ }
+
+ /**
+ * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block
+ *
+ * @param o the block
+ * @return true if present, false otherwise
+ */
+ public boolean isBlockRecorded(GPUObject o) {
+ return allocatedGPUObjects.contains(o);
+ }
+
+ /**
+ * Records the usage of a matrix block.
+ *
+ * @param o {@link GPUObject} instance to record
+ * @see GPUContext#allocatedGPUObjects
+ */
+ public void recordBlockUsage(GPUObject o) {
+ allocatedGPUObjects.add(o);
+ }
- // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
- public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-
- protected GPUContext(int deviceNum) throws DMLRuntimeException {
- this.deviceNum = deviceNum;
- cudaSetDevice(deviceNum);
-
- cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-
- long free[] = {0};
- long total[] = {0};
- cudaMemGetInfo(free, total);
-
- long start = System.nanoTime();
- cudnnHandle = new cudnnHandle();
- cudnnCreate(cudnnHandle);
- cublasHandle = new cublasHandle();
- cublasCreate(cublasHandle);
- // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
- // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
- // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
- cusparseHandle = new cusparseHandle();
- cusparseCreate(cusparseHandle);
-
- cusolverDnHandle = new cusolverDnHandle();
- cusolverDnCreate(cusolverDnHandle);
- cusolverSpHandle = new cusolverSpHandle();
- cusolverSpCreate(cusolverSpHandle);
-
- kernels = new JCudaKernels(deviceNum);
-
- GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
- LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this);
-
- }
-
- public int getDeviceNum() {
- return deviceNum;
- }
-
- /**
- * Sets the device for the calling thread.
- * This method must be called after {@link GPUContextPool#getFromPool()}
- * is called.
- * If in a multi-threaded env like parfor, this method must be called when in the
- * appropriate thread
- */
- public void initializeThread() {
- cudaSetDevice(deviceNum);
- }
-
- public static int cudaGetDevice() {
- int[] device = new int[1];
- JCuda.cudaGetDevice(device);
- return device[0];
- }
-
- /**
- * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
- *
- * @param size size of data (in bytes) to allocate
- * @return jcuda pointer
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public Pointer allocate(long size) throws DMLRuntimeException {
- return allocate(null, size, 1);
- }
-
- /**
- * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
- *
- * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
- * @param size size of data (in bytes) to allocate
- * @return jcuda pointer
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
- return allocate(instructionName, size, 1);
- }
-
- /**
- * Allocates temporary space on the device.
- * Does not update bookkeeping.
- * The caller is responsible for freeing up after usage.
- *
- * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
- * @param size Size of data (in bytes) to allocate
- * @param statsCount amount to increment the cudaAllocCount by
- * @return jcuda Pointer
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
- long t0 = 0, t1 = 0, end = 0;
- Pointer A;
- if (freeCUDASpaceMap.containsKey(size)) {
- LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this);
- if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
- A = freeList.pop();
- if (freeList.isEmpty())
- freeCUDASpaceMap.remove(size);
- if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
- GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
- } else {
- LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- ensureFreeSpace(instructionName, size);
- A = new Pointer();
- cudaMalloc(A, size);
- if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
- if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
- GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
- }
- // Set all elements to 0 since newly allocated space will contain garbage
- if (DMLScript.STATISTICS) t1 = System.nanoTime();
- LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
- cudaMemset(A, 0, size);
- if (DMLScript.STATISTICS) end = System.nanoTime();
- if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
- GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
- if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
- if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
- cudaBlockSizeMap.put(A, size);
- return A;
-
- }
-
- /**
- * Does lazy cudaFree calls
- *
- * @param toFree {@link Pointer} instance to be freed
- */
- public void cudaFreeHelper(final Pointer toFree) {
- cudaFreeHelper(null, toFree, false);
- }
-
- /**
- * does lazy/eager cudaFree calls
- *
- * @param toFree {@link Pointer} instance to be freed
- * @param eager true if to be done eagerly
- */
- public void cudaFreeHelper(final Pointer toFree, boolean eager) {
- cudaFreeHelper(null, toFree, eager);
- }
-
- /**
- * Does lazy cudaFree calls
- *
- * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
- * @param toFree {@link Pointer} instance to be freed
- */
- public void cudaFreeHelper(String instructionName, final Pointer toFree) {
- cudaFreeHelper(instructionName, toFree, false);
- }
-
- /**
- * Does cudaFree calls, lazily
- *
- * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
- * @param toFree {@link Pointer} instance to be freed
- * @param eager true if to be done eagerly
- */
- public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
- Pointer dummy = new Pointer();
- if (toFree == dummy) // trying to free a null pointer
- return;
- long t0 = 0;
- assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
- long size = cudaBlockSizeMap.get(toFree);
- if (eager) {
- LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this);
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- cudaFree(toFree);
- cudaBlockSizeMap.remove(toFree);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
- if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
- GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
- } else {
- LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
- LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
- if (freeList == null) {
- freeList = new LinkedList<Pointer>();
- freeCUDASpaceMap.put(size, freeList);
- }
- if (freeList.contains(toFree))
- throw new RuntimeException("GPU : Internal state corrupted, double free");
- freeList.add(toFree);
- }
- }
-
- /**
- * Thin wrapper over {@link GPUContext#evict(long)}
- *
- * @param size size to check
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- void ensureFreeSpace(long size) throws DMLRuntimeException {
- ensureFreeSpace(null, size);
- }
-
- /**
- * Thin wrapper over {@link GPUContext#evict(long)}
- *
- * @param instructionName instructionName name of the instruction for which performance measurements are made
- * @param size size to check
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException {
- if (size >= getAvailableMemory()) {
- evict(instructionName, size);
- }
- }
-
- /**
- * Convenience wrapper over {@link GPUContext#evict(String, long)}
- *
- * @param GPUSize Desired size to be freed up on the GPU
- * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.
- */
- protected void evict(final long GPUSize) throws DMLRuntimeException {
- evict(null, GPUSize);
- }
-
- /**
- * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up
- * or it fails.
- * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix
- * blocks with zero locks on them is freed up.
- * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
- * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up
- * blocks on which there are zero locks until the required size has been freed up.
- * // TODO: update it with hybrid policy
- *
- * @param instructionName name of the instruction for which performance measurements are made
- * @param neededSize desired size to be freed up on the GPU
- * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
- */
- protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
- LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
- GPUStatistics.cudaEvictionCount.addAndGet(1);
- // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
- // to free up space
- LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
- while (lruCacheMap.size() > 0) {
- if (neededSize <= getAvailableMemory())
- break;
- Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
- LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
- Long size = toFreeListPair.getKey();
- Pointer toFree = toFreeList.pop();
- if (toFreeList.isEmpty())
- lruCacheMap.remove(size);
- cudaFreeHelper(instructionName, toFree, true);
- }
-
- if (neededSize <= getAvailableMemory())
- return;
-
- if (allocatedGPUObjects.size() == 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
- }
-
- Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
- @Override
- public int compare(GPUObject p1, GPUObject p2) {
- long p1Val = p1.locks.get();
- long p2Val = p2.locks.get();
-
- if (p1Val > 0 && p2Val > 0) {
- // Both are locked, so don't sort
- return 0;
- } else if (p1Val > 0 || p2Val > 0) {
- // Put the unlocked one to RHS
- return Long.compare(p2Val, p1Val);
- } else {
- // Both are unlocked
-
- if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
- long p1Size = 0;
- long p2Size = 0;
- try {
- p1Size = p1.getSizeOnDevice() - neededSize;
- p2Size = p2.getSizeOnDevice() - neededSize;
- } catch (DMLRuntimeException e) {
- throw new RuntimeException(e);
- }
-
- if (p1Size >= 0 && p2Size >= 0) {
- return Long.compare(p2Size, p1Size);
- } else {
- return Long.compare(p1Size, p2Size);
- }
- } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
- return Long.compare(p2.timestamp.get(), p1.timestamp.get());
- } else {
- throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
- }
- }
- }
- });
-
- while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
- GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
- if (toBeRemoved.locks.get() > 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
- }
- if (toBeRemoved.dirty) {
- toBeRemoved.copyFromDeviceToHost();
- }
-
- toBeRemoved.clearData(true);
- }
- }
-
- /**
- * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block
- *
- * @param o the block
- * @return true if present, false otherwise
- */
- public boolean isBlockRecorded(GPUObject o) {
- return allocatedGPUObjects.contains(o);
- }
-
- /**
- * @param o {@link GPUObject} instance to record
- * @see GPUContext#allocatedGPUObjects
- * Records the usage of a matrix block
- */
- public void recordBlockUsage(GPUObject o) {
- allocatedGPUObjects.add(o);
- }
-
- /**
- * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects
- * @see GPUContext#allocatedGPUObjects
- * Records that a block is not used anymore
- */
- public void removeRecordedUsage(GPUObject o) {
- allocatedGPUObjects.remove(o);
- }
-
- /**
- * Gets the available memory on GPU that SystemML can use
- *
- * @return the available memory in bytes
- */
- public long getAvailableMemory() {
- long free[] = {0};
- long total[] = {0};
- cudaMemGetInfo(free, total);
- return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
- }
-
- /**
- * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed
- *
- * @throws DMLRuntimeException if the compute capability is less than what is required
- */
- public void ensureComputeCapability() throws DMLRuntimeException {
- int[] devices = {-1};
- cudaGetDeviceCount(devices);
- if (devices[0] == -1) {
- throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
- }
- boolean isComputeCapable = true;
- for (int i = 0; i < devices[0]; i++) {
- cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
- int major = properties.major;
- int minor = properties.minor;
- if (major < MAJOR_REQUIRED) {
- isComputeCapable = false;
- } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
- isComputeCapable = false;
- }
- }
- if (!isComputeCapable) {
- throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
- }
- }
-
- public GPUObject createGPUObject(MatrixObject mo) {
- return new GPUObject(this, mo);
- }
-
- /**
- * Gets the device properties for the active GPU (set with cudaSetDevice())
- *
- * @return the device properties
- * @throws DMLRuntimeException ?
- */
- public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
- return GPUContextPool.getGPUProperties(deviceNum);
- }
-
- /**
- * Gets the maximum number of threads per block for "active" GPU
- *
- * @return the maximum number of threads per block
- * @throws DMLRuntimeException ?
- */
- public int getMaxThreadsPerBlock() throws DMLRuntimeException {
- cudaDeviceProp deviceProps = getGPUProperties();
- return deviceProps.maxThreadsPerBlock;
- }
-
- /**
- * Gets the maximum number of blocks supported by the active cuda device
- *
- * @return the maximum number of blocks supported
- * @throws DMLRuntimeException ?
- */
- public int getMaxBlocks() throws DMLRuntimeException {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.maxGridSize[0];
- }
-
- /**
- * Gets the shared memory per block supported by the active cuda device
- *
- * @return the shared memory per block
- * @throws DMLRuntimeException ?
- */
- public long getMaxSharedMemory() throws DMLRuntimeException {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.sharedMemPerBlock;
- }
-
- /**
- * Gets the warp size supported by the active cuda device
- *
- * @return the warp size
- * @throws DMLRuntimeException ?
- */
- public int getWarpSize() throws DMLRuntimeException {
- cudaDeviceProp deviceProp = getGPUProperties();
- return deviceProp.warpSize;
- }
-
- public cudnnHandle getCudnnHandle() {
- return cudnnHandle;
- }
-
- public cublasHandle getCublasHandle() {
- return cublasHandle;
- }
-
- public cusparseHandle getCusparseHandle() {
- return cusparseHandle;
- }
-
- public cusolverDnHandle getCusolverDnHandle() {
- return cusolverDnHandle;
- }
-
- public cusolverSpHandle getCusolverSpHandle() {
- return cusolverSpHandle;
- }
-
- public JCudaKernels getKernels() {
- return kernels;
- }
-
- /**
- * Destroys this GPUContext object
- *
- * @throws DMLRuntimeException if error
- */
- public void destroy() throws DMLRuntimeException {
- LOG.trace("GPU : this context was destroyed, this = " + this.toString());
- clearMemory();
- cudnnDestroy(cudnnHandle);
- cublasDestroy(cublasHandle);
- cusparseDestroy(cusparseHandle);
- cusolverDnDestroy(cusolverDnHandle);
- cusolverSpDestroy(cusolverSpHandle);
- cudnnHandle = null;
- cublasHandle = null;
- cusparseHandle = null;
-
- }
-
- /**
- * Clears all memory used by this {@link GPUContext}
- * Be careful to ensure that no memory is currently being used in the temporary memory before invoking this
- * If memory is being used between MLContext invocations, they are pointed to by a {@link GPUObject} instance
- * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will
- * cause the memory associated with that block on the GPU to be freed up.
- * @throws DMLRuntimeException ?
- */
- public void clearMemory() throws DMLRuntimeException {
- clearTemporaryMemory();
- while (!allocatedGPUObjects.isEmpty()) {
- GPUObject o = allocatedGPUObjects.get(0);
- if (o.isDirty()){
- LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is still on GPU memory, copying it back to host.");
- o.acquireHostRead();
- }
- o.clearData(true);
- }
- allocatedGPUObjects.clear();
- }
-
- /**
- * Clears up the memory used to optimize cudaMalloc/cudaFree calls
- */
- public void clearTemporaryMemory() {
- // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
- HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
- for (GPUObject o : allocatedGPUObjects) {
- if (o.isSparse()) {
- CSRPointer p = o.getSparseMatrixCudaPointer();
- if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
- tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
- }
- if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
- tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
- }
- if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
- tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
- }
-
- } else {
- Pointer p = o.getJcudaDenseMatrixPtr();
- tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
- }
- }
-
- // garbage collect all temporarily allocated spaces
- for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
- for (Pointer p : l) {
- cudaFreeHelper(p, true);
- }
- }
- cudaBlockSizeMap.clear();
- freeCUDASpaceMap.clear();
-
- // Restore only those entries for which there are still blocks on the GPU
- cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
- }
-
- @Override
- public String toString() {
- return "GPUContext{" +
- "deviceNum=" + deviceNum +
- '}';
- }
+ /**
+ * Records that a block is not used anymore.
+ *
+ * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects
+ * @see GPUContext#allocatedGPUObjects
+ */
+ public void removeRecordedUsage(GPUObject o) {
+ allocatedGPUObjects.remove(o);
+ }
+
+ /**
+ * Gets the available memory on GPU that SystemML can use
+ *
+ * @return the available memory in bytes
+ */
+ public long getAvailableMemory() {
+ long free[] = { 0 };
+ long total[] = { 0 };
+ cudaMemGetInfo(free, total);
+ return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+ }
+
+ /**
+ * Makes sure that the GPU that SystemML is trying to use has the minimum required compute capability
+ *
+ * @throws DMLRuntimeException if the compute capability is less than what is required
+ */
+ public void ensureComputeCapability() throws DMLRuntimeException {
+ int[] devices = { -1 };
+ cudaGetDeviceCount(devices);
+ if (devices[0] == -1) {
+ throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices");
+ }
+ boolean isComputeCapable = true;
+ for (int i = 0; i < devices[0]; i++) {
+ cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
+ int major = properties.major;
+ int minor = properties.minor;
+ if (major < MAJOR_REQUIRED) {
+ isComputeCapable = false;
+ } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
+ isComputeCapable = false;
+ }
+ }
+ if (!isComputeCapable) {
+ throw new DMLRuntimeException(
+ "One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "."
+ + MINOR_REQUIRED);
+ }
+ }
+
+ public GPUObject createGPUObject(MatrixObject mo) {
+ return new GPUObject(this, mo);
+ }
+
+ /**
+ * Gets the device properties for the active GPU (set with cudaSetDevice())
+ *
+ * @return the device properties
+ * @throws DMLRuntimeException ?
+ */
+ public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
+ return GPUContextPool.getGPUProperties(deviceNum);
+ }
+
+ /**
+ * Gets the maximum number of threads per block for the active GPU
+ *
+ * @return the maximum number of threads per block
+ * @throws DMLRuntimeException ?
+ */
+ public int getMaxThreadsPerBlock() throws DMLRuntimeException {
+ cudaDeviceProp deviceProps = getGPUProperties();
+ return deviceProps.maxThreadsPerBlock;
+ }
+
+ /**
+ * Gets the maximum number of blocks supported by the active cuda device
+ *
+ * @return the maximum number of blocks supported
+ * @throws DMLRuntimeException ?
+ */
+ public int getMaxBlocks() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.maxGridSize[0];
+ }
+
+ /**
+ * Gets the shared memory per block supported by the active cuda device
+ *
+ * @return the shared memory per block
+ * @throws DMLRuntimeException ?
+ */
+ public long getMaxSharedMemory() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.sharedMemPerBlock;
+ }
+
+ /**
+ * Gets the warp size supported by the active cuda device
+ *
+ * @return the warp size
+ * @throws DMLRuntimeException ?
+ */
+ public int getWarpSize() throws DMLRuntimeException {
+ cudaDeviceProp deviceProp = getGPUProperties();
+ return deviceProp.warpSize;
+ }
+
+ public cudnnHandle getCudnnHandle() {
+ return cudnnHandle;
+ }
+
+ public cublasHandle getCublasHandle() {
+ return cublasHandle;
+ }
+
+ public cusparseHandle getCusparseHandle() {
+ return cusparseHandle;
+ }
+
+ public cusolverDnHandle getCusolverDnHandle() {
+ return cusolverDnHandle;
+ }
+
+ public cusolverSpHandle getCusolverSpHandle() {
+ return cusolverSpHandle;
+ }
+
+ public JCudaKernels getKernels() {
+ return kernels;
+ }
+
+ /**
+ * Destroys this GPUContext object
+ *
+ * @throws DMLRuntimeException if error
+ */
+ public void destroy() throws DMLRuntimeException {
+ LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+ clearMemory();
+ cudnnDestroy(cudnnHandle);
+ cublasDestroy(cublasHandle);
+ cusparseDestroy(cusparseHandle);
+ cusolverDnDestroy(cusolverDnHandle);
+ cusolverSpDestroy(cusolverSpHandle);
+ cudnnHandle = null;
+ cublasHandle = null;
+ cusparseHandle = null;
+ cusolverDnHandle = null;
+ cusolverSpHandle = null;
+ }
+
+ /**
+ * Clears all memory used by this {@link GPUContext}.
+ * Be careful to ensure that no temporary memory is still in use before invoking this.
+ * If memory is in use between MLContext invocations, it is pointed to by a {@link GPUObject} instance
+ * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will
+ * cause the memory associated with that block on the GPU to be freed up.
+ *
+ * @throws DMLRuntimeException ?
+ */
+ public void clearMemory() throws DMLRuntimeException {
+ clearTemporaryMemory();
+ while (!allocatedGPUObjects.isEmpty()) {
+ GPUObject o = allocatedGPUObjects.get(0);
+ if (o.isDirty()) {
+ LOG.warn("Attempted to free GPU Memory when a block[" + o
+ + "] is still on GPU memory, copying it back to host.");
+ o.acquireHostRead();
+ }
+ o.clearData(true);
+ }
+ allocatedGPUObjects.clear();
+ }
+
+ /**
+ * Clears up the memory used to optimize cudaMalloc/cudaFree calls
+ */
+ public void clearTemporaryMemory() {
+ // Record the cuda block sizes still needed by allocatedGPUObjects; all others are cleared up.
+ HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
+ for (GPUObject o : allocatedGPUObjects) {
+ if (o.isSparse()) {
+ CSRPointer p = o.getSparseMatrixCudaPointer();
+ if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
+ tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
+ }
+ if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
+ tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
+ }
+ if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
+ tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
+ }
+
+ } else {
+ Pointer p = o.getJcudaDenseMatrixPtr();
+ tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
+ }
+ }
+
+ // garbage collect all temporarily allocated spaces
+ for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+ for (Pointer p : l) {
+ cudaFreeHelper(p, true);
+ }
+ }
+ cudaBlockSizeMap.clear();
+ freeCUDASpaceMap.clear();
+
+ // Restore only those entries for which there are still blocks on the GPU
+ cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
+ }
+
+ @Override
+ public String toString() {
+ return "GPUContext{" + "deviceNum=" + deviceNum + '}';
+ }
+
+ /**
+ * Eviction policies for {@link GPUContext#evict(long)}
+ */
+ public enum EvictionPolicy {
+ LRU, LFU, MIN_EVICT
+ }
}
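
To make the allocate/free lifecycle above concrete, here is a minimal usage sketch (not part of the commit; the instruction name "exampleInst" and the 1 MB size are illustrative, and the pool calls use the reserveAllGPUContexts()/freeAllGPUContexts() API introduced in GPUContextPool below):

import java.util.List;

import jcuda.Pointer;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;

public class GPUContextUsageSketch {
	public static void main(String[] args) throws DMLRuntimeException {
		// Reserve all contexts (one per device) and pick the first one.
		List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
		GPUContext gCtx = gCtxs.get(0);
		gCtx.initializeThread(); // cudaSetDevice for the calling thread

		// Allocate a zeroed 1 MB scratch buffer; "exampleInst" is an
		// illustrative instruction name used only for statistics.
		Pointer scratch = gCtx.allocate("exampleInst", 1024 * 1024);

		// ... launch kernels via gCtx.getKernels(), BLAS via gCtx.getCublasHandle() ...

		// Lazy free: the block is cached in freeCUDASpaceMap for reuse.
		gCtx.cudaFreeHelper("exampleInst", scratch, false);

		GPUContextPool.freeAllGPUContexts();
	}
}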
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
index 1d0b5c8..ac1c059 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -23,7 +23,7 @@ import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
import java.util.LinkedList;
-import java.util.Queue;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -39,122 +39,152 @@ import jcuda.runtime.cudaDeviceProp;
public class GPUContextPool {
- protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName());
-
- /** Maximum number of gpus to use, -1 for all */
- public static int PER_PROCESS_MAX_GPUS = -1;
-
- /** Whether cuda has been initialized */
- static boolean initialized = false;
-
- /** The total number of cuda devices on this machine */
- static int deviceCount = -1;
-
- /** Stores the cached deviceProperties */
- static cudaDeviceProp[] deviceProperties;
-
- /** Set of free GPUContexts */
- static Queue<GPUContext> freePool = new LinkedList<>();
-
- /**
- * Static initialization of the number of devices
- * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error
- * Initializes the CUDA driver
- * All these need be done once, and not per GPU
- * @throws DMLRuntimeException ?
- */
- public synchronized static void initializeGPU() throws DMLRuntimeException {
- GPUContext.LOG.info("Initializing CUDA");
- long start = System.nanoTime();
- JCuda.setExceptionsEnabled(true);
- JCudnn.setExceptionsEnabled(true);
- JCublas2.setExceptionsEnabled(true);
- JCusparse.setExceptionsEnabled(true);
- JCudaDriver.setExceptionsEnabled(true);
- cuInit(0); // Initialize the driver
-
- int deviceCountArray[] = {0};
- cuDeviceGetCount(deviceCountArray); // Obtain the number of devices
- deviceCount = deviceCountArray[0];
- deviceProperties = new cudaDeviceProp[deviceCount];
-
- if (PER_PROCESS_MAX_GPUS > 0)
- deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
-
- // Initialize the list of devices
- for (int i = 0; i < deviceCount; i++) {
- cudaDeviceProp properties = new cudaDeviceProp();
- cudaGetDeviceProperties(properties, i);
- deviceProperties[i] = properties;
- }
-
- // Initialize the pool of GPUContexts
- for (int i=0; i<deviceCount; i++){
- GPUContext gCtx = new GPUContext(i);
- freePool.add(gCtx);
- }
-
- GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
- //int[] device = {-1};
- //cudaGetDevice(device);
- //cudaDeviceProp prop = getGPUProperties(device[0]);
- //int maxBlocks = prop.maxGridSize[0];
- //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
- //long sharedMemPerBlock = prop.sharedMemPerBlock;
- //LOG.debug("Active CUDA device number : " + device[0]);
- //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
- initialized = true;
- GPUStatistics.cudaInitTime = System.nanoTime() - start;
- }
-
- /**
- * Gets an initialized GPUContext from a pool of GPUContexts, each linked to a GPU
- * @return null if not more GPUContexts in pool, a valid GPUContext otherwise
- * @throws DMLRuntimeException ?
- */
- public static synchronized GPUContext getFromPool() throws DMLRuntimeException {
- if (!initialized) initializeGPU();
- GPUContext gCtx = freePool.poll();
- LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - FreePool[" + freePool.size() + "]");
- return gCtx;
- }
-
- /**
- * Get the number of free GPUContexts
- * @return number of free GPUContexts
- */
- public static synchronized int getAvailableCount() {
- return freePool.size();
- }
-
- /**
- * Gets the device properties
- * @param device the device number (on a machine with more than 1 GPU)
- * @return the device properties
- * @throws DMLRuntimeException if there is problem initializing the GPUContexts
- */
- static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException {
- // do once - initialization of GPU
- if (!initialized) initializeGPU();
- return deviceProperties[device];
- }
-
- public static int getDeviceCount() throws DMLRuntimeException {
- if (!initialized) initializeGPU();
- return deviceCount;
- }
-
- /**
- * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s
- * @param gCtx the GPUContext instance to return. If null, nothing happens
- * @throws DMLRuntimeException if error
- */
- public static synchronized void returnToPool(GPUContext gCtx) throws DMLRuntimeException {
- if (gCtx == null)
- return;
- freePool.add(gCtx);
- LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes - FreePool[" + freePool.size() + "]");
-
- }
+ protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName());
+
+ /**
+ * Maximum number of GPUs to use, -1 for all
+ */
+ public static int PER_PROCESS_MAX_GPUS = -1;
+
+ /**
+ * Whether CUDA has been initialized
+ */
+ static boolean initialized = false;
+
+ /**
+ * The total number of CUDA devices on this machine
+ */
+ static int deviceCount = -1;
+
+ /**
+ * Stores the cached deviceProperties
+ */
+ static cudaDeviceProp[] deviceProperties;
+
+ /**
+ * List of GPUContexts, one per GPU on the machine
+ */
+ static List<GPUContext> pool = new LinkedList<>();
+
+ /**
+ * Whether the pool of GPUs is reserved or not
+ */
+ static boolean reserved = false;
+
+ /**
+ * Static initialization of the number of devices.
+ * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error.
+ * Initializes the CUDA driver.
+ * All of this needs to be done once, and not per GPU.
+ *
+ * @throws DMLRuntimeException ?
+ */
+ public synchronized static void initializeGPU() throws DMLRuntimeException {
+ GPUContext.LOG.info("Initializing CUDA");
+ long start = System.nanoTime();
+ JCuda.setExceptionsEnabled(true);
+ JCudnn.setExceptionsEnabled(true);
+ JCublas2.setExceptionsEnabled(true);
+ JCusparse.setExceptionsEnabled(true);
+ JCudaDriver.setExceptionsEnabled(true);
+ cuInit(0); // Initialize the driver
+
+ int deviceCountArray[] = { 0 };
+ cuDeviceGetCount(deviceCountArray); // Obtain the number of devices
+ deviceCount = deviceCountArray[0];
+ deviceProperties = new cudaDeviceProp[deviceCount];
+
+ if (PER_PROCESS_MAX_GPUS > 0)
+ deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
+
+ // Initialize the list of devices
+ for (int i = 0; i < deviceCount; i++) {
+ cudaDeviceProp properties = new cudaDeviceProp();
+ cudaGetDeviceProperties(properties, i);
+ deviceProperties[i] = properties;
+ }
+
+ // Initialize the pool of GPUContexts
+ for (int i = 0; i < deviceCount; i++) {
+ GPUContext gCtx = new GPUContext(i);
+ pool.add(gCtx);
+ }
+
+ GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
+ //int[] device = {-1};
+ //cudaGetDevice(device);
+ //cudaDeviceProp prop = getGPUProperties(device[0]);
+ //int maxBlocks = prop.maxGridSize[0];
+ //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+ //long sharedMemPerBlock = prop.sharedMemPerBlock;
+ //LOG.debug("Active CUDA device number : " + device[0]);
+ //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+ initialized = true;
+ GPUStatistics.cudaInitTime = System.nanoTime() - start;
+ }
+
+ /**
+ * Reserves and gets an initialized list of GPUContexts
+ *
+ * @return the list of initialized GPUContexts, one per available device
+ * @throws DMLRuntimeException if the GPUContexts have already been reserved, or if initialization fails
+ */
+ public static synchronized List<GPUContext> reserveAllGPUContexts() throws DMLRuntimeException {
+ if (reserved)
+ throw new DMLRuntimeException("Trying to re-reserve GPUs");
+ if (!initialized)
+ initializeGPU();
+ reserved = true;
+ LOG.trace("GPU : Reserved all GPUs");
+ return pool;
+ }
+
+ /**
+ * Gets the number of GPUContexts in the pool
+ *
+ * @return number of GPUContexts in the pool
+ */
+ public static synchronized int getAvailableCount() {
+ return pool.size();
+ }
+
+ /**
+ * Gets the device properties
+ *
+ * @param device the device number (on a machine with more than 1 GPU)
+ * @return the device properties
+ * @throws DMLRuntimeException if there is a problem initializing the GPUContexts
+ */
+ static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException {
+ // do once - initialization of GPU
+ if (!initialized)
+ initializeGPU();
+ return deviceProperties[device];
+ }
+
+ /**
+ * Number of available devices on this machine
+ *
+ * @return number of available GPUs on this machine
+ * @throws DMLRuntimeException if error
+ */
+ public static int getDeviceCount() throws DMLRuntimeException {
+ if (!initialized)
+ initializeGPU();
+ return deviceCount;
+ }
+
+ /**
+ * Unreserves all GPUContexts
+ *
+ * @throws DMLRuntimeException if no GPUContexts are currently reserved
+ */
+ public static synchronized void freeAllGPUContexts() throws DMLRuntimeException {
+ if (!reserved)
+ throw new DMLRuntimeException("Trying to free unreserved GPUs");
+ reserved = false;
+ LOG.trace("GPU : Unreserved all GPUs");
+ }
}
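
To make the new reservation protocol concrete, a sketch of a parfor-style driver follows (hypothetical code, not part of the commit; the worker body is elided). The parent reserves all GPUContexts once, each worker thread binds its assigned context via initializeThread(), and the parent frees the reservation after all workers join:

import java.util.List;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;

public class ParforGPUSketch {
	public static void main(String[] args) throws DMLRuntimeException, InterruptedException {
		// The parent reserves every context once, instead of workers
		// polling a free queue as with the old getFromPool()/returnToPool().
		List<GPUContext> ctxs = GPUContextPool.reserveAllGPUContexts();
		Thread[] workers = new Thread[ctxs.size()];
		for (int i = 0; i < ctxs.size(); i++) {
			final GPUContext ctx = ctxs.get(i);
			workers[i] = new Thread(() -> {
				ctx.initializeThread(); // bind this worker thread to its device
				// ... execute GPU instructions against ctx ...
			});
			workers[i].start();
		}
		for (Thread w : workers)
			w.join();
		// Release the reservation so a subsequent parfor can re-reserve.
		GPUContextPool.freeAllGPUContexts();
	}
}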