You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/09/20 22:00:21 UTC
systemml git commit: [SYSTEMML-445] Added memory stats for GPU
allocation/eviction
Repository: systemml
Updated Branches:
refs/heads/master 69624850e -> f46279a17
[SYSTEMML-445] Added memory stats for GPU allocation/eviction
- Also, reverted the shadow buffer to the original implementation as we are getting OOM for lstm scripts. This likely has to do with pessimistic GC.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f46279a1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f46279a1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f46279a1
Branch: refs/heads/master
Commit: f46279a17031d3f8827923f6eddd614c3eac77d3
Parents: 6962485
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 20 14:56:51 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 20 14:56:51 2018 -0700
----------------------------------------------------------------------
conf/SystemML-config.xml.template | 8 +-
.../gpu/context/GPUMemoryManager.java | 61 ++++----
.../instructions/gpu/context/GPUObject.java | 18 +--
.../instructions/gpu/context/ShadowBuffer.java | 154 +++++--------------
.../org/apache/sysml/utils/GPUStatistics.java | 29 ++++
.../apache/sysml/utils/PersistentLRUCache.java | 8 +-
6 files changed, 108 insertions(+), 170 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 3925c4e..7b535c9 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -105,11 +105,9 @@
<!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
<sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
- <!-- Advanced optimization: maximum fraction of driver memory to use for GPU shadow buffer.
- Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is backed by org.apache.sysml.utils.PersistentLRUCache.
- Setting this to zero disables shadow buffering. If you intend to train network larger than GPU memory size,
- consider using large driver memory and setting this to a value greater than 0. -->
- <sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize>
+ <!-- Advanced optimization: fraction of driver memory to use for GPU shadow buffer. This optimization is ignored for double precision.
+ By default, it is disabled (hence set to 0.0). If you intend to train network larger than GPU memory size, consider using single precision and setting this to 0.1. -->
+ <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize>
<!-- Fraction of available GPU memory to use. This is similar to TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 0.9) -->
<sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor>
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 033051a..57b76f6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -191,7 +191,7 @@ public class GPUMemoryManager {
GPUStatistics.cudaAllocCount.increment();
}
if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) {
- LOG.info("Success: " + printDebugMessage + ":" + byteCountToDisplaySize(size));
+ LOG.info("Success: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size));
}
return A;
} catch(jcuda.CudaException e) {
@@ -203,7 +203,7 @@ public class GPUMemoryManager {
GPUStatistics.cudaAllocCount.increment();
}
if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) {
- LOG.info("Failed: " + printDebugMessage + ":" + byteCountToDisplaySize(size));
+ LOG.info("Failed: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size));
LOG.info("GPU Memory info " + printDebugMessage + ":" + toString());
}
return null;
@@ -224,28 +224,15 @@ public class GPUMemoryManager {
return "->" + stackTrace[index].getClassName() + "." + stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" + stackTrace[index].getLineNumber() + ")";
}
- /**
- * Pretty printing utility to print bytes
- *
- * @param numBytes number of bytes
- * @return a human-readable display value
- */
- private String byteCountToDisplaySize(long numBytes) {
- // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding
- if (numBytes < 1024) {
- return numBytes + " bytes";
- }
- else {
- int exp = (int) (Math.log(numBytes) / 6.931471805599453);
- return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1));
- }
- }
public boolean canAllocateWithoutEviction(String opcode, long size) {
return lazyCudaFreeMemoryManager.contains(opcode, size) || allocator.canAllocate(size) ||
lazyCudaFreeMemoryManager.containsRmvarPointerMinSize(opcode, size) ;
}
+ long peakSize = 0;
+ long currentSize = 0;
+
/**
* Allocate pointer of the given size in bytes.
*
@@ -255,12 +242,19 @@ public class GPUMemoryManager {
*/
public Pointer malloc(String opcode, long size) {
if(size <= 0) {
- throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
+ throw new DMLRuntimeException("Cannot allocate memory of size " + GPUStatistics.byteCountToDisplaySize(size));
}
if(DEBUG_MEMORY_LEAK) {
LOG.info("GPU Memory info during malloc:" + toString());
}
+ if(ConfigurationManager.isStatistics()) {
+ GPUStatistics.cudaAllocAggSize.add(size);
+ currentSize += size;
+ peakSize = Math.max(currentSize, peakSize);
+ GPUStatistics.cudaAllocPeakSize.set(peakSize);
+ }
+
// Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory
Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);
@@ -358,7 +352,7 @@ public class GPUMemoryManager {
}
if(A == null) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
+ throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + GPUStatistics.byteCountToDisplaySize(size) + ". \n "
+ toString());
}
@@ -377,6 +371,10 @@ public class GPUMemoryManager {
boolean eagerDelete = true;
if(gpuObj.isDirty()) {
// Eviction
+ if(ConfigurationManager.isStatistics()) {
+ long size = gpuObj.getSizeOnDevice();
+ GPUStatistics.cudaEvictAggSize.add(size);
+ }
gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete);
}
else {
@@ -416,7 +414,7 @@ public class GPUMemoryManager {
if(allPointers.containsKey(toFree)) {
long size = allPointers.get(toFree).getSizeInBytes();
if(LOG.isTraceEnabled()) {
- LOG.trace("Free-ing up the pointer of size " + byteCountToDisplaySize(size));
+ LOG.trace("Free-ing up the pointer of size " + GPUStatistics.byteCountToDisplaySize(size));
}
allPointers.remove(toFree);
lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
@@ -441,6 +439,10 @@ public class GPUMemoryManager {
public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException {
if(LOG.isTraceEnabled())
LOG.trace("Free-ing the pointer with eager=" + eager);
+ long size = allPointers.get(toFree).getSizeInBytes();
+ if(ConfigurationManager.isStatistics()) {
+ currentSize -= size;
+ }
if (eager) {
long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
guardedCudaFree(toFree);
@@ -451,7 +453,6 @@ public class GPUMemoryManager {
LOG.info("GPU memory info before failure:" + toString());
throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up");
}
- long size = allPointers.get(toFree).getSizeInBytes();
lazyCudaFreeMemoryManager.add(size, toFree);
}
}
@@ -604,24 +605,24 @@ public class GPUMemoryManager {
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "",
"Num Objects", "Num Pointers", "Size"));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects",
- numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
+ numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects",
- numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
+ numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects",
- numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects)));
+ numLockedGPUObjects, numLockedPointers, GPUStatistics.byteCountToDisplaySize(sizeOfLockedGPUObjects)));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers",
- "-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
+ "-", lazyCudaFreeMemoryManager.getNumPointers(), GPUStatistics.byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers",
- "-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
+ "-", potentiallyLeakyPointers.size(), GPUStatistics.byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers",
- "-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated)));
+ "-", allPointers.size(), GPUStatistics.byteCountToDisplaySize(totalMemoryAllocated)));
long free[] = { 0 };
long total[] = { 0 };
cudaMemGetInfo(free, total);
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)",
- "-", "-", byteCountToDisplaySize(free[0])));
+ "-", "-", GPUStatistics.byteCountToDisplaySize(free[0])));
ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)",
- "-", "-", byteCountToDisplaySize(total[0])));
+ "-", "-", GPUStatistics.byteCountToDisplaySize(total[0])));
ret.append("====================================================\n");
return ret.toString();
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 43e2727..72d3170 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -111,11 +111,7 @@ public class GPUObject {
*/
public Pointer getDensePointer() {
if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) {
- try {
- shadowBuffer.moveToDevice();
- } catch (IOException e) {
- throw new DMLRuntimeException("Error moving the data from shadow buffer to the device", e);
- }
+ shadowBuffer.moveToDevice();
}
return jcudaDenseMatrixPtr;
}
@@ -939,21 +935,13 @@ public class GPUObject {
else {
// If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction),
// then copy from shadow buffer to MatrixObject.
- try {
- shadowBuffer.moveToHost();
- } catch (IOException e) {
- throw new DMLRuntimeException("Error moving the data from shadow buffer to the host memory", e);
- }
+ shadowBuffer.moveToHost();
return;
}
}
else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) {
// Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer.
- try {
- shadowBuffer.moveFromDevice(instName);
- } catch (IOException e) {
- throw new DMLRuntimeException("Error moving the data from the device to the shadow buffer", e);
- }
+ shadowBuffer.moveFromDevice(instName);
return;
}
else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
index 4c534a0..88ea972 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -20,65 +20,41 @@ package org.apache.sysml.runtime.instructions.gpu.context;
import static jcuda.runtime.JCuda.cudaMemcpy;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.concurrent.atomic.AtomicLong;
-
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.PersistentLRUCache;
import jcuda.Pointer;
+import jcuda.Sizeof;
-/**
- * Shadow buffer is a temporary staging area used during eviction.
- * It is eagerly deleted and backed using the local filesystem in case of Garbage Collection
- * or if the staging memory size exceeds the user-specified size.
- * This is needed to respect SystemML's memory estimates, while still allowing
- * for caching in case of GPU plans.
- */
public class ShadowBuffer {
private static final Log LOG = LogFactory.getLog(ShadowBuffer.class.getName());
- private static PersistentLRUCache CACHE;
- private static AtomicLong UNIQUE_ID = new AtomicLong();
- private static long EVICTION_SHADOW_BUFFER_MAX_BYTES;
- final GPUObject gpuObj;
- boolean isBuffered = false;
- String fileName;
- public static boolean isEnabled() {
- if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) {
+ GPUObject gpuObj;
+ float[] shadowPointer = null;
+ private static boolean _warnedAboutShadowBuffer = false;
+ private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;
+ private static long EVICTION_SHADOW_BUFFER_MAX_BYTES;
+ static {
+ if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) {
+ EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
+ }
+ else {
double shadowBufferSize = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
- if(shadowBufferSize <= 0) {
- EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid unnecessary invoking configuration manager.
- }
- else {
- if(shadowBufferSize > 1)
- throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
- EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
- try {
- CACHE = new PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES);
- } catch(IOException e) {
- LOG.warn("Unable to create a temporary directory for shadow buffering on the local filesystem; disabling shadow buffering:" + e.getMessage());
- EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid checking for file permission.
- }
- }
+ if(shadowBufferSize < 0 || shadowBufferSize > 1)
+ throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+ EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
}
- return CACHE != null;
}
public ShadowBuffer(GPUObject gpuObj) {
- if(isEnabled())
- fileName = "shadow_" + UNIQUE_ID.incrementAndGet();
this.gpuObj = gpuObj;
-
}
/**
@@ -87,39 +63,19 @@ public class ShadowBuffer {
* @return true if the gpu object is shadow buffered
*/
public boolean isBuffered() {
- return isBuffered;
- }
-
- private static long getSizeOfDataType(long numElems) {
- return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+ return shadowPointer != null;
}
/**
* Move the data from GPU to shadow buffer
* @param instName name of the instruction
- * @throws IOException if error
- * @throws FileNotFoundException if error
*/
- public void moveFromDevice(String instName) throws FileNotFoundException, IOException {
+ public void moveFromDevice(String instName) {
long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
int numElems = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-
- if(isDoublePrecision()) {
- double [] shadowPointer = new double[numElems];
- cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
- CACHE.put(fileName, shadowPointer);
- isBuffered = true;
- }
- else if(isSinglePrecision()) {
- float [] shadowPointer = new float[numElems];
- cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
- CACHE.put(fileName, shadowPointer);
- isBuffered = true;
- }
- else {
- throw new DMLRuntimeException("Unsupported datatype");
- }
-
+ shadowPointer = new float[numElems];
+ EVICTION_SHADOW_BUFFER_CURR_BYTES += getSizeOfFloat(shadowPointer.length);
+ cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
gpuObj.getGPUContext().cudaFreeHelper(instName, gpuObj.jcudaDenseMatrixPtr, true);
gpuObj.jcudaDenseMatrixPtr = null;
if (ConfigurationManager.isStatistics()) {
@@ -131,36 +87,24 @@ public class ShadowBuffer {
}
}
-
- private static boolean isDoublePrecision() {
- return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE;
+ private long getSizeOfFloat(long numElems) {
+ return numElems*Sizeof.FLOAT;
}
- private static boolean isSinglePrecision() {
- return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT;
+ private long getSizeOfDataType(long numElems) {
+ return numElems*LibMatrixCUDA.sizeOfDataType;
}
/**
* Move the data from shadow buffer to Matrix object
- * @throws IOException if error
- * @throws FileNotFoundException if error
*/
- public void moveToHost() throws FileNotFoundException, IOException {
+ public void moveToHost() {
long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
MatrixBlock tmp = new MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false);
tmp.allocateDenseBlock();
double [] tmpArr = tmp.getDenseBlockValues();
- if(isDoublePrecision()) {
- System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, tmpArr, 0, tmpArr.length);
- }
- else if(isSinglePrecision()) {
- float [] shadowPointer = CACHE.getAsFloatArray(fileName);
- for(int i = 0; i < shadowPointer.length; i++) {
- tmpArr[i] = shadowPointer[i];
- }
- }
- else {
- throw new DMLRuntimeException("Unsupported datatype");
+ for(int i = 0; i < shadowPointer.length; i++) {
+ tmpArr[i] = shadowPointer[i];
}
gpuObj.mat.acquireModify(tmp);
gpuObj.mat.release();
@@ -178,28 +122,12 @@ public class ShadowBuffer {
/**
* Move the data from shadow buffer to GPU
- * @throws IOException if error
- * @throws FileNotFoundException if error
*/
- public void moveToDevice() throws FileNotFoundException, IOException {
+ public void moveToDevice() {
long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
- int length; Pointer shadowDevicePointer;
- if(isDoublePrecision()) {
- double [] shadowPointer = CACHE.getAsDoubleArray(fileName);
- length = shadowPointer.length;
- shadowDevicePointer = Pointer.to(shadowPointer);
- }
- else if(isSinglePrecision()) {
- float [] shadowPointer = CACHE.getAsFloatArray(fileName);
- length = shadowPointer.length;
- shadowDevicePointer = Pointer.to(shadowPointer);
- }
- else {
- throw new DMLRuntimeException("Unsupported datatype");
- }
- long numBytes = getSizeOfDataType(length);
+ long numBytes = getSizeOfDataType(shadowPointer.length);
gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes);
- cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+ cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
clearShadowPointer();
if (ConfigurationManager.isStatistics()) {
long totalTime = System.nanoTime() - start;
@@ -216,14 +144,14 @@ public class ShadowBuffer {
* @return true if the given GPU object is eligible to be shadow buffered
*/
public boolean isEligibleForBuffering(boolean isEviction, boolean eagerDelete) {
- if(isEnabled() && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
- long numBytes = getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
- if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) {
- return false; // Don't attempt to cache very large GPU objects.
- }
- else {
- return true; // Dense GPU objects is eligible for shadow buffering when called during eviction and is being eagerly deleted.
+ if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
+ long numBytes = getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
+ boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES;
+ if(!ret && !_warnedAboutShadowBuffer) {
+ LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
+ _warnedAboutShadowBuffer = true;
}
+ return ret;
}
else {
return false;
@@ -234,9 +162,9 @@ public class ShadowBuffer {
* Removes the content from shadow buffer
*/
public void clearShadowPointer() {
- if(CACHE.containsKey(fileName)) {
- CACHE.remove(fileName);
- isBuffered = false;
+ if(shadowPointer != null) {
+ EVICTION_SHADOW_BUFFER_CURR_BYTES -= getSizeOfFloat(shadowPointer.length);
}
+ shadowPointer = null;
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index e748057..541850d 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -26,6 +26,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import org.apache.sysml.conf.ConfigurationManager;
@@ -78,6 +79,10 @@ public class GPUStatistics {
public static LongAdder cudaAllocSuccessCount = new LongAdder();
public static LongAdder cudaAllocFailedCount = new LongAdder();
public static LongAdder cudaAllocReuseCount = new LongAdder();
+
+ public static LongAdder cudaAllocAggSize = new LongAdder();
+ public static AtomicLong cudaAllocPeakSize = new AtomicLong();
+ public static LongAdder cudaEvictAggSize = new LongAdder();
// Per instruction miscellaneous timers.
// Used to record events in a CP Heavy Hitter instruction and
@@ -116,6 +121,9 @@ public class GPUStatistics {
cudaDouble2FloatCount.reset();
cudaForcedClearLazyFreedEvictTime.reset();
cudaForcedClearUnpinnedEvictTime.reset();
+ cudaAllocAggSize.reset();
+ cudaAllocPeakSize.set(0);
+ cudaEvictAggSize.reset();
cudaAllocCount.reset();
cudaDeAllocCount.reset();
cudaToDevCount.reset();
@@ -218,6 +226,23 @@ public class GPUStatistics {
}
return sb.toString();
}
+
+ /**
+ * Pretty printing utility to print bytes
+ *
+ * @param numBytes number of bytes
+ * @return a human-readable display value
+ */
+ public static String byteCountToDisplaySize(long numBytes) {
+ // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding
+ if (numBytes < 1024) {
+ return numBytes + " bytes";
+ }
+ else {
+ int exp = (int) (Math.log(numBytes) / 6.931471805599453);
+ return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1));
+ }
+ }
/**
* Used to print out cuda timers & counters
@@ -242,6 +267,10 @@ public class GPUStatistics {
+ cudaAllocReuseCount.longValue() +") / "
+ cudaDeAllocCount.longValue() + " / "
+ cudaMemSet0Count.longValue() + ".\n");
+ sb.append("GPU mem size (alloc (peak) / evict):\t"
+ + byteCountToDisplaySize(cudaAllocAggSize.longValue()) + "("
+ + byteCountToDisplaySize(cudaAllocPeakSize.longValue()) + ") / "
+ + byteCountToDisplaySize(cudaEvictAggSize.longValue()) + ".\n");
sb.append("GPU mem tx time (toDev(d2f/s2d) / fromDev(f2d/s2h) / evict(d2s/size)):\t"
+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "("
+ String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ "/"
http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
index 71a1e28..d9d9337 100644
--- a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
+++ b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
@@ -519,10 +519,6 @@ class ValueWrapper {
long _clen;
long _nnz;
- // This is only used in write-mode until the writing to the disk is completed.
- // It also prevents the _softRef from being garbage collected while it is written.
- volatile DataWrapper _strongRef;
-
ValueWrapper(DataWrapper data, boolean isInReadOnlyMode) {
_lock = new Object();
_isInReadOnlyMode = isInReadOnlyMode;
@@ -530,12 +526,10 @@ class ValueWrapper {
if(!_isInReadOnlyMode && !isDummyValue) {
// Aggressive write to disk when the cache is used in the write-mode.
// This avoids the need to depend on finalize to perform writing.
- _strongRef = data;
Thread t = new Thread() {
public void run() {
try {
- _strongRef.write(true);
- _strongRef = null; // Reset the strong reference after aggresive writing
+ data.write(true);
} catch (IOException e) {
throw new DMLRuntimeException("Error occured while aggressively writing the value to disk.", e);
}