You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/09/20 22:00:21 UTC
systemml git commit: [SYSTEMML-445] Added memory stats for GPU allocation/eviction

Repository: systemml
Updated Branches:
  refs/heads/master 69624850e -> f46279a17


[SYSTEMML-445] Added memory stats for GPU allocation/eviction

- Also reverted the shadow buffer to its original implementation because we were getting OOM (out-of-memory) errors for LSTM scripts. This is likely related to pessimistic GC.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f46279a1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f46279a1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f46279a1

Branch: refs/heads/master
Commit: f46279a17031d3f8827923f6eddd614c3eac77d3
Parents: 6962485
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 20 14:56:51 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 20 14:56:51 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   8 +-
 .../gpu/context/GPUMemoryManager.java           |  61 ++++----
 .../instructions/gpu/context/GPUObject.java     |  18 +--
 .../instructions/gpu/context/ShadowBuffer.java  | 154 +++++--------------
 .../org/apache/sysml/utils/GPUStatistics.java   |  29 ++++
 .../apache/sysml/utils/PersistentLRUCache.java  |   8 +-
 6 files changed, 108 insertions(+), 170 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 3925c4e..7b535c9 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -105,11 +105,9 @@
    <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
    <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
    
-   <!-- Advanced optimization: maximum fraction of driver memory to use for GPU shadow buffer. 
-   Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is backed by org.apache.sysml.utils.PersistentLRUCache.
-   Setting this to zero disables shadow buffering. If you intend to train network larger than GPU memory size, 
-   consider using large driver memory and setting this to a value greater than 0. -->
-   <sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize>
+   <!-- Advanced optimization: fraction of driver memory to use for the GPU shadow buffer. This optimization is ignored for double precision.
+   By default, it is disabled (i.e., set to 0.0). If you intend to train a network larger than the GPU memory size, consider using single precision and setting this to 0.1. -->
+   <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize>
    
    <!-- Fraction of available GPU memory to use. This is similar to TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 0.9) -->
    <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor>

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 033051a..57b76f6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -191,7 +191,7 @@ public class GPUMemoryManager {
 				GPUStatistics.cudaAllocCount.increment();
 			}
 			if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) )  {
-				LOG.info("Success: " + printDebugMessage + ":" + byteCountToDisplaySize(size));
+				LOG.info("Success: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size));
 			}
 			return A;
 		} catch(jcuda.CudaException e) {
@@ -203,7 +203,7 @@ public class GPUMemoryManager {
 				GPUStatistics.cudaAllocCount.increment();
 			}
 			if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) )  {
-				LOG.info("Failed: " + printDebugMessage + ":" + byteCountToDisplaySize(size));
+				LOG.info("Failed: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size));
 				LOG.info("GPU Memory info " + printDebugMessage + ":" + toString());
 			}
 			return null;
@@ -224,28 +224,15 @@ public class GPUMemoryManager {
 			return "->" + stackTrace[index].getClassName() + "." + stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" + stackTrace[index].getLineNumber() + ")";
 	}
 	
-	/**
-	 * Pretty printing utility to print bytes
-	 * 
-	 * @param numBytes number of bytes
-	 * @return a human-readable display value
-	 */
-	private String byteCountToDisplaySize(long numBytes) {
-		// return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding
-	    if (numBytes < 1024) { 
-	    	return numBytes + " bytes";
-	    }
-	    else {
-		    int exp = (int) (Math.log(numBytes) / 6.931471805599453);
-		    return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1));
-	    }
-	}
 	
 	public boolean canAllocateWithoutEviction(String opcode, long size) {
 		return lazyCudaFreeMemoryManager.contains(opcode, size) || allocator.canAllocate(size) ||
 			lazyCudaFreeMemoryManager.containsRmvarPointerMinSize(opcode, size) ;
 	}
 	
+	long peakSize = 0;
+	long currentSize = 0;
+	
 	/**
 	 * Allocate pointer of the given size in bytes.
 	 * 
@@ -255,12 +242,19 @@ public class GPUMemoryManager {
 	 */
 	public Pointer malloc(String opcode, long size) {
 		if(size <= 0) {
-			throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
+			throw new DMLRuntimeException("Cannot allocate memory of size " + GPUStatistics.byteCountToDisplaySize(size));
 		}
 		if(DEBUG_MEMORY_LEAK) {
 			LOG.info("GPU Memory info during malloc:" + toString());
 		}
 		
+		if(ConfigurationManager.isStatistics()) {
+			GPUStatistics.cudaAllocAggSize.add(size);
+			currentSize += size;
+			peakSize = Math.max(currentSize, peakSize);
+			GPUStatistics.cudaAllocPeakSize.set(peakSize);
+		}
+		
 		// Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory
 		Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);
 		
@@ -358,7 +352,7 @@ public class GPUMemoryManager {
 		}
 		
 		if(A == null) {
-			throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
+			throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + GPUStatistics.byteCountToDisplaySize(size) + ". \n "
 					+ toString());
 		}
 		
@@ -377,6 +371,10 @@ public class GPUMemoryManager {
 		boolean eagerDelete = true;
 		if(gpuObj.isDirty()) {
 			// Eviction
+			if(ConfigurationManager.isStatistics()) {
+				long size = gpuObj.getSizeOnDevice();
+				GPUStatistics.cudaEvictAggSize.add(size);
+			}
 			gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete);
 		}
 		else {
@@ -416,7 +414,7 @@ public class GPUMemoryManager {
 		if(allPointers.containsKey(toFree)) {
 			long size = allPointers.get(toFree).getSizeInBytes();
 			if(LOG.isTraceEnabled()) {
-				LOG.trace("Free-ing up the pointer of size " +  byteCountToDisplaySize(size));
+				LOG.trace("Free-ing up the pointer of size " +  GPUStatistics.byteCountToDisplaySize(size));
 			}
 			allPointers.remove(toFree);
 			lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
@@ -441,6 +439,10 @@ public class GPUMemoryManager {
 	public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled())
 			LOG.trace("Free-ing the pointer with eager=" + eager);
+		long size = allPointers.get(toFree).getSizeInBytes();
+		if(ConfigurationManager.isStatistics()) {
+			currentSize -= size;
+		}
 		if (eager) {
 			long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
 			guardedCudaFree(toFree);
@@ -451,7 +453,6 @@ public class GPUMemoryManager {
 				LOG.info("GPU memory info before failure:" + toString());
 				throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up");
 			}
-			long size = allPointers.get(toFree).getSizeInBytes();
 			lazyCudaFreeMemoryManager.add(size, toFree);
 		}
 	}
@@ -604,24 +605,24 @@ public class GPUMemoryManager {
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", 
 				"Num Objects", "Num Pointers", "Size"));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", 
-				numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
+				numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects", 
-				numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
+				numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", 
-				numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects)));
+				numLockedGPUObjects, numLockedPointers, GPUStatistics.byteCountToDisplaySize(sizeOfLockedGPUObjects)));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers", 
-				"-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
+				"-", lazyCudaFreeMemoryManager.getNumPointers(), GPUStatistics.byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers", 
-				"-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
+				"-", potentiallyLeakyPointers.size(), GPUStatistics.byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers", 
-				"-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated)));
+				"-", allPointers.size(), GPUStatistics.byteCountToDisplaySize(totalMemoryAllocated)));
 		long free[] = { 0 };
 		long total[] = { 0 };
 		cudaMemGetInfo(free, total);
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)", 
-				"-", "-", byteCountToDisplaySize(free[0])));
+				"-", "-", GPUStatistics.byteCountToDisplaySize(free[0])));
 		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)", 
-				"-", "-", byteCountToDisplaySize(total[0])));
+				"-", "-", GPUStatistics.byteCountToDisplaySize(total[0])));
 		ret.append("====================================================\n");
 		return ret.toString();
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 43e2727..72d3170 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -111,11 +111,7 @@ public class GPUObject {
 	 */
 	public Pointer getDensePointer() {
 		if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) {
-			try {
-				shadowBuffer.moveToDevice();
-			} catch (IOException e) {
-				throw new DMLRuntimeException("Error moving the data from shadow buffer to the device", e);
-			}
+			shadowBuffer.moveToDevice();
 		}
 		return jcudaDenseMatrixPtr;
 	}
@@ -939,21 +935,13 @@ public class GPUObject {
 			else {
 				// If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction),
 				// then copy from shadow buffer to MatrixObject.
-				try {
-					shadowBuffer.moveToHost();
-				} catch (IOException e) {
-					throw new DMLRuntimeException("Error moving the data from shadow buffer to the host memory", e);
-				}
+				shadowBuffer.moveToHost();
 				return;
 			}
 		}
 		else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) {
 			// Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. 
-			try {
-				shadowBuffer.moveFromDevice(instName);
-			} catch (IOException e) {
-				throw new DMLRuntimeException("Error moving the data from the device to the shadow buffer", e);
-			}
+			shadowBuffer.moveFromDevice(instName);
 			return;
 		}
 		else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
index 4c534a0..88ea972 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -20,65 +20,41 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.runtime.JCuda.cudaMemcpy;
 
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.concurrent.atomic.AtomicLong;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.PersistentLRUCache;
 
 import jcuda.Pointer;
+import jcuda.Sizeof;
 
-/**
- * Shadow buffer is a temporary staging area used during eviction.
- * It is eagerly deleted and backed using the local filesystem in case of Garbage Collection
- * or if the staging memory size exceeds the user-specified size.
- * This is needed to respect SystemML's memory estimates, while still allowing
- * for caching in case of GPU plans.
- */
 public class ShadowBuffer {
 	private static final Log LOG = LogFactory.getLog(ShadowBuffer.class.getName());
-	private static PersistentLRUCache CACHE;
-	private static AtomicLong UNIQUE_ID = new AtomicLong();
-	private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; 
-	final GPUObject gpuObj;
-	boolean isBuffered = false;
-	String fileName;
 	
-	public static boolean isEnabled() {
-		if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) {
+	GPUObject gpuObj;
+	float[] shadowPointer = null;
+	private static boolean _warnedAboutShadowBuffer = false;
+	private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;
+	private static long EVICTION_SHADOW_BUFFER_MAX_BYTES;
+	static {
+		if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) {
+			EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
+		}
+		else {
 			double shadowBufferSize = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-			if(shadowBufferSize <= 0) {
-				EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid unnecessary invoking configuration manager.
-			}
-			else {
-				if(shadowBufferSize > 1) 
-					throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-				EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
-				try {
-					CACHE = new PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES);
-				} catch(IOException e) {
-					LOG.warn("Unable to create a temporary directory for shadow buffering on the local filesystem; disabling shadow buffering:" + e.getMessage());
-					EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid checking for file permission.
-				}
-			}
+			if(shadowBufferSize < 0 || shadowBufferSize > 1) 
+				throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+			EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
 		}
-		return CACHE != null;
 	}
 	
 	public ShadowBuffer(GPUObject gpuObj) {
-		if(isEnabled())
-			fileName = "shadow_" + UNIQUE_ID.incrementAndGet();
 		this.gpuObj = gpuObj;
-		
 	}
 	
 	/**
@@ -87,39 +63,19 @@ public class ShadowBuffer {
 	 * @return true if the gpu object is shadow buffered
 	 */
 	public boolean isBuffered() {
-		return isBuffered;
-	}
-	
-	private static long getSizeOfDataType(long numElems) {
-		return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+		return shadowPointer != null;
 	}
 	
 	/**
 	 * Move the data from GPU to shadow buffer 
 	 * @param instName name of the instruction
-	 * @throws IOException if error 
-	 * @throws FileNotFoundException  if error
 	 */
-	public void moveFromDevice(String instName) throws FileNotFoundException, IOException {
+	public void moveFromDevice(String instName) {
 		long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
 		int numElems = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-	
-		if(isDoublePrecision()) {
-			double [] shadowPointer = new double[numElems];
-			cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
-			CACHE.put(fileName, shadowPointer);
-			isBuffered = true;
-		}
-		else if(isSinglePrecision()) {
-			float [] shadowPointer = new float[numElems];
-			cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
-			CACHE.put(fileName, shadowPointer);
-			isBuffered = true;
-		}
-		else {
-			throw new DMLRuntimeException("Unsupported datatype");
-		}
-		
+		shadowPointer = new float[numElems];
+		EVICTION_SHADOW_BUFFER_CURR_BYTES += getSizeOfFloat(shadowPointer.length);
+		cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
 		gpuObj.getGPUContext().cudaFreeHelper(instName, gpuObj.jcudaDenseMatrixPtr, true);
 		gpuObj.jcudaDenseMatrixPtr = null;
 		if (ConfigurationManager.isStatistics()) {
@@ -131,36 +87,24 @@ public class ShadowBuffer {
 		}
 	}
 	
-
-	private static boolean isDoublePrecision() {
-		return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE;
+	private long getSizeOfFloat(long numElems) {
+		return numElems*Sizeof.FLOAT;
 	}
 	
-	private static boolean isSinglePrecision() {
-		return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT;
+	private long getSizeOfDataType(long numElems) {
+		return numElems*LibMatrixCUDA.sizeOfDataType;
 	}
 	
 	/**
 	 * Move the data from shadow buffer to Matrix object
-	 * @throws IOException if error 
-	 * @throws FileNotFoundException if error
 	 */
-	public void moveToHost() throws FileNotFoundException, IOException {
+	public void moveToHost() {
 		long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
 		MatrixBlock tmp = new MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false);
 		tmp.allocateDenseBlock();
 		double [] tmpArr = tmp.getDenseBlockValues();
-		if(isDoublePrecision()) {
-			System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, tmpArr, 0, tmpArr.length);
-		}
-		else if(isSinglePrecision()) {
-			float [] shadowPointer = CACHE.getAsFloatArray(fileName);
-			for(int i = 0; i < shadowPointer.length; i++) {
-				tmpArr[i] = shadowPointer[i];
-			}
-		}
-		else {
-			throw new DMLRuntimeException("Unsupported datatype");
+		for(int i = 0; i < shadowPointer.length; i++) {
+			tmpArr[i] = shadowPointer[i];
 		}
 		gpuObj.mat.acquireModify(tmp);
 		gpuObj.mat.release();
@@ -178,28 +122,12 @@ public class ShadowBuffer {
 	
 	/**
 	 * Move the data from shadow buffer to GPU
-	 * @throws IOException if error
-	 * @throws FileNotFoundException if error
 	 */
-	public void moveToDevice() throws FileNotFoundException, IOException {
+	public void moveToDevice() {
 		long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
-		int length; Pointer shadowDevicePointer;
-		if(isDoublePrecision()) {
-			double [] shadowPointer = CACHE.getAsDoubleArray(fileName);
-			length = shadowPointer.length;
-			shadowDevicePointer = Pointer.to(shadowPointer);
-		}
-		else if(isSinglePrecision()) {
-			float [] shadowPointer = CACHE.getAsFloatArray(fileName);
-			length = shadowPointer.length;
-			shadowDevicePointer = Pointer.to(shadowPointer);
-		}
-		else {
-			throw new DMLRuntimeException("Unsupported datatype");
-		}
-		long numBytes = getSizeOfDataType(length);
+		long numBytes = getSizeOfDataType(shadowPointer.length);
 		gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes);
-		cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+		cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
 		clearShadowPointer();
 		if (ConfigurationManager.isStatistics()) {
 			long totalTime = System.nanoTime() - start;
@@ -216,14 +144,14 @@ public class ShadowBuffer {
 	 * @return true if the given GPU object is eligible to be shadow buffered
 	 */
 	public boolean isEligibleForBuffering(boolean isEviction, boolean eagerDelete) {
-		if(isEnabled() && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
-			long numBytes = getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-			if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) {
-				return false; // Don't attempt to cache very large GPU objects.
-			}
-			else {
-				return true; // Dense GPU objects is eligible for shadow buffering when called during eviction and is being eagerly deleted.
+		if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
+			long numBytes = getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
+			boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES;
+			if(!ret && !_warnedAboutShadowBuffer) {
+				LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
+				_warnedAboutShadowBuffer = true;
 			}
+			return ret;
 		}
 		else {
 			return false;
@@ -234,9 +162,9 @@ public class ShadowBuffer {
 	 * Removes the content from shadow buffer
 	 */
 	public void clearShadowPointer() {
-		if(CACHE.containsKey(fileName)) {
-			CACHE.remove(fileName);
-			isBuffered = false;
+		if(shadowPointer != null) {
+			EVICTION_SHADOW_BUFFER_CURR_BYTES -= getSizeOfFloat(shadowPointer.length);
 		}
+		shadowPointer = null;
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index e748057..541850d 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
 import org.apache.sysml.conf.ConfigurationManager;
@@ -78,6 +79,10 @@ public class GPUStatistics {
 	public static LongAdder cudaAllocSuccessCount = new LongAdder();
 	public static LongAdder cudaAllocFailedCount = new LongAdder();
 	public static LongAdder cudaAllocReuseCount = new LongAdder();
+	
+	public static LongAdder cudaAllocAggSize = new LongAdder();
+	public static AtomicLong cudaAllocPeakSize = new AtomicLong();
+	public static LongAdder cudaEvictAggSize = new LongAdder();
 
 	// Per instruction miscellaneous timers.
 	// Used to record events in a CP Heavy Hitter instruction and
@@ -116,6 +121,9 @@ public class GPUStatistics {
 		cudaDouble2FloatCount.reset();
 		cudaForcedClearLazyFreedEvictTime.reset();
 		cudaForcedClearUnpinnedEvictTime.reset();
+		cudaAllocAggSize.reset();
+		cudaAllocPeakSize.set(0);
+		cudaEvictAggSize.reset();
 		cudaAllocCount.reset();
 		cudaDeAllocCount.reset();
 		cudaToDevCount.reset();
@@ -218,6 +226,23 @@ public class GPUStatistics {
 		}
 		return sb.toString();
 	}
+	
+	/**
+	 * Pretty printing utility to print bytes using 1024-based units (KB, MB, GB, TB, PB, EB).
+	 * Values below 1024 (including zero and negative values) are printed as the plain number followed by " bytes".
+	 * 
+	 * @param numBytes number of bytes
+	 * @return a human-readable display value; scaled values are formatted with three decimal places
+	 */
+	public static String byteCountToDisplaySize(long numBytes) {
+		// Not delegating to org.apache.commons.io.FileUtils.byteCountToDisplaySize because it performs rounding.
+		if (numBytes < 1024) {
+			return numBytes + " bytes";
+		}
+		// 6.931471805599453 is ln(1024); a long is below 1024^7, so exp lies in [1, 6] and 'E' covers values >= 2^60.
+		int exp = (int) (Math.log(numBytes) / 6.931471805599453);
+		return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTPE".charAt(exp-1));
+	}
 
 	/**
 	 * Used to print out cuda timers & counters
@@ -242,6 +267,10 @@ public class GPUStatistics {
 				+ cudaAllocReuseCount.longValue() +") / "
 				+ cudaDeAllocCount.longValue() + " / "
 				+ cudaMemSet0Count.longValue() + ".\n");
+		sb.append("GPU mem size (alloc (peak) / evict):\t"
+				+ byteCountToDisplaySize(cudaAllocAggSize.longValue()) + "("
+				+ byteCountToDisplaySize(cudaAllocPeakSize.longValue()) + ") / "
+				+ byteCountToDisplaySize(cudaEvictAggSize.longValue()) + ".\n");
 		sb.append("GPU mem tx time  (toDev(d2f/s2d) / fromDev(f2d/s2h) / evict(d2s/size)):\t"
 				+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "("
 				+ String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ "/"

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
index 71a1e28..d9d9337 100644
--- a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
+++ b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
@@ -519,10 +519,6 @@ class ValueWrapper {
 	long _clen;
 	long _nnz;
 	
-	// This is only used in write-mode until the writing to the disk is completed.
-	// It also prevents the _softRef from being garbage collected while it is written.
-	volatile DataWrapper _strongRef; 
-	
 	ValueWrapper(DataWrapper data, boolean isInReadOnlyMode) {
 		_lock = new Object();
 		_isInReadOnlyMode = isInReadOnlyMode;
@@ -530,12 +526,10 @@ class ValueWrapper {
 		if(!_isInReadOnlyMode && !isDummyValue) {
 			// Aggressive write to disk when the cache is used in the write-mode.
 			// This avoids the need to depend on finalize to perform writing.
-			_strongRef = data;
 			Thread t = new Thread() {
 			    public void run() {
 			    	try {
-						_strongRef.write(true);
-						_strongRef = null; // Reset the strong reference after aggresive writing
+			    		data.write(true);
 					} catch (IOException e) {
 						throw new DMLRuntimeException("Error occured while aggressively writing the value to disk.", e);
 					}