Posted to commits@systemds.apache.org by ar...@apache.org on 2021/05/15 17:39:43 UTC

[systemds] branch master updated: [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new e75c865  [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU
e75c865 is described below

commit e75c86547ae8d2733a1e08a0befa6fa04d933bda
Author: arnabp <ar...@tugraz.at>
AuthorDate: Sat May 15 19:34:30 2021 +0200

    [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU
    
    This patch introduces a background thread for lineage cache eviction
    in GPU. The idea is to evict a few non-live variables while the
    current instruction is running on the CPU. We submit a task in the
    pre-processing phase of a CPU instruction. The eviction loop breaks
    if 1) the requested number of entries has been evicted, 2) the
    parallel CPU instruction has ended (a flag is set in post-processing),
    or 3) no entries are left for eviction.
    
    In addition, we introduce logic to measure the DtoH eviction
    bandwidth in real time and apply exponential smoothing to predict
    future IO. We need this estimate to decide whether an eviction is
    worthwhile. However, the measured PCIe3 x16 bandwidth can vary
    greatly, and a memcpy lazily executes the pending kernels, which
    makes the estimated bandwidth flaky. Both of these features are
    disabled for now.
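    
    For illustration, the overall control flow of the new mechanism
    roughly follows the sketch below. This is a simplified, self-contained
    Java sketch, not the SystemDS code: the flag and executor names mirror
    this patch, but hasNonLiveEntries() and evictOneEntry() are hypothetical
    stand-ins for the LineageGPUCacheEviction logic.
    
        import java.util.concurrent.ExecutorService;
        import java.util.concurrent.Executors;
        
        public class BackgroundEvictionSketch {
            // volatile so a flag set in postprocess is visible to the worker
            static volatile boolean STOPBACKGROUNDEVICTION = false;
            static final ExecutorService gpuEvictionThread =
                Executors.newSingleThreadExecutor();
        
            // called in the pre-processing phase of a CPU instruction
            static void preprocess(int numEvicts) {
                STOPBACKGROUNDEVICTION = false;
                gpuEvictionThread.submit(() -> {
                    int count = 0;
                    // stop if the requested count is reached, the CPU
                    // instruction has ended, or nothing is left to evict
                    while (count < numEvicts && !STOPBACKGROUNDEVICTION
                        && hasNonLiveEntries()) {
                        evictOneEntry(); // device-to-host copy + free (stand-in)
                        count++;
                    }
                });
            }
        
            // called in the post-processing phase of the same instruction
            static void postprocess() {
                STOPBACKGROUNDEVICTION = true;
            }
        
            // hypothetical stand-ins for the real lineage-cache operations
            static boolean hasNonLiveEntries() { return false; }
            static void evictOneEntry() {}
        
            public static void main(String[] args) {
                preprocess(1);  // evict at most one entry while the CPU op runs
                postprocess();  // signal the worker to stop
                gpuEvictionThread.shutdown();
            }
        }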
---
 .../org/apache/sysds/api/ScriptExecutorUtils.java  |   3 +
 .../runtime/instructions/cp/CPInstruction.java     |  23 ++++
 .../gpu/context/GPULazyCudaFreeMemoryManager.java  |  12 +-
 .../gpu/context/GPUMemoryEviction.java             | 137 ++++++++++++++++++++
 .../instructions/gpu/context/GPUMemoryManager.java | 138 +++++++++++++--------
 .../instructions/gpu/context/GPUObject.java        |   3 +
 .../sysds/runtime/lineage/LineageCacheConfig.java  |   4 +
 .../sysds/runtime/lineage/LineageCacheEntry.java   |   2 +
 .../runtime/lineage/LineageCacheEviction.java      |   5 +-
 .../runtime/lineage/LineageGPUCacheEviction.java   |  41 ++++--
 10 files changed, 296 insertions(+), 72 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
index 970a800..fd344e4 100644
--- a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
@@ -35,6 +35,7 @@ import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageEstimatorStatistics;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.utils.Statistics;
 
 public class ScriptExecutorUtils {
@@ -113,6 +114,8 @@ public class ScriptExecutorUtils {
 					gCtx.clearTemporaryMemory();
 				}
 				GPUContextPool.freeAllGPUContexts();
+				if (LineageGPUCacheEviction.gpuEvictionThread != null)
+					LineageGPUCacheEviction.gpuEvictionThread.shutdown();
 			}
 			if( ConfigurationManager.isCodegenEnabled() )
 				SpoofCompiler.cleanupCodeGenerator();
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
index b2a14ce..014b854 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysds.runtime.instructions.cp;
 
+import java.util.concurrent.Executors;
+
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.lops.Lop;
@@ -29,6 +31,9 @@ import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.CPInstructionParser;
 import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.fed.FEDInstructionUtils;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUMemoryEviction;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.privacy.propagation.PrivacyPropagator;
 
@@ -100,6 +105,17 @@ public abstract class CPInstruction extends Instruction
 
 		tmp = PrivacyPropagator.preprocessInstruction(tmp, ec);
 		
+		//Submit a task to the eviction thread. The stopping criteria are the requested
+		//eviction count and the STOPBACKGROUNDEVICTION flag, which is set to true in the
+		//post-processing phase of the CPU instruction to stop the eviction early.
+		if (!LineageCacheConfig.ReuseCacheType.isNone() && DMLScript.USE_ACCELERATOR
+			&& LineageCacheConfig.CONCURRENTGPUEVICTION && !(tmp instanceof VariableCPInstruction)) {
+			if (LineageGPUCacheEviction.gpuEvictionThread == null)
+				LineageGPUCacheEviction.gpuEvictionThread = Executors.newSingleThreadExecutor();
+			LineageCacheConfig.STOPBACKGROUNDEVICTION = false;
+			LineageGPUCacheEviction.gpuEvictionThread.submit(new GPUMemoryEviction(1));
+		}
+		
 		return tmp;
 	}
 
@@ -130,6 +146,13 @@ public abstract class CPInstruction extends Instruction
 		}
 		return updateInstList.toString();
 	}
+	@Override
+	public void postprocessInstruction(ExecutionContext ec) {
+		//Signal the eviction task to stop, even if it has not yet evicted the given count.
+		if (!LineageCacheConfig.ReuseCacheType.isNone() && DMLScript.USE_ACCELERATOR
+			&& LineageCacheConfig.CONCURRENTGPUEVICTION)
+			LineageCacheConfig.STOPBACKGROUNDEVICTION = true;
+	}
 	
 	/** 
 	 * Replaces ALL placeholder strings (such as ##mVar2## and ##Var5##) in a single instruction.
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
index 9369026..332f115 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -51,7 +51,7 @@ public class GPULazyCudaFreeMemoryManager {
 	 * @param size size in bytes
 	 * @return pointer
 	 */
-	public Pointer getRmvarPointer(String opcode, long size) {
+	synchronized public Pointer getRmvarPointer(String opcode, long size) {
 		if (rmvarGPUPointers.containsKey(size)) {
 			if(LOG.isTraceEnabled())
 				LOG.trace("Getting rmvar-ed pointers for size:" + size);
@@ -65,11 +65,11 @@ public class GPULazyCudaFreeMemoryManager {
 		}
 	}
 	
-	public Set<Pointer> getAllPointers() {
+	synchronized public Set<Pointer> getAllPointers() {
 		return rmvarGPUPointers.values().stream().flatMap(ptrs -> ptrs.stream()).collect(Collectors.toSet());
 	}
 	
-	public void clearAll() {
+	synchronized public void clearAll() {
 		Set<Pointer> toFree = new HashSet<>();
 		for(Set<Pointer> ptrs : rmvarGPUPointers.values()) {
 			toFree.addAll(ptrs);
@@ -80,7 +80,7 @@ public class GPULazyCudaFreeMemoryManager {
 		}
 	}
 	
-	public Pointer getRmvarPointerMinSize(String opcode, long minSize) throws DMLRuntimeException {
+	synchronized public Pointer getRmvarPointerMinSize(String opcode, long minSize) throws DMLRuntimeException {
 		Optional<Long> toClear = rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e -> e.getKey())
 				.filter(size -> size >= minSize).min((s1, s2) -> s1 < s2 ? -1 : 1);
 		if(toClear.isPresent()) {
@@ -145,7 +145,7 @@ public class GPULazyCudaFreeMemoryManager {
 	 * @param size size of the pointer
 	 * @param toFree pointer
 	 */
-	public void add(long size, Pointer toFree) {
+	synchronized public void add(long size, Pointer toFree) {
 		Set<Pointer> freeList = rmvarGPUPointers.get(size);
 		if (freeList == null) {
 			freeList = new HashSet<>();
@@ -162,7 +162,7 @@ public class GPULazyCudaFreeMemoryManager {
 	 * @param size size in bytes
 	 * @param ptr pointer to be removed
 	 */
-	public void removeIfPresent(long size, Pointer ptr) {
+	synchronized public void removeIfPresent(long size, Pointer ptr) {
 		if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(ptr)) {
 			rmvarGPUPointers.get(size).remove(ptr);
 			if (rmvarGPUPointers.get(size).isEmpty())
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
new file mode 100644
index 0000000..5fd1474
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.instructions.gpu.context;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
+import org.apache.sysds.runtime.lineage.LineageCacheEntry;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
+import org.apache.sysds.utils.GPUStatistics;
+
+public class GPUMemoryEviction implements Runnable 
+{
+	int numEvicts = 0;
+	
+	public GPUMemoryEviction(int num) {
+		numEvicts = num;
+	}
+
+	@Override
+	public void run() {
+		//long currentAvailableMemory = allocator.getAvailableMemory();
+		List<LineageCacheEntry> lockedOrLiveEntries = new ArrayList<>();
+		int count = 0;
+
+		// Stop if 1) the requested number of entries is evicted, 2) the parallel
+		// CPU instruction has ended, or 3) no non-live entries are left in the cache.
+		long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+		while (!LineageGPUCacheEviction.isGPUCacheEmpty() && count < numEvicts) 
+		{
+			if (LineageCacheConfig.STOPBACKGROUNDEVICTION)
+				break;
+			
+			LineageCacheEntry le = LineageGPUCacheEviction.pollFirstEntry();
+			GPUObject cachedGpuObj = le.getGPUObject();
+			GPUObject headGpuObj = cachedGpuObj.lineageCachedChainHead != null
+					? cachedGpuObj.lineageCachedChainHead : cachedGpuObj;
+			// Check and continue if any object in the linked list is locked
+			boolean lockedOrLive = false;
+			GPUObject nextgpuObj = headGpuObj;
+			while (nextgpuObj!= null) {
+				if (!nextgpuObj.isrmVarPending() || nextgpuObj.isLocked()) // live or locked
+					lockedOrLive = true;
+				nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+			}
+			if (lockedOrLive) {
+				lockedOrLiveEntries.add(le);
+				continue;
+			}
+
+			// TODO: First remove the gobj chains that don't contain any live and dirty objects.
+			//currentAvailableMemory += headGpuObj.getSizeOnDevice();
+
+			// Copy from device to host for all live and dirty objects
+			boolean copied = false;
+			nextgpuObj = headGpuObj;
+			while (nextgpuObj!= null) {
+				// Keeping isLinCached as True here will save data deletion by copyFromDeviceToHost
+				if (!nextgpuObj.isrmVarPending() && nextgpuObj.isDirty()) { //live and dirty
+					nextgpuObj.copyFromDeviceToHost(null, true, true);
+					copied = true;
+				}
+				nextgpuObj.setIsLinCached(false);
+				nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+			}
+
+			// Copy from device cache to CPU lineage cache if not already copied
+			LineageGPUCacheEviction.copyToHostCache(le, null, copied);
+
+			// For all the other objects, remove and clear data (only once)
+			nextgpuObj = headGpuObj;
+			boolean freed = false;
+			synchronized (nextgpuObj.getGPUContext().getMemoryManager().getGPUMatrixMemoryManager().gpuObjects) {
+
+			while (nextgpuObj!= null) {
+				// If not live or live but not dirty
+				if (nextgpuObj.isrmVarPending() || !nextgpuObj.isDirty()) {
+					if (!freed) {
+						nextgpuObj.clearData(null, true);
+						//FIXME: adding to the rmVar cache causes multiple failures due to concurrent
+						//access to the rmVar cache and other data structures. VariableCP and other
+						//instructions free memory and add to the rmVar cache in parallel to the
+						//background eviction task, which needs to be synchronized.
+						freed = true;
+					}
+					else
+						nextgpuObj.clearGPUObject();
+				}
+				nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+			}
+			}
+			// Clear the GPUObjects chain
+			GPUObject currgpuObj = headGpuObj;
+			while (currgpuObj.nextLineageCachedEntry != null) {
+				nextgpuObj = currgpuObj.nextLineageCachedEntry;
+				currgpuObj.lineageCachedChainHead = null;
+				currgpuObj.nextLineageCachedEntry = null;
+				nextgpuObj.lineageCachedChainHead = null;
+				currgpuObj = nextgpuObj;
+			}
+
+			//if(currentAvailableMemory >= size)
+				// This doesn't guarantee allocation due to fragmented freed memory
+			//	A = cudaMallocNoWarn(tmpA, size, null); 
+			if (DMLScript.STATISTICS) {
+				GPUStatistics.cudaEvictCount.increment();
+			}
+			count++;
+		}
+
+		// Add the locked entries back to the eviction queue
+		if (!lockedOrLiveEntries.isEmpty())
+			LineageGPUCacheEviction.addEntryList(lockedOrLiveEntries);
+		
+		if (DMLScript.STATISTICS) //TODO: dedicated statistics for lineage
+			GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
+	}
+}
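
The run() method above applies a poll-and-defer pattern on the eviction queue: entries whose GPUObject chain is locked or live are set aside and re-added after the loop, so they remain candidates for later eviction attempts. A minimal stand-alone sketch of that pattern, using a hypothetical Evictable type rather than the SystemDS classes:

    import java.util.ArrayList;
    import java.util.Deque;
    import java.util.List;
    import java.util.function.BooleanSupplier;

    class PollDeferSketch {
        interface Evictable {
            boolean isLockedOrLive();
            void evict();
        }

        // evict up to numEvicts entries, deferring locked/live ones
        static void evictSome(Deque<Evictable> queue, int numEvicts, BooleanSupplier stop) {
            List<Evictable> deferred = new ArrayList<>();
            int count = 0;
            while (!queue.isEmpty() && count < numEvicts && !stop.getAsBoolean()) {
                Evictable e = queue.pollFirst();
                if (e.isLockedOrLive()) {
                    deferred.add(e); // skip now, retry in a later pass
                    continue;
                }
                e.evict();
                count++;
            }
            queue.addAll(deferred); // put deferred entries back into the queue
        }
    }
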
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 4e1d577..a9c0a57 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -106,8 +106,10 @@ public class GPUMemoryManager {
 	 * @return either the size or -1 if no such pointer exists
 	 */
 	public long getSizeAllocatedGPUPointer(Pointer ptr) {
-		if(allPointers.containsKey(ptr)) {
-			return allPointers.get(ptr).getSizeInBytes();
+		synchronized(allPointers) {
+			if(allPointers.containsKey(ptr)) {
+				return allPointers.get(ptr).getSizeInBytes();
+			}
 		}
 		return -1;
 	}
@@ -175,7 +177,9 @@ public class GPUMemoryManager {
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		try {
 			allocator.allocate(A, size);
-			allPointers.put(A, new PointerInfo(size));
+			synchronized(allPointers) {
+				allPointers.put(A, new PointerInfo(size));
+			}
 			if(DMLScript.STATISTICS) {
 				long totalTime = System.nanoTime() - t0;
 				GPUStatistics.cudaAllocSuccessTime.add(totalTime);
@@ -286,20 +290,23 @@ public class GPUMemoryManager {
 		// Step 5: Try eviction/clearing exactly one with size restriction
 		if(A == null) {
 			long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
-			Optional<GPUObject> sizeBasedUnlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
-						.filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isLinCached() && matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
-						.min((o1, o2) -> worstCaseContiguousMemorySizeCompare(o1, o2));
-			if(sizeBasedUnlockedGPUObjects.isPresent()) {
-				evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
-				A = cudaMallocNoWarn(tmpA, size, null);
-				if(A == null)
-					LOG.warn("cudaMalloc failed after clearing/evicting based on size.");
-				if(DMLScript.STATISTICS) {
-					long totalTime = System.nanoTime() - t0;
-					GPUStatistics.cudaEvictTime.add(totalTime);
-					GPUStatistics.cudaEvictSizeTime.add(totalTime);
-					GPUStatistics.cudaEvictCount.increment();
-					GPUStatistics.cudaEvictSizeCount.increment();
+			synchronized (matrixMemoryManager.gpuObjects) {
+				Optional<GPUObject> sizeBasedUnlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
+							.filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isLinCached() 
+							&& matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
+							.min((o1, o2) -> worstCaseContiguousMemorySizeCompare(o1, o2));
+				if(sizeBasedUnlockedGPUObjects.isPresent()) {
+					evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
+					A = cudaMallocNoWarn(tmpA, size, null);
+					if(A == null)
+						LOG.warn("cudaMalloc failed after clearing/evicting based on size.");
+					if(DMLScript.STATISTICS) {
+						long totalTime = System.nanoTime() - t0;
+						GPUStatistics.cudaEvictTime.add(totalTime);
+						GPUStatistics.cudaEvictSizeTime.add(totalTime);
+						GPUStatistics.cudaEvictCount.increment();
+						GPUStatistics.cudaEvictSizeCount.increment();
+					}
 				}
 			}
 		}
@@ -310,6 +317,7 @@ public class GPUMemoryManager {
 		if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
 			long currentAvailableMemory = allocator.getAvailableMemory();
 			List<LineageCacheEntry> lockedEntries = new ArrayList<>();
+			long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
 			while (A == null && !LineageGPUCacheEviction.isGPUCacheEmpty()) {
 				LineageCacheEntry le = LineageGPUCacheEviction.pollFirstEntry();
 				GPUObject cachedGpuObj = le.getGPUObject();
@@ -330,17 +338,25 @@ public class GPUMemoryManager {
 
 				// TODO: First remove the gobj chains that don't contain any live and dirty objects.
 				currentAvailableMemory += headGpuObj.getSizeOnDevice();
-				LineageGPUCacheEviction.copyToHostCache(le, opcode, true);
 
 				// Copy from device to host for all live and dirty objects
+				boolean copied = false;
 				nextgpuObj = headGpuObj;
 				while (nextgpuObj!= null) {
 					// Keeping isLinCached as True here will save data deletion by copyFromDeviceToHost
-					if (!nextgpuObj.isrmVarPending() && nextgpuObj.isDirty()) //live and dirty
+					if (!nextgpuObj.isrmVarPending() && nextgpuObj.isDirty()) { //live and dirty
 						nextgpuObj.copyFromDeviceToHost(opcode, true, true);
+						copied = true;
+					}
 					nextgpuObj.setIsLinCached(false);
 					nextgpuObj = nextgpuObj.nextLineageCachedEntry;
 				}
+
+				// Copy from device cache to CPU lineage cache if not already copied
+				LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
+				if (DMLScript.STATISTICS)
+					GPUStatistics.cudaEvictCount.increment();
+
 				// For all the other objects, remove and clear data (only once)
 				nextgpuObj = headGpuObj;
 				boolean freed = false;
@@ -374,6 +390,8 @@ public class GPUMemoryManager {
 			// Add the locked entries back to the eviction queue
 			if (!lockedEntries.isEmpty())
 				LineageGPUCacheEviction.addEntryList(lockedEntries);
+			if (DMLScript.STATISTICS) //TODO: dedicated statistics for lineage
+				GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
 
 			if (A == null)
 				LOG.warn("cudaMalloc failed after Lineage GPU cache eviction.");
@@ -386,24 +404,26 @@ public class GPUMemoryManager {
 			boolean canFit = false;
 			// ---------------------------------------------------------------
 			// Evict unlocked GPU objects one-by-one and try malloc
-			List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
+			synchronized(matrixMemoryManager.gpuObjects) {
+				List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
 						.filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isLinCached()).collect(Collectors.toList());
-			Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
-			while(A == null && unlockedGPUObjects.size() > 0) {
-				GPUObject evictedGPUObject = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
-				evictOrClear(evictedGPUObject, opcode);
-				if(!canFit) {
-					currentAvailableMemory += evictedGPUObject.getSizeOnDevice();
-					if(currentAvailableMemory >= size)
-						canFit = true;
-				}
-				if(canFit) {
-					// Checking before invoking cudaMalloc reduces the time spent in unnecessary cudaMalloc.
-					// This was the bottleneck for ResNet200 experiments with batch size > 32 on P100+Intel
-					A = cudaMallocNoWarn(tmpA, size, null); 
+				Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
+				while(A == null && unlockedGPUObjects.size() > 0) {
+					GPUObject evictedGPUObject = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+					evictOrClear(evictedGPUObject, opcode);
+					if(!canFit) {
+						currentAvailableMemory += evictedGPUObject.getSizeOnDevice();
+						if(currentAvailableMemory >= size)
+							canFit = true;
+					}
+					if(canFit) {
+						// Checking before invoking cudaMalloc reduces the time spent in unnecessary cudaMalloc.
+						// This was the bottleneck for ResNet200 experiments with batch size > 32 on P100+Intel
+						A = cudaMallocNoWarn(tmpA, size, null); 
+					}
+					if(DMLScript.STATISTICS) 
+						GPUStatistics.cudaEvictCount.increment();
 				}
-				if(DMLScript.STATISTICS) 
-					GPUStatistics.cudaEvictCount.increment();
 			}
 			if(DMLScript.STATISTICS) {
 				long totalTime = System.nanoTime() - t0;
@@ -477,19 +497,22 @@ public class GPUMemoryManager {
 	 * @param toFree pointer to call cudaFree method on
 	 */
 	void guardedCudaFree(Pointer toFree) {
-		if(allPointers.containsKey(toFree)) {
-			long size = allPointers.get(toFree).getSizeInBytes();
-			if(LOG.isTraceEnabled()) {
-				LOG.trace("Free-ing up the pointer of size " +  byteCountToDisplaySize(size));
+		synchronized(allPointers) {
+			if(allPointers.containsKey(toFree)) {
+				long size = allPointers.get(toFree).getSizeInBytes();
+				if(LOG.isTraceEnabled()) {
+					LOG.trace("Free-ing up the pointer of size " +  byteCountToDisplaySize(size));
+				}
+				allPointers.remove(toFree);
+				lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
+				allocator.free(toFree);
+				if(DMLScript.SYNCHRONIZE_GPU)
+					// Force a device synchronize after free-ing the pointer for debugging
+					jcuda.runtime.JCuda.cudaDeviceSynchronize(); 
+			}
+			else {
+				throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
 			}
-			allPointers.remove(toFree);
-			lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
-			allocator.free(toFree);
-			if(DMLScript.SYNCHRONIZE_GPU)
-				jcuda.runtime.JCuda.cudaDeviceSynchronize(); // Force a device synchronize after free-ing the pointer for debugging
-		}
-		else {
-			throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
 		}
 
 	}
@@ -511,11 +534,14 @@ public class GPUMemoryManager {
 			addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
 		}
 		else {
-			if (!allPointers.containsKey(toFree)) {
-				LOG.info("GPU memory info before failure:" + toString());
-				throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up");
+			long size = 0;
+			synchronized(allPointers) {
+				if (!allPointers.containsKey(toFree)) {
+					LOG.info("GPU memory info before failure:" + toString());
+					throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up");
+				}
+				size = allPointers.get(toFree).getSizeInBytes();
 			}
-			long size = allPointers.get(toFree).getSizeInBytes();
 			lazyCudaFreeMemoryManager.add(size, toFree);
 		}
 	}
@@ -554,11 +580,13 @@ public class GPUMemoryManager {
 		matrixMemoryManager.gpuObjects.clear();
 		
 		// Then clean up remaining allocated GPU pointers 
-		Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet());
-		for(Pointer toFree : remainingPtr) {
-			guardedCudaFree(toFree); // cleans up allocatedGPUPointers and rmvarGPUPointers as well
+		synchronized(allPointers) {
+			Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet());
+			for(Pointer toFree : remainingPtr) {
+				guardedCudaFree(toFree); // cleans up allocatedGPUPointers and rmvarGPUPointers as well
+			}
+			allPointers.clear();
 		}
-		allPointers.clear();
 	}
 		
 	/**
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index d346d56..291cb07 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -155,6 +155,9 @@ public class GPUObject {
 		shadowBuffer.clearShadowPointer();
 	}
 	
+	public MatrixObject getMatrixObject() {
+		return mat;
+	}
 	
 	/**
 	 * Convenience method to directly set the dense matrix pointer on GPU
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 98d7c08..b7b66a7 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -95,6 +95,8 @@ public class LineageCacheConfig
 	public static double FSREAD_SPARSE = 400;
 	public static double FSWRITE_DENSE = 450;
 	public static double FSWRITE_SPARSE = 225;
+	public static double D2HCOPY = 1500;
+	public static double D2HMAXBANDWIDTH = 8192;
 	
 	private enum CachedItemHead {
 		TSMM,
@@ -113,6 +115,8 @@ public class LineageCacheConfig
 	private static LineageCachePolicy _cachepolicy = null;
 	// Weights for scoring components (computeTime/size, LRU timestamp, DAG height)
 	protected static double[] WEIGHTS = {1, 0, 0};
+	public static boolean CONCURRENTGPUEVICTION = false;
+	public static volatile boolean STOPBACKGROUNDEVICTION = false;
 
 	protected enum LineageCacheStatus {
 		EMPTY,     //Placeholder with no data. Cannot be evicted.
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 1a32e4b..ea99cf8 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -126,6 +126,7 @@ public class LineageCacheEntry {
 	
 	public synchronized void setValue(MatrixBlock val, long computetime) {
 		_MBval = val;
+		_gpuObject = null;  //Matrix block and gpu object cannot coexist
 		_computeTime = computetime;
 		_status = isNullVal() ? LineageCacheStatus.EMPTY : LineageCacheStatus.CACHED;
 		//resume all threads waiting for val
@@ -138,6 +139,7 @@ public class LineageCacheEntry {
 
 	public synchronized void setValue(ScalarObject val, long computetime) {
 		_SOval = val;
+		_gpuObject = null;  //scalar and gpu object cannot coexist
 		_computeTime = computetime;
 		_status = isNullVal() ? LineageCacheStatus.EMPTY : LineageCacheStatus.CACHED;
 		//resume all threads waiting for val
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
index 70a08d9..3fca108 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
@@ -238,7 +238,7 @@ public class LineageCacheEviction
 			}
 
 			if (spilltime < LineageCacheConfig.MIN_SPILL_TIME_ESTIMATE) {
-				// Can't trust the estimate if less than 100ms.
+				// Can't trust the estimate if less than 10ms.
 				// Spill if it takes longer to recompute.
 				removeOrSpillEntry(cache, e, //spill or delete
 					exectime >= LineageCacheConfig.MIN_SPILL_TIME_ESTIMATE);
@@ -290,7 +290,7 @@ public class LineageCacheEviction
 			return; 
 		
 		double newIOSpeed = size / IOtime; // MB per second 
-		// Adjust the read/write speed taking into account the last read/write.
+		// Adjust the read/write speed using exponential smoothing (alpha = 0.5)
 		// These constants will eventually converge to the real speed.
 		if (read) {
 			if (isSparse(e))
@@ -304,6 +304,7 @@ public class LineageCacheEviction
 			else
 				LineageCacheConfig.FSWRITE_DENSE= (LineageCacheConfig.FSWRITE_DENSE+ newIOSpeed) / 2;
 		}
+		// TODO: exponential smoothing with arbitrary smoothing factor
 	}
 	
 	private static boolean isSparse(LineageCacheEntry e) {
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 156147b..412db39 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -21,6 +21,7 @@ package org.apache.sysds.runtime.lineage;
 
 import java.util.List;
 import java.util.TreeSet;
+import java.util.concurrent.ExecutorService;
 
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
@@ -31,6 +32,7 @@ public class LineageGPUCacheEviction
 	private static long _currentCacheSize = 0;
 	private static long GPU_CACHE_LIMIT; //limit in bytes
 	private static long _startTimestamp = 0;
+	public static ExecutorService gpuEvictionThread = null;
 	private static TreeSet<LineageCacheEntry> weightedQueue = new TreeSet<>(LineageCacheConfig.LineageCacheComparator);
 
 	protected static void resetEviction() {
@@ -40,6 +42,8 @@ public class LineageGPUCacheEviction
 			e._gpuObject.clearData(null, true);
 		}
 		_currentCacheSize = 0;
+		gpuEvictionThread = null;
+		//LineageCacheConfig.CONCURRENTGPUEVICTION = false;
 		weightedQueue.clear();
 	}
 
@@ -59,6 +63,19 @@ public class LineageGPUCacheEviction
 	protected static long getStartTimestamp() {
 		return _startTimestamp;
 	}
+	
+	private static void adjustD2HTransferSpeed(double sizeByte, double copyTime) {
+		double sizeMB = sizeByte / (1024*1024);
+		double newTSpeed = sizeMB / copyTime;  //bandwidth (MB/sec) + java overhead
+
+		// FIXME: A D2H copy lazily executes previous kernels
+		if (newTSpeed > LineageCacheConfig.D2HMAXBANDWIDTH)
+			return;  //filter out erroneous measurements (~ >8GB/sec)
+		// Perform exponential smoothing.
+		double smFactor = 0.5;  //smoothing factor
+		LineageCacheConfig.D2HCOPY = (smFactor * newTSpeed) + ((1-smFactor) * LineageCacheConfig.D2HCOPY);
+		//System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed + " estimate_t+1: "+LineageCacheConfig.D2HCOPY);
+	}
 
 	//--------------- CACHE MAINTENANCE & LOOKUP FUNCTIONS --------------//
 
@@ -111,19 +128,25 @@ public class LineageGPUCacheEviction
 		return GPU_CACHE_LIMIT;
 	}
 	
-	public static void copyToHostCache(LineageCacheEntry entry, String instName, boolean eagerDelete) {
+	public static void copyToHostCache(LineageCacheEntry entry, String instName, boolean alreadyCopied) {
 		// TODO: move to the shadow buffer. Convert to double precision only when reused.
-		// FIXME: Remove double copying (copyFromDeviceToHost, evictFromDeviceToHostMB)
-		MatrixBlock mb = entry._gpuObject.evictFromDeviceToHostMB(instName, eagerDelete);
+		long t0 = System.nanoTime();
+		MatrixBlock mb = alreadyCopied ? entry._gpuObject.getMatrixObject().acquireReadAndRelease()
+				: entry._gpuObject.evictFromDeviceToHostMB(instName, false);
+		long t1 = System.nanoTime();
+		adjustD2HTransferSpeed(((double)entry._gpuObject.getSizeOnDevice()), ((double)(t1-t0))/1000000000);
 		long size = mb.getInMemorySize();
-		// make space in the host memory for the data
-		if (!LineageCacheEviction.isBelowThreshold(size))
-			LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+		// make space in the host memory for the data TODO: synchronize
+		if (!LineageCacheEviction.isBelowThreshold(size)) {
+			synchronized (LineageCache.getLineageCache()) {
+				LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+			}
+		}
+		// FIXME: updateSize outside of synchronized is problematic, but eliminates waiting for background eviction
 		LineageCacheEviction.updateSize(size, true);
-		// place the data
+		// place the data and set gpu object to null in the cache entry
 		entry.setValue(mb);
-		entry._gpuObject = null;
-		// maintain order for eviction of host cache
+		// maintain order for eviction of host cache. FIXME: synchronize
 		LineageCacheEviction.addEntry(entry);
 		// manage space in gpu cache
 		updateSize(size, false);
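
The bandwidth estimate maintained by adjustD2HTransferSpeed above is simple exponential smoothing with a smoothing factor of 0.5: estimate_next = 0.5 * measured + 0.5 * estimate, so the predicted DtoH bandwidth follows recent measurements while damping outliers. A minimal stand-alone sketch of the same idea (the initial value and cutoff mirror D2HCOPY and D2HMAXBANDWIDTH in this patch, but the class itself is hypothetical):

    class D2HBandwidthSketch {
        static double estimateMBps = 1500;   // initial guess (cf. D2HCOPY)
        static final double MAX_MBPS = 8192; // plausibility cutoff (cf. D2HMAXBANDWIDTH)
        static final double ALPHA = 0.5;     // smoothing factor

        // update the estimate from one measured device-to-host copy
        static void update(long sizeBytes, double copySeconds) {
            double measuredMBps = (sizeBytes / (1024.0 * 1024.0)) / copySeconds;
            if (measuredMBps > MAX_MBPS)
                return; // measurements above the PCIe3 x16 peak are treated as erroneous
            estimateMBps = ALPHA * measuredMBps + (1 - ALPHA) * estimateMBps;
        }

        public static void main(String[] args) {
            update(256L << 20, 0.05); // 256 MB copied in 50 ms => 5120 MB/s measured
            System.out.println("predicted DtoH bandwidth (MB/s): " + estimateMBps);
        }
    }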