You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ar...@apache.org on 2021/05/23 08:52:00 UTC

[systemds] branch master updated: [SYSTEMDS-2980] Add statistics for lineage cache in GPU

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new dd2a876  [SYSTEMDS-2980] Add statistics for lineage cache in GPU
dd2a876 is described below

commit dd2a8767e924cb33a0a4ca1060f2f36ebd9418e6
Author: arnabp <ar...@tugraz.at>
AuthorDate: Sun May 23 10:36:58 2021 +0200

    [SYSTEMDS-2980] Add statistics for lineage cache in GPU
    
    This patch adds a initial set of statistics for reuse
    and eviction of GPU intermediates.
    e.g. LinCache GPU (Hit/Async/Sync): 	38/26/25
---
 .../gpu/context/GPUMemoryEviction.java             |  3 +-
 .../instructions/gpu/context/GPUMemoryManager.java |  3 +-
 .../apache/sysds/runtime/lineage/LineageCache.java | 12 +++++--
 .../runtime/lineage/LineageCacheStatistics.java    | 41 +++++++++++++++++++---
 .../java/org/apache/sysds/utils/Statistics.java    |  1 +
 src/test/java/org/apache/sysds/test/TestUtils.java |  8 +++--
 .../test/functions/lineage/GPUFullReuseTest.java   |  1 +
 7 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
index 5fd1474..cb7787c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -25,6 +25,7 @@ import java.util.List;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig;
 import org.apache.sysds.runtime.lineage.LineageCacheEntry;
+import org.apache.sysds.runtime.lineage.LineageCacheStatistics;
 import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.utils.GPUStatistics;
 
@@ -122,7 +123,7 @@ public class GPUMemoryEviction implements Runnable
 				// This doesn't guarantee allocation due to fragmented freed memory
 			//	A = cudaMallocNoWarn(tmpA, size, null); 
 			if (DMLScript.STATISTICS) {
-				GPUStatistics.cudaEvictCount.increment();
+				LineageCacheStatistics.incrementGpuAsyncEvicts();
 			}
 			count++;
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index a9c0a57..7df6214 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -43,6 +43,7 @@ import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig;
 import org.apache.sysds.runtime.lineage.LineageCacheEntry;
+import org.apache.sysds.runtime.lineage.LineageCacheStatistics;
 import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.utils.GPUStatistics;
 
@@ -355,7 +356,7 @@ public class GPUMemoryManager {
 				// Copy from device cache to CPU lineage cache if not already copied
 				LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
 				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaEvictCount.increment();
+					LineageCacheStatistics.incrementGpuSyncEvicts();
 
 				// For all the other objects, remove and clear data (only once)
 				nextgpuObj = headGpuObj;
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index f908967..b366edb 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -142,6 +142,7 @@ public class LineageCache
 			reuse = reuseAll;
 			
 			if(reuse) { //reuse
+				boolean gpuReuse = false;
 				//put reuse value into symbol table (w/ blocking on placeholders)
 				for (MutablePair<LineageItem, LineageCacheEntry> entry : liList) {
 					e = entry.getValue();
@@ -174,8 +175,9 @@ public class LineageCache
 						//shallow copy the cached GPUObj to the output MatrixObject
 						ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), 
 								ec.getGPUContext(0).shallowCopyGPUObject(e._gpuObject, ec.getMatrixObject(outName)));
-						//Set dirty to true, so that it is later copied to the host
+						//Set dirty to true, so that it is later copied to the host for write
 						ec.getMatrixObject(outName).getGPUObject(ec.getGPUContext(0)).setDirty(true);
+						gpuReuse = true;
 					}
 
 					reuse = true;
@@ -183,8 +185,12 @@ public class LineageCache
 					if (DMLScript.STATISTICS) //increment saved time
 						LineageCacheStatistics.incrementSavedComputeTime(e._computeTime);
 				}
-				if (DMLScript.STATISTICS)
-					LineageCacheStatistics.incrementInstHits();
+				if (DMLScript.STATISTICS) {
+					if (gpuReuse)
+						LineageCacheStatistics.incrementGpuHits();
+					else
+						LineageCacheStatistics.incrementInstHits();
+				}
 			}
 		}
 		
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
index a4cd041..3382365 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
@@ -36,10 +36,15 @@ public class LineageCacheStatistics {
 	private static final LongAdder _numWritesFS     = new LongAdder();
 	private static final LongAdder _numMemDel       = new LongAdder();
 	private static final LongAdder _numRewrites     = new LongAdder();
-	private static final LongAdder _ctimeFSRead     = new LongAdder(); //in nano sec
-	private static final LongAdder _ctimeFSWrite    = new LongAdder(); //in nano sec
-	private static final LongAdder _ctimeSaved      = new LongAdder(); //in nano sec
-	private static final LongAdder _ctimeMissed     = new LongAdder(); //in nano sec
+	// All the time measurements are in nanoseconds
+	private static final LongAdder _ctimeFSRead     = new LongAdder();
+	private static final LongAdder _ctimeFSWrite    = new LongAdder();
+	private static final LongAdder _ctimeSaved      = new LongAdder();
+	private static final LongAdder _ctimeMissed     = new LongAdder();
+	// Bellow entries are for specific to gpu lineage cache
+	private static final LongAdder _numHitsGpu      = new LongAdder();
+	private static final LongAdder _numAsyncEvictGpu= new LongAdder();
+	private static final LongAdder _numSyncEvictGpu = new LongAdder();
 
 	public static void reset() {
 		_numHitsMem.reset();
@@ -56,6 +61,9 @@ public class LineageCacheStatistics {
 		_ctimeFSWrite.reset();
 		_ctimeSaved.reset();
 		_ctimeMissed.reset();
+		_numHitsGpu.reset();
+		_numAsyncEvictGpu.reset();
+		_numSyncEvictGpu.reset();
 	}
 	
 	public static void incrementMemHits() {
@@ -146,6 +154,21 @@ public class LineageCacheStatistics {
 		return _numHitsSB.longValue();
 	}
 
+	public static void incrementGpuHits() {
+		// Number of times single instruction results are reused in the gpu.
+		_numHitsGpu.increment();
+	}
+
+	public static void incrementGpuAsyncEvicts() {
+		// Number of gpu cache entries moved to cpu cache via the background thread
+		_numAsyncEvictGpu.increment();
+	}
+
+	public static void incrementGpuSyncEvicts() {
+		// Number of gpu cache entries moved to cpu cache during malloc 
+		_numSyncEvictGpu.increment();
+	}
+
 	public static String displayHits() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(_numHitsMem.longValue());
@@ -196,4 +219,14 @@ public class LineageCacheStatistics {
 		sb.append(String.format("%.3f", ((double)_ctimeMissed.longValue())/1000000000)); //in sec
 		return sb.toString();
 	}
+
+	public static String displayGpuStats() {
+		StringBuilder sb = new StringBuilder();
+		sb.append(_numHitsGpu.longValue());
+		sb.append("/");
+		sb.append(_numAsyncEvictGpu.longValue());
+		sb.append("/");
+		sb.append(_numSyncEvictGpu.longValue());
+		return sb.toString();
+	}
 }
diff --git a/src/main/java/org/apache/sysds/utils/Statistics.java b/src/main/java/org/apache/sysds/utils/Statistics.java
index a76db81..d4247a7 100644
--- a/src/main/java/org/apache/sysds/utils/Statistics.java
+++ b/src/main/java/org/apache/sysds/utils/Statistics.java
@@ -1024,6 +1024,7 @@ public class Statistics
 			if (DMLScript.LINEAGE && !ReuseCacheType.isNone()) {
 				sb.append("LinCache hits (Mem/FS/Del): \t" + LineageCacheStatistics.displayHits() + ".\n");
 				sb.append("LinCache MultiLevel (Ins/SB/Fn):" + LineageCacheStatistics.displayMultiLevelHits() + ".\n");
+				sb.append("LinCache GPU (Hit/Async/Sync): \t" + LineageCacheStatistics.displayGpuStats() + ".\n");
 				sb.append("LinCache writes (Mem/FS/Del): \t" + LineageCacheStatistics.displayWtrites() + ".\n");
 				sb.append("LinCache FStimes (Rd/Wr): \t" + LineageCacheStatistics.displayFSTime() + " sec.\n");
 				sb.append("LinCache Computetime (S/M): \t" + LineageCacheStatistics.displayComputeTime() + " sec.\n");
diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java
index 18eb735..f0a9c5c 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -78,7 +78,7 @@ import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.runtime.util.UtilFunctions;
 import org.junit.Assert;
 
-import jcuda.runtime.JCuda;
+//import jcuda.runtime.JCuda;
 
 
 /**
@@ -3063,7 +3063,9 @@ public class TestUtils
 	
 	public static int isGPUAvailable() {
 		// returns cudaSuccess if at least one gpu is available
-		final int[] deviceCount = new int[1];
-		return JCuda.cudaGetDeviceCount(deviceCount);
+		//final int[] deviceCount = new int[1];
+		//return JCuda.cudaGetDeviceCount(deviceCount);
+		// FIXME: Fails to skip if gpu available but no libraries
+		return 1; //return false for now
 	}
 }
diff --git a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 4c08a65..3d16c70 100644
--- a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -44,6 +44,7 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 	@BeforeClass
 	public static void checkGPU() {
 		// Skip all the tests if no GPU is available
+		// FIXME: Fails to skip if gpu available but no libraries
 		Assume.assumeTrue(TestUtils.isGPUAvailable() == cudaError.cudaSuccess);
 	}