You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/14 20:24:10 UTC
systemml git commit: [MINOR] Refactored the locks to seperate out read and write lock

Repository: systemml
Updated Branches:
  refs/heads/master 0a984a43b -> c6d499d3e


[MINOR] Refactored the locks to seperate out read and write lock

- Refactoring the locks will avoid future bugs where the developer tries
  to obtain 2 write lock or a read lock on a write-locked objects, etc.
- I have also added a debugging utility to track potential memory leaks.

Closes #664.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c6d499d3
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c6d499d3
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c6d499d3

Branch: refs/heads/master
Commit: c6d499d3e27a1842ccf5987ab84c92eee72aa5c2
Parents: 0a984a4
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 14 13:20:46 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 14 13:23:14 2017 -0700

----------------------------------------------------------------------
 .../context/ExecutionContext.java               |  2 +-
 .../instructions/gpu/GPUInstruction.java        | 12 +++
 .../instructions/gpu/context/GPUContext.java    | 52 ++++++++++--
 .../instructions/gpu/context/GPUObject.java     | 84 ++++++++++++++------
 4 files changed, 117 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index b74c0dd..3b2436e 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -374,7 +374,7 @@ public class ExecutionContext {
 		}
 		// The lock is added here for an output block
 		// so that any block currently in use is not deallocated by eviction on the GPU
-		mo.getGPUObject(getGPUContext(0)).addLock();
+		mo.getGPUObject(getGPUContext(0)).addWriteLock();
 		return mo;
 	}
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 9a6a3bb..2aa73b4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.runtime.instructions.gpu;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -26,6 +28,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.GPUInstructionParser;
 import org.apache.sysml.runtime.instructions.Instruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
@@ -46,6 +49,8 @@ public abstract class GPUInstruction extends Instruction {
 		Builtin,
 		MatrixIndexing
 	};
+	
+	private static final Log LOG = LogFactory.getLog(GPUInstruction.class.getName());
 
 	// Memory/conversions
 	public final static String MISC_TIMER_HOST_TO_DEVICE =          "H2D";	// time spent in bringing data to gpu (from host)
@@ -191,6 +196,13 @@ public abstract class GPUInstruction extends Instruction {
 		if(DMLScript.SYNCHRONIZE_GPU) {
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 		}
+		if(LOG.isDebugEnabled()) {
+			for(GPUContext gpuCtx : ec.getGPUContexts()) {
+				if(gpuCtx != null)
+					gpuCtx.printMemoryInfo(getOpcode());
+			}
+		}
+			
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index a31deab..271109d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -43,6 +43,7 @@ import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -130,6 +131,40 @@ public class GPUContext {
 	 * to launch custom CUDA kernel, specific to the active GPU for this GPUContext
 	 */
 	private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>();
+	
+	/**
+	 * Print information of memory usage. 
+	 * 
+	 * @param opcode opcode of caller
+	 * @throws DMLRuntimeException if error 
+	 */
+	public void printMemoryInfo(String opcode) throws DMLRuntimeException {
+		if(LOG.isDebugEnabled()) {
+			long totalFreeCUDASpace = 0;
+			for(Entry<Long, LinkedList<Pointer>> kv : freeCUDASpaceMap.entrySet()) {
+				totalFreeCUDASpace += kv.getKey()*kv.getValue().size();
+			}
+			long readLockedAllocatedMemory = 0;
+			long writeLockedAllocatedMemory = 0;
+			long unlockedAllocatedMemory = 0;
+			for(GPUObject gpuObj : allocatedGPUObjects) {
+				if(gpuObj.readLocks.longValue() > 0)
+					readLockedAllocatedMemory += gpuObj.getSizeOnDevice();
+				else if(gpuObj.writeLock)
+					writeLockedAllocatedMemory += gpuObj.getSizeOnDevice();
+				else
+					unlockedAllocatedMemory += gpuObj.getSizeOnDevice();
+			}
+			long free[] = { 0 };
+			long total[] = { 0 };
+			cudaMemGetInfo(free, total);
+			long gpuFreeMemory =  (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+			LOG.debug(opcode + ": Total memory: " + total[0] + ", Free memory: " + free[0] + " (with util factor: " + gpuFreeMemory + "), "
+					+ "Lazy unfreed memory: " + totalFreeCUDASpace + ", Locked allocated memory (read/write): " 
+					+ readLockedAllocatedMemory + "/" + writeLockedAllocatedMemory + ", "
+					+ " Unlocked allocated memory: " + unlockedAllocatedMemory);
+		}
+	}
 
 	protected GPUContext(int deviceNum) throws DMLRuntimeException {
 		this.deviceNum = deviceNum;
@@ -472,18 +507,19 @@ public class GPUContext {
 		Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
 			@Override
 			public int compare(GPUObject p1, GPUObject p2) {
-				long p1Val = p1.locks.get();
-				long p2Val = p2.locks.get();
-
-				if (p1Val > 0 && p2Val > 0) {
+				if (p1.isLocked() && p2.isLocked()) {
 					// Both are locked, so don't sort
 					return 0;
-				} else if (p1Val > 0 || p2Val > 0) {
+				} else if (p1.isLocked()) {
 					// Put the unlocked one to RHS
-					return Long.compare(p2Val, p1Val);
+					// a value less than 0 if x < y; and a value greater than 0 if x > y
+					return -1;
+				} else if (p2.isLocked()) {
+					// Put the unlocked one to RHS
+					// a value less than 0 if x < y; and a value greater than 0 if x > y
+					return 1;
 				} else {
 					// Both are unlocked
-
 					if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
 						long p1Size = 0;
 						long p2Size = 0;
@@ -510,7 +546,7 @@ public class GPUContext {
 
 		while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
 			GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-			if (toBeRemoved.locks.get() > 0) {
+			if (toBeRemoved.isLocked()) {
 				throw new DMLRuntimeException(
 						"There is not enough memory on device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + allocatedGPUObjects.toString());
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 2642011..4bc983e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -34,6 +34,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.LongAdder;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -99,9 +100,14 @@ public class GPUObject {
 	protected boolean dirty = false;
 
 	/**
-	 * number of read/write locks on this object (this GPUObject is being used in a current instruction)
+	 * number of read locks on this object (this GPUObject is being used in a current instruction)
 	 */
-	protected AtomicLong locks = new AtomicLong();
+	protected LongAdder readLocks = new LongAdder();
+	
+	/**
+	 * whether write lock on this object (this GPUObject is being used in a current instruction)
+	 */
+	protected boolean writeLock = false;
 
 	/**
 	 * Timestamp, needed by {@link GPUContext#evict(long)}
@@ -132,7 +138,11 @@ public class GPUObject {
 			that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
 		}
 		that.dirty = me.dirty;
-		that.locks = new AtomicLong(me.locks.get());
+		// TODO Nakul: Should the locks be cloned here ?
+		// The only place clone is getting called: LibMatrixCUDA's solve
+		that.readLocks.reset();
+		that.writeLock = false;
+		
 		that.timestamp = new AtomicLong(me.timestamp.get());
 		that.isSparse = me.isSparse;
 
@@ -618,7 +628,7 @@ public class GPUObject {
 			copyFromHostToDevice(opcode);
 			transferred = true;
 		}
-		addLock();
+		addReadLock();
 		if (!isAllocated())
 			throw new DMLRuntimeException("Expected device data to be allocated");
 		return transferred;
@@ -664,10 +674,6 @@ public class GPUObject {
 		return allocated;
 	}
 
-	public void addLock() {
-		locks.addAndGet(1);
-	}
-
 	/**
 	 * if the data is allocated on the GPU and is dirty, it is copied back to the host memory
 	 *
@@ -693,22 +699,51 @@ public class GPUObject {
 		}
 		return copied;
 	}
+	
+	public boolean isLocked() {
+		return writeLock || readLocks.longValue() > 0;
+	}
+	
+	public void addReadLock() throws DMLRuntimeException {
+		if(writeLock)
+			throw new DMLRuntimeException("Attempting to add a read lock when writeLock="+ writeLock);
+		else
+			readLocks.increment();
+	}
+	
+	public void addWriteLock() throws DMLRuntimeException {
+		if(readLocks.longValue() > 0)
+			throw new DMLRuntimeException("Attempting to add a write lock when readLocks="+ readLocks.longValue());
+		else if(writeLock)
+			throw new DMLRuntimeException("Attempting to add a write lock when writeLock="+ writeLock);
+		else
+			writeLock = true;
+	}
+	
+	public void releaseReadLock() throws DMLRuntimeException {
+		readLocks.decrement();
+		if(readLocks.longValue() < 0)
+			throw new DMLRuntimeException("Attempting to release a read lock when readLocks="+ readLocks.longValue());
+	}
+	
+	public void releaseWriteLock() throws DMLRuntimeException {
+		if(writeLock)
+			writeLock = false;
+		else
+			throw new DMLRuntimeException("Internal state error : Attempting to release write lock on a GPUObject, which was already released");
+	}
+	
+	public void resetReadWriteLock() {
+		readLocks.reset();
+		writeLock = false;
+	}
 
 	/**
 	 * Updates the locks depending on the eviction policy selected
 	 *
 	 * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
-	private void updateReleaseLocks(int l) throws DMLRuntimeException {
-		int newLocks = (int) locks.addAndGet(l);
-		if (newLocks < 0) {
-			throw new CacheException("Internal state error : Invalid number of locks on a GPUObject");
-		}
-
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext="
-				+ getGPUContext());
-		}
+	private void updateReleaseLocks() throws DMLRuntimeException {
 		GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
 		switch (evictionPolicy) {
 		case LRU:
@@ -730,8 +765,8 @@ public class GPUObject {
 	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
 	public void releaseInput() throws DMLRuntimeException {
-		// A read lock is a positive quantity, therefor when the lock is freed, a negative 1 is added
-		updateReleaseLocks(-1);
+		releaseReadLock();
+		updateReleaseLocks();
 		if (!isAllocated())
 			throw new CacheException("Attempting to release an input before allocating it");
 	}
@@ -742,8 +777,8 @@ public class GPUObject {
 	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
 	public void releaseOutput() throws DMLRuntimeException {
-		// A write lock is a negative quantity, therefore when the lock is freed, a positive number is added
-		updateReleaseLocks(1);
+		releaseWriteLock();
+		updateReleaseLocks();
 		dirty = true;
 		if (!isAllocated())
 			throw new CacheException("Attempting to release an output before allocating it");
@@ -798,7 +833,7 @@ public class GPUObject {
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 			tensorDescriptor = null;
 		}
-		locks.set(0);
+		resetReadWriteLock();
 		getGPUContext().removeRecordedUsage(this);
 	}
 
@@ -1061,7 +1096,8 @@ public class GPUObject {
 		final StringBuilder sb = new StringBuilder("GPUObject{");
 		sb.append(", tensorShape=").append(Arrays.toString(tensorShape));
 		sb.append(", dirty=").append(dirty);
-		sb.append(", locks=").append(locks);
+		sb.append(", readLocks=").append(readLocks.longValue());
+		sb.append(", writeLock=").append(writeLock);
 		sb.append(", sparse? ").append(isSparse);
 		sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
 		sb.append('}');