Posted to commits@systemml.apache.org by de...@apache.org on 2016/11/17 22:38:54 UTC

incubator-systemml git commit: [SYSTEMML-446] Miscellaneous bug fixes to SYSTEMML-446

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1164f99be -> 3dace3869


[SYSTEMML-446] Miscellaneous bug fixes to SYSTEMML-446

Closes #289.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3dace386
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3dace386
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3dace386

Branch: refs/heads/master
Commit: 3dace3869e8216d82f9990be438cfe9e76deb333
Parents: 1164f99
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu Nov 17 14:33:32 2016 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Thu Nov 17 14:33:32 2016 -0800

----------------------------------------------------------------------
 .../gpu/context/ExecutionConfig.java            |   2 +-
 .../instructions/gpu/context/GPUObject.java     |   9 +-
 .../instructions/gpu/context/JCudaObject.java   | 108 +++++++++++++------
 .../runtime/matrix/data/LibMatrixCUDA.java      |  18 ++--
 4 files changed, 85 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 7cab238..c04e8a4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -80,7 +80,7 @@ public class ExecutionConfig {
 		int maxBlockDim = getMaxBlockDim(deviceNumber);
 		int blockDimX = (int) Math.min(maxBlockDim, rlen);
 		int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
-		int blockDimY = (int)Math.min(Math.ceil(((double)maxBlockDim)/blockDimX), clen);
+		int blockDimY = (int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen);
 		int gridDimY = (int)Math.ceil((double)clen / blockDimY);
 		return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
 	}
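
Why floor instead of ceil: blockDimX * blockDimY must stay within the device's
maximum threads per block (maxBlockDim). With Math.ceil, the product can exceed
that limit (e.g. blockDimX = 1000 and maxBlockDim = 1024 gives blockDimY = 2,
i.e. 2000 threads per block), which produces an invalid launch configuration.
A standalone sketch of the arithmetic, with an assumed maxBlockDim of 1024 and
a hypothetical class name (not SystemML code):

    // Sketch: compares the ceil and floor variants of the blockDimY computation.
    public class BlockDimSketch {
        public static void main(String[] args) {
            int maxBlockDim = 1024;  // assumed max threads per block
            int rlen = 1000, clen = 10;

            int blockDimX = (int) Math.min(maxBlockDim, rlen);                               // 1000
            int ceilY  = (int) Math.min(Math.ceil((double) maxBlockDim / blockDimX), clen);  // 2
            int floorY = (int) Math.min(Math.floor((double) maxBlockDim / blockDimX), clen); // 1

            System.out.println(blockDimX * ceilY);   // 2000 threads per block: over the limit
            System.out.println(blockDimX * floorY);  // 1000 threads per block: within the limit
        }
    }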

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 91bd3ea..bcffa46 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -78,12 +78,9 @@ public abstract class GPUObject
 	public abstract void releaseOutput() throws CacheException;
 	
 	// package-level visibility as these methods are guarded by underlying GPUContext
-	/**
-	 * Allocates memory on the GPU
-	 * @param numElemToAllocate number of elements in dense matrix, -1 for unknown or sparse matrix
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	abstract void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException;
+
+	abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException;
+	abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException;
 	abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
 	abstract long getSizeOnDevice() throws DMLRuntimeException;
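
The change above replaces a single entry point, where numElemToAllocate == -1
signaled a sparse or unknown matrix, with two explicitly named methods, so each
path can validate and size its own inputs. A minimal sketch of the resulting
contract (hypothetical names; Exception stands in for DMLRuntimeException):

    // Sketch: callers branch explicitly instead of passing -1 to mean "sparse".
    abstract class GpuAllocSketch {
        abstract void allocateDenseMatrixOnDevice() throws Exception;
        abstract void allocateSparseMatrixOnDevice() throws Exception;

        void allocate(boolean isSparse) throws Exception {
            if (isSparse)
                allocateSparseMatrixOnDevice();
            else
                allocateDenseMatrixOnDevice();
        }
    }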
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index a3aab96..0c96417 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -90,7 +90,7 @@ public class JCudaObject extends GPUObject {
 		private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
 
 		/**
-		 * Default constructor to help with Factory method {@link #allocateCSRMatrix(long, long, long)}
+		 * Default constructor to help with Factory method {@link #allocateEmpty(long, long)}
 		 */
 		private CSRPointer() {
 			val = new Pointer();
@@ -170,9 +170,10 @@ public class JCudaObject extends GPUObject {
 				return r;
 			}
 			ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
-			r.val = allocate(getDoubleSizeOf(nnz2));
-			r.rowPtr = allocate(getIntSizeOf(rows + 1));
-			r.colInd = allocate(getIntSizeOf(nnz2));
+			// increment cudaAllocCount by 1 for the allocation of all 3 arrays (counted once, on the last)
+			r.val = allocate(getDoubleSizeOf(nnz2), 0);
+			r.rowPtr = allocate(getIntSizeOf(rows + 1), 0);
+			r.colInd = allocate(getIntSizeOf(nnz2), 1);
 			return r;
 		}
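
The 0, 0, 1 arguments above implement a counting convention: the three device
buffers of one CSR matrix (values, row pointers, column indices) are reported
as a single logical allocation in Statistics.cudaAllocCount, incremented once
on the last buffer of the group. A standalone sketch of the convention, with
an AtomicLong standing in for the real Statistics counter and the device
allocation itself omitted:

    import java.util.concurrent.atomic.AtomicLong;

    public class AllocCountSketch {
        static final AtomicLong cudaAllocCount = new AtomicLong();

        // statsCount lets a group of physical buffers count as one logical allocation.
        static void allocate(long sizeInBytes, int statsCount) {
            // (the real method would cudaMalloc and cudaMemset here)
            cudaAllocCount.getAndAdd(statsCount);
        }

        public static void main(String[] args) {
            allocate(8 * 100, 0);  // val:    not counted yet
            allocate(4 * 11,  0);  // rowPtr: not counted yet
            allocate(4 * 100, 1);  // colInd: count the whole CSR group once
            System.out.println(cudaAllocCount.get());  // prints 1
        }
    }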
 		
@@ -245,7 +246,8 @@ public class JCudaObject extends GPUObject {
 		private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
 			cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
             cudaDeviceSynchronize();
-			C.rowPtr = allocate(getIntSizeOf((long)rowsC+1));
+			// Do not increment cudaAllocCount here; all three arrays of the CSR result are counted once, in step 3
+			C.rowPtr = allocate(getIntSizeOf((long)rowsC+1), 0);
 		}
 		
 		/**
@@ -321,8 +323,9 @@ public class JCudaObject extends GPUObject {
 		 * @throws DMLRuntimeException
 		 */
 		private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
-			C.val = allocate(getDoubleSizeOf(C.nnz));
-			C.colInd = allocate(getIntSizeOf(C.nnz));
+			// Increment cudaAllocCount by one now that all three arrays of the CSR matrix are allocated
+			C.val = allocate(getDoubleSizeOf(C.nnz), 0);
+			C.colInd = allocate(getIntSizeOf(C.nnz), 1);
 		}
 
 		// ==============================================================================================
@@ -444,11 +447,12 @@ public class JCudaObject extends GPUObject {
 	 * Allocates temporary space on the device.
 	 * Does not update bookkeeping.
 	 * The caller is responsible for freeing up after usage.
-	 * @param size size to allocate
+	 * @param size size of data (in bytes) to allocate
+	 * @param statsCount amount to increment the cudaAllocCount by
 	 * @return jcuda Pointer
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static Pointer allocate(long size) throws DMLRuntimeException{
+	public static Pointer allocate(long size, int statsCount) throws DMLRuntimeException{
 		Pointer A = new Pointer();
 		ensureFreeSpace(size);
 		long t0 = System.nanoTime();
@@ -456,11 +460,21 @@ public class JCudaObject extends GPUObject {
 		// Set all elements to 0 since newly allocated space will contain garbage
 		cudaMemset(A, 0, size);
 		Statistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
-		Statistics.cudaAllocCount.getAndAdd(1);
+		Statistics.cudaAllocCount.getAndAdd(statsCount);
 		return A;
 	}
 
 	/**
+	 * Convenience method for {@link #allocate(long, int)}, defaults statsCount to 1.
+	 * @param size size of data (in bytes) to allocate
+	 * @return jcuda Pointer
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static Pointer allocate(long size) throws DMLRuntimeException {
+		return allocate(size, 1);
+	}
+
+	/**
 	 * Allocates a sparse and empty {@link JCudaObject}
 	 * This is the result of operations that are both non zero matrices.
 	 * 
@@ -503,6 +517,9 @@ public class JCudaObject extends GPUObject {
 		return isEmptyAndSparseAndAllocated;
 	}
 
+
+	long thisThread = Thread.currentThread().getId();
+
 	/**
 	 * Allocate necessary memory on the GPU for this {@link JCudaObject} instance.
 	 * 
@@ -522,13 +539,9 @@ public class JCudaObject extends GPUObject {
 				mat.setDirty(true);
 				// Don't copy just allocate
 				if (isSparse){
-					long sparseSize = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
-					ensureFreeSpace(sparseSize);
-					allocateMemoryOnDevice(-1);
+					allocateSparseMatrixOnDevice();
 				} else { 	// Dense block, size = numRows * numCols
-					long size = mat.getNumRows() * mat.getNumColumns();
-					ensureFreeSpace(getDoubleSizeOf(size));
-					allocateMemoryOnDevice(size);
+					allocateDenseMatrixOnDevice();
 				}
 				synchronized(evictionLock) {
 					GPUContext.allocatedPointers.add(this);
@@ -624,24 +637,14 @@ public class JCudaObject extends GPUObject {
 		if(!isAllocated())
 			throw new CacheException("Attempting to release an input before allocating it");
 	}
-	
-	/**
-	 * releases output allocated on GPU
-	 * @throws CacheException if data is not allocated
-	 */
-	public synchronized void releaseOutput() throws CacheException {
-		updateReleaseLocks();
-		isDeviceCopyModified = true;
-		if(!isAllocated())
-			throw new CacheException("Attempting to release an output before allocating it");
-	}
 
+	/**
 	@Override
 	void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException {
 		if(!isAllocated()) {
 			long start = System.nanoTime();
 			if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
-				setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows())); 
+				setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
 				numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
 				JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
 				isInSparseFormat = true;
@@ -662,13 +665,52 @@ public class JCudaObject extends GPUObject {
 				cudaMalloc(jcudaDenseMatrixPtr,  numBytes);
 				JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
 			}
-			
+
 			Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
 			Statistics.cudaAllocCount.addAndGet(1);
 
 		}
 	}
-	
+	 */
+
+	@Override
+	void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
+		assert !isAllocated() : "Internal error - trying to allocate a dense matrix on a JCudaObject that is already allocated";
+		long rows = mat.getNumRows();
+		long cols = mat.getNumColumns();
+		assert rows > 0 : "Internal error - invalid number of rows when allocating a dense matrix";
+		assert cols > 0 : "Internal error - invalid number of columns when allocating a dense matrix";
+		long size = getDoubleSizeOf(rows * cols);
+		Pointer tmp = allocate(size);
+		setDenseMatrixCudaPointer(tmp);
+		setDeviceModify(size);
+	}
+
+	@Override
+	void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
+		assert !isAllocated() : "Internal error - trying to allocate a sparse matrix on a JCudaObject that is already allocated";
+		long rows = mat.getNumRows();
+		long nnz = mat.getNnz();
+		assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
+		assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+		CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows);
+		setSparseMatrixCudaPointer(tmp);
+		long size = CSRPointer.estimateSize(nnz, rows);
+		setDeviceModify(size);
+	}
+
+	/**
+	 * releases output allocated on GPU
+	 * @throws CacheException if data is not allocated
+	 */
+	@Override
+	public synchronized void releaseOutput() throws CacheException {
+		updateReleaseLocks();
+		isDeviceCopyModified = true;
+		if(!isAllocated())
+			throw new CacheException("Attempting to release an output before allocating it");
+	}
+
 	@Override
 	public void setDeviceModify(long numBytes) {
 		this.numLocks.addAndGet(1);
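
One caveat about the assert guards in the two new allocate methods above: Java
assertions are disabled by default and only fire when the JVM is started with
-ea, so in a normal run those checks are no-ops. A check that must always hold
would throw unconditionally; a sketch of that variant (hypothetical class, with
IllegalStateException standing in for DMLRuntimeException):

    // Sketch: always-on form of the same precondition check.
    class AllocGuardSketch {
        private boolean allocated;

        void allocateDenseMatrixOnDevice() {
            if (allocated)
                throw new IllegalStateException(
                        "trying to allocate a dense matrix on an object that is already allocated");
            allocated = true;
            // ... perform the actual device allocation here ...
        }
    }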
@@ -765,8 +807,7 @@ public class JCudaObject extends GPUObject {
 				colInd = csrBlock.indexes();
 				values = csrBlock.values();	
 			}
-			ensureFreeSpace(CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows()));
-			allocateMemoryOnDevice(-1);
+			allocateSparseMatrixOnDevice();
 			synchronized(evictionLock) {
 				GPUContext.allocatedPointers.add(this);
 			}
@@ -787,8 +828,7 @@ public class JCudaObject extends GPUObject {
 				data = new double[tmp.getNumRows()*tmp.getNumColumns()];
 			
 			// Copy dense block
-			ensureFreeSpace(getDoubleSizeOf(data.length));
-			allocateMemoryOnDevice(data.length);
+			allocateDenseMatrixOnDevice();
 			synchronized(evictionLock) {
 				GPUContext.allocatedPointers.add(this);
 			}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index c836c0d..2af894b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -382,13 +382,9 @@ public class LibMatrixCUDA {
 		try {
 			alpha = pointerTo(1.0f);
 			beta = pointerTo(0.0f);
-			int N = (int) in.getNumRows();
-			int H = (int) in.getNumColumns();
-			int W = 1;
-			if(H % 2 == 0) {
-				H /= 2;
-				W = H;
-			}
+			long N = in.getNumRows();
+			long H = in.getNumColumns();
+			long W = 1;
 			Pointer srcData = ((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr;
 			
 			MatrixObject output = ec.getMatrixObject(outputName);
@@ -398,13 +394,13 @@ public class LibMatrixCUDA {
 			if(N*H*W >= numDoublesIn2GB) {
 				// Invokes relu(double* A,  double* ret, int rlen, int clen)
 				kernels.launchKernel("relu",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(N, (int) H*W), 
-						srcData, dstData, N, (int) H*W);
+						ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)), 
+						srcData, dstData, (int)N, (int) H*W);
 			}
 			else {
 				// Allocate descriptors
-				srcTensorDesc = allocateTensorDescriptor(N, 1, H, W);
-				dstTensorDesc = allocateTensorDescriptor(N, 1, H, W);
+				srcTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
+				dstTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
 				
 	            cudnnActivationForward(cudnnHandle, CUDNN_ACTIVATION_RELU, 
 	                alpha, srcTensorDesc, srcData,
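
The switch from int to long for N, H, and W matters because N*H*W is compared
against numDoublesIn2GB before any cast: with int variables the product can
wrap around before the comparison, silently picking the wrong code path. The
casts now happen only at the CUDA API boundary. Note also the precedence
difference in the casts above: (int) (H*W) casts the product, while (int) H*W
casts only H and leaves the multiplication in long. A standalone sketch of the
wrap-around, with the value of numDoublesIn2GB assumed to be 2GB worth of
doubles:

    public class OverflowSketch {
        public static void main(String[] args) {
            long numDoublesIn2GB = (1L << 31) / 8;  // assumed: 268,435,456 doubles
            int n = 65_536, h = 65_536;             // 2^16 x 2^16 elements

            System.out.println((n * h) >= numDoublesIn2GB);        // false: int product wraps to 0
            System.out.println(((long) n * h) >= numDoublesIn2GB); // true: 4,294,967,296 elements
        }
    }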