You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2016/11/17 22:38:54 UTC
incubator-systemml git commit: [SYSTEMML-446] Miscellaneous bug fixes
to SYSTEMML-446
Repository: incubator-systemml
Updated Branches:
refs/heads/master 1164f99be -> 3dace3869
[SYSTEMML-446] Miscellaneous bug fixes to SYSTEMML-446
Closes #289.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3dace386
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3dace386
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3dace386
Branch: refs/heads/master
Commit: 3dace3869e8216d82f9990be438cfe9e76deb333
Parents: 1164f99
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu Nov 17 14:33:32 2016 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Thu Nov 17 14:33:32 2016 -0800
----------------------------------------------------------------------
.../gpu/context/ExecutionConfig.java | 2 +-
.../instructions/gpu/context/GPUObject.java | 9 +-
.../instructions/gpu/context/JCudaObject.java | 108 +++++++++++++------
.../runtime/matrix/data/LibMatrixCUDA.java | 18 ++--
4 files changed, 85 insertions(+), 52 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 7cab238..c04e8a4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -80,7 +80,7 @@ public class ExecutionConfig {
int maxBlockDim = getMaxBlockDim(deviceNumber);
int blockDimX = (int) Math.min(maxBlockDim, rlen);
int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
- int blockDimY = (int)Math.min(Math.ceil(((double)maxBlockDim)/blockDimX), clen);
+ int blockDimY = (int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen);
int gridDimY = (int)Math.ceil((double)clen / blockDimY);
return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 91bd3ea..bcffa46 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -78,12 +78,9 @@ public abstract class GPUObject
public abstract void releaseOutput() throws CacheException;
// package-level visibility as these methods are guarded by underlying GPUContext
- /**
- * Allocates memory on the GPU
- * @param numElemToAllocate number of elements in dense matrix, -1 for unknown or sparse matrix
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- abstract void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException;
+
+ abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException;
+ abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException;
abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
abstract long getSizeOnDevice() throws DMLRuntimeException;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index a3aab96..0c96417 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -90,7 +90,7 @@ public class JCudaObject extends GPUObject {
private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
/**
- * Default constructor to help with Factory method {@link #allocateCSRMatrix(long, long, long)}
+ * Default constructor to help with Factory method {@link #allocateEmpty(long, long)}
*/
private CSRPointer() {
val = new Pointer();
@@ -170,9 +170,10 @@ public class JCudaObject extends GPUObject {
return r;
}
ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
- r.val = allocate(getDoubleSizeOf(nnz2));
- r.rowPtr = allocate(getIntSizeOf(rows + 1));
- r.colInd = allocate(getIntSizeOf(nnz2));
+ // increment the cudaCount by 1 for the allocation of all 3 arrays
+ r.val = allocate(getDoubleSizeOf(nnz2), 0);
+ r.rowPtr = allocate(getIntSizeOf(rows + 1), 0);
+ r.colInd = allocate(getIntSizeOf(nnz2), 1);
return r;
}
@@ -245,7 +246,8 @@ public class JCudaObject extends GPUObject {
private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
cudaDeviceSynchronize();
- C.rowPtr = allocate(getIntSizeOf((long)rowsC+1));
+ // Do not increment the cudaCount of allocations on GPU
+ C.rowPtr = allocate(getIntSizeOf((long)rowsC+1), 0);
}
/**
@@ -321,8 +323,9 @@ public class JCudaObject extends GPUObject {
* @throws DMLRuntimeException
*/
private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
- C.val = allocate(getDoubleSizeOf(C.nnz));
- C.colInd = allocate(getIntSizeOf(C.nnz));
+ // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
+ C.val = allocate(getDoubleSizeOf(C.nnz), 0);
+ C.colInd = allocate(getIntSizeOf(C.nnz), 1);
}
// ==============================================================================================
@@ -444,11 +447,12 @@ public class JCudaObject extends GPUObject {
* Allocates temporary space on the device.
* Does not update bookkeeping.
* The caller is responsible for freeing up after usage.
- * @param size size to allocate
+ * @param size Size of data (in bytes) to allocate
+ * @param statsCount amount to increment the cudaAllocCount by
* @return jcuda Pointer
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public static Pointer allocate(long size) throws DMLRuntimeException{
+ public static Pointer allocate(long size, int statsCount) throws DMLRuntimeException{
Pointer A = new Pointer();
ensureFreeSpace(size);
long t0 = System.nanoTime();
@@ -456,11 +460,21 @@ public class JCudaObject extends GPUObject {
// Set all elements to 0 since newly allocated space will contain garbage
cudaMemset(A, 0, size);
Statistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
- Statistics.cudaAllocCount.getAndAdd(1);
+ Statistics.cudaAllocCount.getAndAdd(statsCount);
return A;
}
/**
+ * Convenience method for {@link #allocate(long, int)}, defaults statsCount to 1.
+ * @param size size of data (in bytes) to allocate
+ * @return
+ * @throws DMLRuntimeException
+ */
+ public static Pointer allocate(long size) throws DMLRuntimeException {
+ return allocate(size, 1);
+ }
+
+ /**
* Allocates a sparse and empty {@link JCudaObject}
* This is the result of operations that are both non zero matrices.
*
@@ -503,6 +517,9 @@ public class JCudaObject extends GPUObject {
return isEmptyAndSparseAndAllocated;
}
+
+ long thisThread = Thread.currentThread().getId();
+
/**
* Allocate necessary memory on the GPU for this {@link JCudaObject} instance.
*
@@ -522,13 +539,9 @@ public class JCudaObject extends GPUObject {
mat.setDirty(true);
// Don't copy just allocate
if (isSparse){
- long sparseSize = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
- ensureFreeSpace(sparseSize);
- allocateMemoryOnDevice(-1);
+ allocateSparseMatrixOnDevice();
} else { // Dense block, size = numRows * numCols
- long size = mat.getNumRows() * mat.getNumColumns();
- ensureFreeSpace(getDoubleSizeOf(size));
- allocateMemoryOnDevice(size);
+ allocateDenseMatrixOnDevice();
}
synchronized(evictionLock) {
GPUContext.allocatedPointers.add(this);
@@ -624,24 +637,14 @@ public class JCudaObject extends GPUObject {
if(!isAllocated())
throw new CacheException("Attempting to release an input before allocating it");
}
-
- /**
- * releases output allocated on GPU
- * @throws CacheException if data is not allocated
- */
- public synchronized void releaseOutput() throws CacheException {
- updateReleaseLocks();
- isDeviceCopyModified = true;
- if(!isAllocated())
- throw new CacheException("Attempting to release an output before allocating it");
- }
+ /**
@Override
void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException {
if(!isAllocated()) {
long start = System.nanoTime();
if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
- setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
+ setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
isInSparseFormat = true;
@@ -662,13 +665,52 @@ public class JCudaObject extends GPUObject {
cudaMalloc(jcudaDenseMatrixPtr, numBytes);
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
}
-
+
Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
Statistics.cudaAllocCount.addAndGet(1);
}
}
-
+ */
+
+ @Override
+ void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
+ assert !isAllocated() : "Internal error - trying to allocated dense matrix to a JCudaObject that is already allocated";
+ long rows = mat.getNumRows();
+ long cols = mat.getNumColumns();
+ assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
+ assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;";
+ long size = getDoubleSizeOf(rows * cols);
+ Pointer tmp = allocate(size);
+ setDenseMatrixCudaPointer(tmp);
+ setDeviceModify(size);
+ }
+
+ @Override
+ void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
+ assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a JCudaObject that is already allocated";
+ long rows = mat.getNumRows();
+ long nnz = mat.getNnz();
+ assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
+ assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+ CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows);
+ setSparseMatrixCudaPointer(tmp);
+ long size = CSRPointer.estimateSize(nnz, rows);
+ setDeviceModify(size);
+ }
+
+ /**
+ * releases output allocated on GPU
+ * @throws CacheException if data is not allocated
+ */
+ @Override
+ public synchronized void releaseOutput() throws CacheException {
+ updateReleaseLocks();
+ isDeviceCopyModified = true;
+ if(!isAllocated())
+ throw new CacheException("Attempting to release an output before allocating it");
+ }
+
@Override
public void setDeviceModify(long numBytes) {
this.numLocks.addAndGet(1);
@@ -765,8 +807,7 @@ public class JCudaObject extends GPUObject {
colInd = csrBlock.indexes();
values = csrBlock.values();
}
- ensureFreeSpace(CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows()));
- allocateMemoryOnDevice(-1);
+ allocateSparseMatrixOnDevice();
synchronized(evictionLock) {
GPUContext.allocatedPointers.add(this);
}
@@ -787,8 +828,7 @@ public class JCudaObject extends GPUObject {
data = new double[tmp.getNumRows()*tmp.getNumColumns()];
// Copy dense block
- ensureFreeSpace(getDoubleSizeOf(data.length));
- allocateMemoryOnDevice(data.length);
+ allocateDenseMatrixOnDevice();
synchronized(evictionLock) {
GPUContext.allocatedPointers.add(this);
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dace386/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index c836c0d..2af894b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -382,13 +382,9 @@ public class LibMatrixCUDA {
try {
alpha = pointerTo(1.0f);
beta = pointerTo(0.0f);
- int N = (int) in.getNumRows();
- int H = (int) in.getNumColumns();
- int W = 1;
- if(H % 2 == 0) {
- H /= 2;
- W = H;
- }
+ long N = in.getNumRows();
+ long H = in.getNumColumns();
+ long W = 1;
Pointer srcData = ((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr;
MatrixObject output = ec.getMatrixObject(outputName);
@@ -398,13 +394,13 @@ public class LibMatrixCUDA {
if(N*H*W >= numDoublesIn2GB) {
// Invokes relu(double* A, double* ret, int rlen, int clen)
kernels.launchKernel("relu",
- ExecutionConfig.getConfigForSimpleMatrixOperations(N, (int) H*W),
- srcData, dstData, N, (int) H*W);
+ ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)),
+ srcData, dstData, (int)N, (int) H*W);
}
else {
// Allocate descriptors
- srcTensorDesc = allocateTensorDescriptor(N, 1, H, W);
- dstTensorDesc = allocateTensorDescriptor(N, 1, H, W);
+ srcTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
+ dstTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
cudnnActivationForward(cudnnHandle, CUDNN_ACTIVATION_RELU,
alpha, srcTensorDesc, srcData,