You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/05/18 01:46:57 UTC
incubator-systemml git commit: [HOTFIX] for sparse GPU transpose
Repository: incubator-systemml
Updated Branches:
refs/heads/master 1fc764b9b -> c3aeb48bf
[HOTFIX] for sparse GPU transpose
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c3aeb48b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c3aeb48b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c3aeb48b
Branch: refs/heads/master
Commit: c3aeb48bf6b54febb861b7b4381c3d7af450a8e8
Parents: 1fc764b
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 17 18:46:21 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 17 18:46:21 2017 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixCUDA.java | 118 +++++++++++--------
1 file changed, 68 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c3aeb48b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 074119b..b023159 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -52,6 +52,7 @@ import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -61,6 +62,8 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import jcuda.jcusparse.cusparseAction;
+import jcuda.jcusparse.cusparseIndexBase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
@@ -2732,7 +2735,7 @@ public class LibMatrixCUDA {
* Performs sparse and dense dgeam given two input matrices
* C = alpha* op( A ) + beta* op ( B )
* where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
- *
+ * To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
@@ -2756,35 +2759,6 @@ public class LibMatrixCUDA {
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
- int lda = (int) in1.getNumColumns();
- int ldb = (int) in2.getNumColumns();
- int m = (int) in1.getNumColumns();
- int n = (int) in2.getNumRows();
- if (isLeftTransposed && isRightTransposed) {
- m = (int) in1.getNumRows();
- n = (int) in2.getNumColumns();
- }
- else if (isLeftTransposed) {
- m = (int) in1.getNumRows();
- } else if (isRightTransposed) {
- n = (int) in2.getNumColumns();
- }
- int ldc = m;
-
-
-
- /**
- int m = (int) in1.getNumRows();
- int n = (int) in1.getNumColumns();
- if(!isLeftTransposed && isRightTransposed) {
- m = (int) in1.getNumColumns();
- n = (int) in1.getNumRows();
- }
- int lda = isLeftTransposed ? n : m;
- int ldb = isRightTransposed ? n : m;
- int ldc = m;
- **/
-
MatrixObject out = ec.getMatrixObject(outputName);
boolean isSparse1 = isInSparseFormat(gCtx, in1);
boolean isSparse2 = isInSparseFormat(gCtx, in2);
@@ -2792,39 +2766,83 @@ public class LibMatrixCUDA {
long t0=0,t1=0;
// TODO: Implement sparse-dense matrix cublasDgeam kernel
if(isSparse1 || isSparse2) {
+ int m = (int)in1.getNumRows();
+ int n = (int)in1.getNumColumns();
// Invoke cuSparse when either are in sparse format
// Perform sparse-sparse dgeam
- if(!isInSparseFormat(gCtx, in1)) {
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ if (!isInSparseFormat(gCtx, in1)) {
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ t0 = System.nanoTime();
in1.getGPUObject(gCtx).denseToSparse();
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+ System.nanoTime() - t0);
}
CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
- if(!isInSparseFormat(gCtx, in2)) {
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ if (!isInSparseFormat(gCtx, in2)) {
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ t0 = System.nanoTime();
in2.getGPUObject(gCtx).denseToSparse();
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+ System.nanoTime() - t0);
}
CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
ec.allocateGPUMatrixObject(outputName);
+ out.getGPUObject(gCtx).addReadLock();
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
+ if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
+ // Special case for transpose
- out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
- //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
- out.getGPUObject(gCtx).addReadLock();
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
- B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
- C.descr, C.val, C.rowPtr, C.colInd);
- //cudaDeviceSynchronize;
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
- }
- else {
+ int nnz = (int)A.nnz;
+ CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
+ out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+ cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
+ } else {
+ // General case (cusparse does not accept the transpose operator for dgeam)
+ // TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a transpose
+ if (isLeftTransposed || isRightTransposed) {
+ throw new DMLRuntimeException(
+ "Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
+ }
+
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ t1 = System.nanoTime();
+ CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+ System.nanoTime() - t1);
+
+ out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+ //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ t0 = System.nanoTime();
+ JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
+ B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
+ //cudaDeviceSynchronize;
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB,
+ System.nanoTime() - t0);
+ }
+ } else {
// Dense-Dense dgeam
+
+ int lda = (int) in1.getNumColumns();
+ int ldb = (int) in2.getNumColumns();
+ int m = (int) in1.getNumColumns();
+ int n = (int) in2.getNumRows();
+ if (isLeftTransposed && isRightTransposed) {
+ m = (int) in1.getNumRows();
+ n = (int) in2.getNumColumns();
+ }
+ else if (isLeftTransposed) {
+ m = (int) in1.getNumRows();
+ } else if (isRightTransposed) {
+ n = (int) in2.getNumColumns();
+ }
+ int ldc = m;
+
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix