You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/05/18 01:46:57 UTC
incubator-systemml git commit: [HOTFIX] for sparse GPU transpose

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1fc764b9b -> c3aeb48bf


[HOTFIX] for sparse GPU transpose


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c3aeb48b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c3aeb48b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c3aeb48b

Branch: refs/heads/master
Commit: c3aeb48bf6b54febb861b7b4381c3d7af450a8e8
Parents: 1fc764b
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 17 18:46:21 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 17 18:46:21 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixCUDA.java      | 118 +++++++++++--------
 1 file changed, 68 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c3aeb48b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 074119b..b023159 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -52,6 +52,7 @@ import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -61,6 +62,8 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import jcuda.jcusparse.cusparseAction;
+import jcuda.jcusparse.cusparseIndexBase;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -2732,7 +2735,7 @@ public class LibMatrixCUDA {
 	 * Performs sparse and dense dgeam given two input matrices
 	 * C = alpha* op( A ) + beta* op ( B )
 	 * where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
-	 *
+	 * To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true
 	 * @param ec execution context
 	 * @param gCtx a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
@@ -2756,35 +2759,6 @@ public class LibMatrixCUDA {
 		int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 		int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 
-		int lda = (int) in1.getNumColumns();
-		int ldb = (int) in2.getNumColumns();
-		int m = (int) in1.getNumColumns();
-		int n = (int) in2.getNumRows();
-		if (isLeftTransposed && isRightTransposed) {
-			m = (int) in1.getNumRows();
-			n = (int) in2.getNumColumns();
-		}
-		else if (isLeftTransposed) {
-			m = (int) in1.getNumRows();
-		} else if (isRightTransposed) {
-			n = (int) in2.getNumColumns();
-		}
-		int ldc = m;
-
-
-
-		/**
-		int m = (int) in1.getNumRows();
-		int n = (int) in1.getNumColumns();
-		if(!isLeftTransposed && isRightTransposed) {
-			m = (int) in1.getNumColumns();
-			n = (int) in1.getNumRows();
-		}
-		int lda = isLeftTransposed ? n : m;
-		int ldb = isRightTransposed ? n : m;
-		int ldc = m;
-		**/
-
 		MatrixObject out = ec.getMatrixObject(outputName);
 		boolean isSparse1 = isInSparseFormat(gCtx, in1);
 		boolean isSparse2 = isInSparseFormat(gCtx, in2);
@@ -2792,39 +2766,83 @@ public class LibMatrixCUDA {
 		long t0=0,t1=0;
 		// TODO: Implement sparse-dense matrix cublasDgeam kernel
 		if(isSparse1 || isSparse2) {
+			int m = (int)in1.getNumRows();
+			int n = (int)in1.getNumColumns();
 			// Invoke cuSparse when either are in sparse format
 			// Perform sparse-sparse dgeam
-			if(!isInSparseFormat(gCtx, in1)) {
-				if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+			if (!isInSparseFormat(gCtx, in1)) {
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					t0 = System.nanoTime();
 				in1.getGPUObject(gCtx).denseToSparse();
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+							System.nanoTime() - t0);
 			}
 			CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-			if(!isInSparseFormat(gCtx, in2)) {
-				if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+			if (!isInSparseFormat(gCtx, in2)) {
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					t0 = System.nanoTime();
 				in2.getGPUObject(gCtx).denseToSparse();
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+							System.nanoTime() - t0);
 			}
 			CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 
 			ec.allocateGPUMatrixObject(outputName);
+			out.getGPUObject(gCtx).addReadLock();
 
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
+			if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
+				// Special case for transpose
 
-			out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-			//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
-			out.getGPUObject(gCtx).addReadLock();
-			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
-							B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
-							C.descr, C.val, C.rowPtr, C.colInd);
-			//cudaDeviceSynchronize;
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
-		}
-		else {
+				int nnz = (int)A.nnz;
+				CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
+				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+				cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
+			} else {
+				// General case (cusparse does not accept the transpose operator for dgeam)
+				// TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a transpose
+				if (isLeftTransposed || isRightTransposed) {
+					throw new DMLRuntimeException(
+							"Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
+				}
+
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					t1 = System.nanoTime();
+				CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+							System.nanoTime() - t1);
+
+				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+				//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					t0 = System.nanoTime();
+				JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
+						B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
+				//cudaDeviceSynchronize;
+				if (GPUStatistics.DISPLAY_STATISTICS)
+					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB,
+							System.nanoTime() - t0);
+			}
+		} else {
 			// Dense-Dense dgeam
+
+			int lda = (int) in1.getNumColumns();
+			int ldb = (int) in2.getNumColumns();
+			int m = (int) in1.getNumColumns();
+			int n = (int) in2.getNumRows();
+			if (isLeftTransposed && isRightTransposed) {
+				m = (int) in1.getNumRows();
+				n = (int) in2.getNumColumns();
+			}
+			else if (isLeftTransposed) {
+				m = (int) in1.getNumRows();
+			} else if (isRightTransposed) {
+				n = (int) in2.getNumColumns();
+			}
+			int ldc = m;
+
 			Pointer A = getDensePointer(gCtx, in1, instName);
 			Pointer B = getDensePointer(gCtx, in2, instName);
 			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);	// Allocated the dense output matrix