You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/20 20:29:38 UTC
systemml git commit: [SYSTEMML-446] [SYSTEMML-702] Updated the sparse
matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse
conversion
Repository: systemml
Updated Branches:
refs/heads/master 323dd72a8 -> 6de8f051d
[SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion
1. The goal of this PR is not to improve performance (for example: by considering the cost of sparse-to-dense vs FLOPs required given a memory budget) but instead to minimize sparse-to-dense conversion in the GPU matrix multiplication operator.
2. If matmult uses unnecessary sparse-to-dense conversion, then we run into
risk of one of the two situations:
- In best case some of the matmult won't be pushed to GPU under worst-case memory budget.
- On other hand, if these conversion are not accounted for, they may cause OOMs.
3. Every operator (except dense-sparse matrix multiplication) uses only memory allocated to input and output matrices.
4. Since there is no CuSPARSE kernel for dense-sparse matrix multiplication operator, we either have to transpose the output after performing sparse-dense matrix multiplication or perform dense-dense matrix multiplication after converting the sparse input to dense format.
Closes #686.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6de8f051
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6de8f051
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6de8f051
Branch: refs/heads/master
Commit: 6de8f051daaefdab403c0edd3c7beb30c9619033
Parents: 323dd72
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Oct 20 12:29:20 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Oct 20 13:29:20 2017 -0700
----------------------------------------------------------------------
.../java/org/apache/sysml/hops/AggBinaryOp.java | 30 +-
.../gpu/AggregateBinaryGPUInstruction.java | 3 +-
.../instructions/gpu/GPUInstruction.java | 1 +
.../instructions/gpu/context/CSRPointer.java | 7 +-
.../instructions/gpu/context/GPUObject.java | 2 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 541 +------------------
.../runtime/matrix/data/LibMatrixCuMatMult.java | 480 ++++++++++++++++
.../test/gpu/MatrixMultiplicationOpTest.java | 98 +++-
8 files changed, 581 insertions(+), 581 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index cfa99a4..d733d6a 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -371,40 +371,16 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
double ret = 0;
if (isGPUEnabled()) {
- // In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse
- // When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2)
- // When dense block is converted to sparse and a sparse MM takes place, we need (dim1 * dim2 * 2)
-
Hop in1 = _input.get(0);
Hop in2 = _input.get(1);
double in1Sparsity = OptimizerUtils.getSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
double in2Sparsity = OptimizerUtils.getSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-
boolean in1Sparse = in1Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
boolean in2Sparse = in2Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
-
- boolean in1UltraSparse = in1Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
- boolean in2UltraSparse = in2Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-
- // For Matmult X * Y, if X is sparse, Y is dense, X is converted to dense
- // If X is ultrasparse, Y is converted to sparse
- if (in1Sparse ^ in2Sparse) { // one sparse, one dense
- if (in1Sparse) {
- if (in1UltraSparse) {
- ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
- } else {
- ret += OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
- }
- } else if (in2Sparse) {
- if (in2UltraSparse) {
- ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
- } else {
- ret += OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
- }
- }
-
+ if(in1Sparse && !in2Sparse) {
+ // Only in sparse-dense cases, we need additional memory budget for GPU
+ ret += OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0);
}
-
}
//account for potential final dense-sparse transformation (worst-case sparse representation)
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index dfc000f..29a1ead 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
@@ -94,7 +95,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction {
int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+ LibMatrixCuMatMult.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ed2a4a2..9dd163d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -78,6 +78,7 @@ public abstract class GPUInstruction extends Instruction {
public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = "Mdmdm"; // time spent in matrix mult of dense matrices
public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector
public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices
+ public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = "Msmdm"; // time spent in matrix mult of sparse matrices
public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update
// Other BLAS instructions
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 5a6e21c..7176a9c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -37,7 +37,9 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
import jcuda.Pointer;
import jcuda.Sizeof;
@@ -495,11 +497,13 @@ public class CSRPointer {
* @param cublasHandle a valid {@link cublasHandle}
* @param rows number of rows in this CSR matrix
* @param cols number of columns in this CSR matrix
+ * @param instName name of the invoking instruction to record{@link Statistics}.
* @return A {@link Pointer} to the allocated dense matrix (in column-major format)
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
- int cols) throws DMLRuntimeException {
+ int cols, String instName) throws DMLRuntimeException {
+ long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
+ getGPUContext());
long size = ((long) rows) * getDoubleSizeOf((long) cols);
@@ -512,6 +516,7 @@ public class CSRPointer {
} else {
LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
}
+ if (GPUStatistics.DISPLAY_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
return A;
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index feb34bc..06327db 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -458,7 +458,7 @@ public class GPUObject {
throw new DMLRuntimeException("Expected cusparse to be initialized");
int rows = toIntExact(mat.getNumRows());
int cols = toIntExact(mat.getNumColumns());
- setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
+ setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
}
/**
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index f4a00ab..7e25299 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -22,18 +22,11 @@ package org.apache.sysml.runtime.matrix.data;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -175,11 +168,11 @@ public class LibMatrixCUDA {
}
- private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
+ protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
return gCtx.getCusparseHandle();
}
- private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
+ protected static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
return gCtx.getCublasHandle();
}
@@ -410,7 +403,7 @@ public class LibMatrixCUDA {
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(isInSparseFormat(gCtx, left)) {
// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
- matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
+ LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
return;
}
@@ -481,534 +474,6 @@ public class LibMatrixCUDA {
//******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
//********************************************************************/
- //********************************************************************/
- //***************** MATRIX MULTIPLY Functions ************************/
- //********************************************************************/
-
- /**
- * Matrix multiply on GPU
- * Examines sparsity and shapes and routes call to appropriate method
- * from cuBLAS or cuSparse
- * C = op(A) x op(B)
- * <p>
- * Memory Requirements -
- * Both dense - inputs, output, no intermediate
- * Both sparse - inputs, output, no intermediate
- * One sparse, one dense - inputs, output, intermediates - (input_dim1 * input_dim2) OR (input_dim1 * input_dim2 + input in sparse format)
- *
- * @param ec Current {@link ExecutionContext} instance
- * @param gCtx a valid {@link GPUContext}
- * @param instName name of the invoking instruction to record{@link Statistics}.
- * @param left Matrix A
- * @param right Matrix B
- * @param outputName Name of the output matrix C (in code generated after LOP layer)
- * @param isLeftTransposed op for A, transposed or not
- * @param isRightTransposed op for B, tranposed or not
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- * @return output of matrix multiply
- */
- public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
- if (ec.getGPUContext(0) != gCtx)
- throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
- if(LOG.isTraceEnabled()) {
- LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
- }
- if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated())
- throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
-
- boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse();
- boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
-
- MatrixObject output = ec.getMatrixObject(outputName);
-
- long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
- long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
-
- if (bothDense) { // Dense C = Dense A * Dense B
- // For both dense, do cuBLAS
- getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix
- denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
- }
- else if (bothSparse){ // Sparse C = Sparse A * Sparse B
- ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
- bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
- }
- else { // Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
- // Convert the dense to sparse and use the cusparseDcsrgemm routine
- ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
- eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
- }
-
- return output;
- }
-
- /**
- * One of the matrices is sparse, the other dense
- * C = op(A) x op(B)
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param output allocated output object for C on host to which GPU output will be attached
- * @param left Matrix A on host
- * @param right Matrix B on host
- * @param isLeftTransposed op for A, tranposed or not
- * @param isRightTransposed op for B, transposed or not
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
- int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
- int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
- int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
- int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
- if(k != k1)
- throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
- if(m == -1 || n == -1 || k == -1)
- throw new DMLRuntimeException("Incorrect dimensions");
-
-
- if (left.getGPUObject(gCtx).isSparse()) {
- // Left sparse, right dense
- sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
- } else {
- // Left dense, right sparse
- denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
- }
- }
-
- /**
- * C = op(A) * op(B) where A is dense and B is sparse
- * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
- * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param left {@link MatrixObject} of A
- * @param right {@link MatrixObject} of B
- * @param output {@link MatrixObject} of the output matrix C
- * @param isLeftTransposed whether matrix A needs to be transposed
- * @param isRightTransposed whether matrix B needs to be transposed
- * @param m ?
- * @param n ?
- * @param k ?
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
- throws DMLRuntimeException {
- // right sparse, left dense
- CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
- Pointer ADense = getDensePointer(gCtx, left, instName);
- if (B.isUltraSparse(k, n)){
- if(LOG.isTraceEnabled()) {
- LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
- }
-
- // Convert left to CSR and do cuSparse matmul
- int rowsA = (int)left.getNumRows();
- int colsA = (int)left.getNumColumns();
-
- long t0=0,t1=0, t2=0;
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
- sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
- A.deallocate();
- gCtx.cudaFreeHelper(AT);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
- } else {
- if(LOG.isTraceEnabled()) {
- LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
- }
- // Convert right to dense and do a cuBlas matmul
- // BDenseTransposed is a column major matrix
- // Note the arguments to denseDenseMatmult to accommodate for this.
- long t0=0, t1=0;
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix
- if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
- Pointer C = getDensePointer(gCtx, output, instName);
- denseDenseMatmult(gCtx, instName, C,
- toInt(left.getNumRows()), toInt(left.getNumColumns()),
- toInt(right.getNumColumns()), toInt(right.getNumRows()),
- isLeftTransposed, !isRightTransposed,
- ADense, BDenseTransposed);
-
- gCtx.cudaFreeHelper(instName, BDenseTransposed);
- }
- }
-
- /**
- * * C = op(A) * op(B) where A is sparse and B is dense
- * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
- * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param output the output matrix object
- * @param left matrix A
- * @param right matrix B
- * @param isLeftTransposed if A needs to be transposed
- * @param isRightTransposed if B needs to be transposed
- * @param m ?
- * @param n ?
- * @param k ?
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
- throws DMLRuntimeException {
- CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
- Pointer BDense = getDensePointer(gCtx, right, instName);
-
- if (n == 1){
- // Sparse Matrix - Dense Vector multiply
- sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns());
-
- } else {
-
- long t0=0, t1=0, t2=0;
- // Sparse Matrix Dense Matrix multiply
- if (A.isUltraSparse(m, k)){
- if(LOG.isTraceEnabled()) {
- LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
- }
- // Convert right to CSR and do cuSparse matmul
- int rowsB = (int)right.getNumRows();
- int colsB = (int)right.getNumColumns();
-
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
-
- sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
- B.deallocate();
- gCtx.cudaFreeHelper(BT);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
- } else {
- if(LOG.isTraceEnabled()) {
- LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
- }
- // Convert left to dense and do a cuBlas matmul
- // ADenseTransposed is a column major matrix
- // Note the arguments to denseDenseMatmult to accommodate for this.
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix
- if (allocated && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-
- Pointer C = getDensePointer(gCtx, output, instName);
- denseDenseMatmult(gCtx, instName, C,
- toInt(left.getNumColumns()), toInt(left.getNumRows()),
- toInt(right.getNumRows()), toInt(right.getNumColumns()),
- !isLeftTransposed, isRightTransposed,
- ADenseTransposed, BDense);
-
- gCtx.cudaFreeHelper(instName, ADenseTransposed);
- }
- }
- }
-
- /**
- * C = op(A) x B
- * A is a sparse matrix, B is a dense vector
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param output allocated output on the host, to which the GPU output C will be attached
- * @param A sparse matrix A on the GPU
- * @param B_dense dense matrix/vector B on the GPU
- * @param isATranposed op for A, tranposed or not
- * @param m number of rows in A (not op(A))
- * @param k number of cols in A or number of rows in B (not op(A) or op(B))
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed,
- int m, int k) throws DMLRuntimeException {
- if(LOG.isTraceEnabled()) {
- LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
- }
- int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
- long size = m * Sizeof.DOUBLE;
- if (isATranposed){
- size = k * Sizeof.DOUBLE;
- transA = CUSPARSE_OPERATION_TRANSPOSE;
- }
- Pointer C_dense = gCtx.allocate(instName, (int)size);
- long t1=0;
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
- //cudaDeviceSynchronize; // Since cusparseDcsrmv is asynchronously executed
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
-
- output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
- }
-
- /**
- * Sparse C = Sparse op(A) * Sparse op(B)
- * Reroutes call to sparse matrix-vector mult if needed
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param output ?
- * @param instName name of the invoking instruction to record{@link Statistics}.
- * @param left ?
- * @param right ?
- * @param isLeftTransposed ?
- * @param isRightTransposed ?
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
- int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
- int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
- int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
- int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
- if(k != k1)
- throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
- if(m == -1 || n == -1 || k == -1)
- throw new DMLRuntimeException("Incorrect dimensions");
-
- CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
- CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-
- // TODO if (m == 1) { // Vector-matrix multiplication
-
- if (!isRightTransposed && right.getNumColumns() == 1){ // Matrix-Vector multiplication
- sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
- } else { // Matrix-Matrix multiplication
- sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
- }
- }
-
- /**
- * Does a sparse matrix-vector multiply.
- * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1.
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param output allocated output object C to which the GPU output matrix will be attached
- * @param isATranposed if A is to be transposed or not (the op in op(A))
- * @param m number of rows in A (not op(A))
- * @param n number of cols in A (not op(A))
- * @param k number of rows in B, (cols in B is assumed to be 1)
- * @param A left sparse matrix on GPU
- * @param B right sparse vector on GPU
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
- CSRPointer A, CSRPointer B) throws DMLRuntimeException {
- long t0=0;
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
- sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k);
- }
-
- /**
- * Does a sparse-sparse Matrix multiply
- * C = op(A) x op(B), A, B are sparse matrices
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param A left sparse matrix on GPU
- * @param B right sparse matrix on GPU
- * @param output allocated output object on host to which the GPU output matrix will be attached
- * @param isLeftTransposed op for A - to be transposed or not
- * @param isRightTransposed op for B
- * @param m number of rows in op(A)
- * @param n number of cols in op(B)
- * @param k number of cols in op(A) or rows in op(B)
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
- if(LOG.isTraceEnabled()) {
- LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
- }
-
- int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
- int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-
- long t0=0, t1=0;
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
-
- output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-
- if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
- A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
- B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
- C.descr, C.val, C.rowPtr, C.colInd);
- //cudaDeviceSynchronize;
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
- }
-
- /**
- * Dense dense matrix multiply
- * C = op(A) * op(B), A and B are dense matrices
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName name of the invoking instruction to record{@link Statistics}.
- * @param output output object C on host with GPU data allocated
- * @param left left matrix A (in row-major order)
- * @param right right matrix B (in row-major order)
- * @param isLeftTransposed op for A, transposed or not
- * @param isRightTransposed op for B, transposed or not
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
- Pointer leftPtr = getDensePointer(gCtx, left, instName);
- Pointer rightPtr = getDensePointer(gCtx, right, instName);
-
- int leftRows = toInt(left.getNumRows());
- int leftCols = toInt(left.getNumColumns());
- int rightRows = toInt(right.getNumRows());
- int rightCols = toInt(right.getNumColumns());
- Pointer C = getDensePointer(gCtx, output, instName);
- denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed,
- leftPtr, rightPtr);
- }
-
- /**
- * Dense-dense matrix multiply
- * C = op(A) * op(B), A and B are dense matrices
- * On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
- * What we have as input is t(A) and t(B), t(X) = transpose of X.
- * We do t(B) %*% t(A) to get t(C);
- * If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
- * What we really want is t(C). This we already have as the result of t(B) %*% t(A).
- *
- * @param gCtx a valid {@link GPUContext}
- * @param instName name of the invoking instruction to record{@link Statistics}.
- * @param output output allocated on GPU in column major format
- * @param leftRows1 number of rows in A
- * @param leftCols1 number of cols in A
- * @param rightRows1 number of rows in B
- * @param rightCols1 number of cols in B
- * @param isLeftTransposed1 op for A, transposed or not
- * @param isRightTransposed1 op for B, transposed or not
- * @param leftPtr A allocated on the GPU in row-major format
- * @param rightPtr B allocated on the GPU in row-major format
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
- int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
- throws DMLRuntimeException {
- if(LOG.isTraceEnabled()) {
- LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
- }
-
- Pointer A = rightPtr;
- Pointer B = leftPtr;
-
- // To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
- int leftRows = rightCols1;
- int leftCols = rightRows1;
- int rightRows = leftCols1;
- int rightCols = leftRows1;
-
- boolean isLeftTransposed = isRightTransposed1;
- boolean isRightTransposed = isLeftTransposed1;
-
- // Note: the dimensions are swapped
- int m = isLeftTransposed ? leftCols : leftRows ;
- int n = isRightTransposed ? rightRows : rightCols;
- int k = isLeftTransposed ? leftRows : leftCols;
- int k1 = isRightTransposed ? rightCols : rightRows;
- if(k != k1)
- throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
- if(m == -1 || n == -1 || k == -1)
- throw new DMLRuntimeException("Incorrect dimensions");
-
- double[] one = { 1 };
- double[] zero = { 0 };
-
- //int lda = leftRows;
- //int ldb = leftCols;
- int lda = isLeftTransposed ? k : m;
- int ldb = isRightTransposed ? n : k;
- int ldc = m;
-
- int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
- int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
-
- long t0=0;
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- Pointer C = output;
- if (m == 1 && n == 1){
- // Vector product
- LOG.debug(" GPU Dense-dense Vector Product");
- double[] result = {0};
- JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
- // By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
- // This means that scalar values passed are on host (as opposed to on device).
- // The result is copied from the host back to the device so that the rest of
- // infrastructure can treat it uniformly.
- cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
- } else if (m == 1) {
- // Vector-matrix multiply
- LOG.debug(" GPU Dense Vector-Matrix Multiply");
- transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
- JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
- } else if (n == 1){
- // Matrix-vector multiply
- LOG.debug(" GPU Dense Matrix-Vector Multiply");
- JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
- } else {
- LOG.debug(" GPU Dense-Dense Matrix Multiply ");
- JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
- }
- }
-
- //********************************************************************/
- //***************** END OF MATRIX MULTIPLY Functions *****************/
- //********************************************************************/
-
//********************************************************************/
//**************** UNARY AGGREGATE Functions ************************/
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
new file mode 100644
index 0000000..21a2a35
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -0,0 +1,480 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCuMatMult extends LibMatrixCUDA {
+
+ private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
+
+ private static class CuMatMultParameters {
+ /*
+ * For the operation, C = op(A) %*% op(B), the below parameters are used
+ * to invoke the corresponding kernels in CuBLAS and CuSPARSE.
+ *
+ * All the below values have to be valid or else this class has to throw
+ * an exception. No special values like -1 for unknowns allowed.
+ */
+ public int m; // number of rows of matrix op(A) and C.
+ public int n; // number of columns of matrix op(B) and C.
+ public int k; // number of columns of op(A) and rows of op(B).
+ public int lda; // leading dimension of two-dimensional array used to
+ // store the matrix A.
+ public int ldb; // leading dimension of two-dimensional array used to
+ // store matrix B.
+ public int ldc; // leading dimension of a two-dimensional array used to
+ // store the matrix C.
+ public long leftNumRows; // number of rows of A
+ public long leftNumCols; // number of cols of A
+ public long rightNumRows; // number of rows of B
+ public long rightNumCols; // number of cols of B
+ private boolean isLeftTransposed; // is op(A) = t(A)
+ private boolean isRightTransposed; // is op(B) = t(B)
+
+ public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1,
+ boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
+ leftNumRows = leftNumRows1;
+ leftNumCols = leftNumCols1;
+ rightNumRows = rightNumRows1;
+ rightNumCols = rightNumCols1;
+ isLeftTransposed = isLeftTransposed1;
+ isRightTransposed = isRightTransposed1;
+ setDimensions();
+ }
+
+ public void rowToColumnMajor() throws DMLRuntimeException {
+ // To compensate for the input matrices being in row-major format
+ // instead of column-major (the way cublas expects)
+ isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed);
+ rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols);
+ rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows);
+ setDimensions();
+ }
+
+ private void validate() throws DMLRuntimeException {
+ int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows);
+ if (k != k1)
+ throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + ","
+ + leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " "
+ + isRightTransposed);
+ }
+
+ private void setDimensions() throws DMLRuntimeException {
+ // Validate the dimensions
+ m = toInt(isLeftTransposed ? leftNumCols : leftNumRows);
+ n = toInt(isRightTransposed ? rightNumRows : rightNumCols);
+ k = toInt(isLeftTransposed ? leftNumRows : leftNumCols);
+ lda = isLeftTransposed ? k : m;
+ ldb = isRightTransposed ? n : k;
+ ldc = m;
+ if (m == -1 || n == -1 || k == -1)
+ throw new DMLRuntimeException("Incorrect dimensions");
+ }
+ }
+
+ /**
+ * Matrix multiply on GPU Examines sparsity and shapes and routes call to
+ * appropriate method from cuBLAS or cuSparse C = op(A) x op(B)
+ *
+ * The user is expected to call
+ * ec.releaseMatrixOutputForGPUInstruction(outputName);
+ *
+ * @param ec
+ * Current {@link ExecutionContext} instance
+ * @param gCtx
+ * a valid {@link GPUContext}
+ * @param instName
+ * name of the invoking instruction to record{@link Statistics}.
+ * @param left
+ * Matrix A
+ * @param right
+ * Matrix B
+ * @param outputName
+ * Name of the output matrix C (in code generated after LOP
+ * layer)
+ * @param isLeftTransposed
+ * op for A, transposed or not
+ * @param isRightTransposed
+ * op for B, tranposed or not
+ * @throws DMLRuntimeException
+ * if DMLRuntimeException occurs
+ * @return output of matrix multiply
+ */
+ public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left,
+ MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed)
+ throws DMLRuntimeException {
+ boolean isM1Sparse = isInSparseFormat(gCtx, left);
+ boolean isM2Sparse = isInSparseFormat(gCtx, right);
+ MatrixObject output = ec.getMatrixObject(outputName);
+ long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
+ long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
+
+ CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(),
+ right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);
+
+ if (isM1Sparse && isM2Sparse) {
+ // -------------------------------------------------------------------------------------
+ // sparse-sparse matrix multiplication
+ params.validate();
+ int transa = cusparseOp(isLeftTransposed);
+ int transb = cusparseOp(isRightTransposed);
+
+ // Step 1: Allocate output => sparse format
+ ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
+
+ // Step 2: Get the handles to sparse/dense pointers for left, right
+ // and output
+ CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+ CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
+ params.m, params.n, params.k);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+ System.nanoTime() - t0);
+
+ // Step 3: Invoke the kernel
+ long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+ (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
+ C.val, C.rowPtr, C.colInd);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB,
+ System.nanoTime() - t1);
+ output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+ // -------------------------------------------------------------------------------------
+ } else if (!isM1Sparse && isM2Sparse) {
+ // -------------------------------------------------------------------------------------
+ // dense-sparse matrix multiplication
+ // Step 1: Allocate output => dense format
+ getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+ // Step 2: Get the handles to sparse/dense pointers for left, right
+ // and output
+ Pointer A = getDensePointer(gCtx, left, instName);
+ CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+ Pointer C = getDensePointer(gCtx, output, instName);
+
+ // Step 3: Invoke the kernel
+ denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params);
+ // -------------------------------------------------------------------------------------
+ } else if (isM1Sparse && !isM2Sparse) {
+ // -------------------------------------------------------------------------------------
+ // sparse-dense matrix multiplication
+ // Step 1: Allocate output => dense format
+ getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+ // Step 2: Get the handles to sparse/dense pointers for left, right
+ // and output
+ CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+ Pointer B = getDensePointer(gCtx, right, instName);
+ Pointer C = getDensePointer(gCtx, output, instName);
+
+ // Step 3: Invoke the kernel
+ sparseDenseMatMult(gCtx, instName, C, A, B, left.getNumRows(), left.getNumColumns(), right.getNumRows(),
+ right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed);
+ // -------------------------------------------------------------------------------------
+ } else {
+ // -------------------------------------------------------------------------------------
+ // dense-dense matrix multiplication
+ // Step 1: Allocate output => dense format
+ getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+ // Step 2: Get the handles to sparse/dense pointers for left, right
+ // and output
+ Pointer A = getDensePointer(gCtx, left, instName);
+ Pointer B = getDensePointer(gCtx, right, instName);
+ Pointer C = getDensePointer(gCtx, output, instName);
+
+ // Step 3: Invoke the kernel
+ denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params);
+ // -------------------------------------------------------------------------------------
+ }
+ return output;
+ }
+
+ /**
+ * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+ * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+ * allocated in dense row-major format and A is sparse.
+ *
+ * Other than input and output, this method requires additional memory =
+ * outRLen * outCLen * Sizeof.DOUBLE
+ *
+ * @param gCtx
+ * a valid {@link GPUContext}
+ * @param instName
+ * name of the invoking instruction to record{@link Statistics}.
+ * @param C
+ * output matrix pointer
+ * @param A
+ * left matrix pointer
+ * @param B
+ * right matrix pointer
+ * @param leftNumRows
+ * number of rows of A
+ * @param leftNumColumns
+ * number of cols of A
+ * @param rightNumRows
+ * number of rows of B
+ * @param rightNumColumns
+ * number of cols of B
+ * @param outRLen
+ * number of rows of C
+ * @param outCLen
+ * number of cols of C
+ * @param isLeftTransposed
+ * is op(A) = t(A)
+ * @param isRightTransposed
+ * is op(B) = t(B)
+ * @throws DMLRuntimeException
+ * if error
+ */
+ private static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
+ long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
+ boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+ // t(C) = t(B) %*% t(A)
+ Pointer output = null;
+ if (outRLen != 1 && outCLen != 1) {
+ output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+ } else {
+ // no transpose required for vector output
+ output = C;
+ }
+ CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
+ leftNumColumns, !isRightTransposed, !isLeftTransposed);
+ denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
+ if (outRLen != 1 && outCLen != 1) {
+ // Transpose: C = t(output)
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+ toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
+ toInt(outRLen), C, toInt(outCLen));
+ if (!DMLScript.EAGER_CUDA_FREE)
+ JCuda.cudaDeviceSynchronize();
+ gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
+ - t0);
+ }
+ }
+
+ /**
+ * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+ * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+ * allocated in dense row-major format and A is sparse.
+ *
+ * @param handle
+ * cusparse handle
+ * @param instName
+ * name of the invoking instruction to record{@link Statistics}.
+ * @param C
+ * output matrix pointer
+ * @param A
+ * left matrix pointer
+ * @param B
+ * right matrix pointer
+ * @param param
+ * BLAS parameters
+ * @throws DMLRuntimeException
+ * if error
+ */
+ private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B,
+ CuMatMultParameters param) throws DMLRuntimeException {
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB;
+ // Ignoring sparse vector dense matrix multiplication and dot product
+ boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed)
+ || (param.leftNumCols == 1 && param.isLeftTransposed);
+ if (isVector) {
+ LOG.debug(" GPU Sparse-Dense Matrix Vector ");
+ int m = toInt(param.rightNumRows);
+ int n = toInt(param.rightNumCols);
+ int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+ JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+ zero(), C);
+ kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
+ } else {
+ int m = toInt(param.rightNumRows);
+ int k = toInt(param.rightNumCols);
+ param.rowToColumnMajor();
+ param.validate();
+ int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+ int transb = cusparseOp(param.isRightTransposed);
+ LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
+ JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+ B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
+ }
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+ }
+
+ /**
+ * Internal method to invoke the appropriate CuBLAS kernel for matrix
+ * multiplication for operation: C = op(A) * op(B) This assumes A, B and C
+ * are allocated in dense format. The caller is expected to invoke
+ * params.rowToColumnMajor().
+ *
+ * @param handle
+ * cublas handle
+ * @param instName
+ * name of the invoking instruction to record{@link Statistics}.
+ * @param C
+ * output matrix pointer
+ * @param A
+ * left matrix pointer
+ * @param B
+ * right matrix pointer
+ * @param param
+ * BLAS parameters
+ * @throws DMLRuntimeException
+ * if error
+ */
+ private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B,
+ CuMatMultParameters param) throws DMLRuntimeException {
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ String kernel = null;
+ param.rowToColumnMajor();
+ param.validate();
+ int transa = cublasOp(param.isLeftTransposed);
+ int transb = cublasOp(param.isRightTransposed);
+ B = swap(A, A = B);
+ if (param.m == 1 && param.n == 1) {
+ // Vector product
+ LOG.debug(" GPU Dense-dense Vector Product");
+ double[] result = { 0 };
+ JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+ // By default in CuBlas V2, cublas pointer mode is set to
+ // CUBLAS_POINTER_MODE_HOST.
+ // This means that scalar values passed are on host (as opposed to
+ // on device).
+ // The result is copied from the host back to the device so that the
+ // rest of
+ // infrastructure can treat it uniformly.
+ cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+ kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
+ } else if (param.m == 1) {
+ // Vector-matrix multiply
+ LOG.debug(" GPU Dense Vector-Matrix Multiply");
+ transb = reverseCublasOp(transb);
+ int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
+ int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
+ JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+ kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
+ } else if (param.n == 1) {
+ // Matrix-vector multiply
+ LOG.debug(" GPU Dense Matrix-Vector Multiply");
+ int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
+ int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
+ JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+ kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
+ } else {
+ LOG.debug(" GPU Dense-Dense Matrix Multiply ");
+ JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+ zero(), C, param.ldc);
+ kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
+ }
+ if (GPUStatistics.DISPLAY_STATISTICS)
+ GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+ }
+
+ // Convenient methods to swap two values
+ // Usage: y = swap(x, x=y);
+ private static long swap(long x, long y) {
+ return x;
+ }
+
+ private static boolean swap(boolean x, boolean y) {
+ return x;
+ }
+
+ private static Pointer swap(Pointer x, Pointer y) {
+ return x;
+ }
+
+ /**
+ * Convenient wrapper to return appropriate cuSPARSE trans value
+ *
+ * @param isTransposed
+ * is op(input) = t(input)
+ * @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE
+ */
+ private static int cusparseOp(boolean isTransposed) {
+ return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+ }
+
+ /**
+ * Convenient wrapper to return appropriate cuBLAS trans value
+ *
+ * @param isTransposed
+ * is op(input) = t(input)
+ * @return CUBLAS_OP_T or CUBLAS_OP_N
+ */
+ private static int cublasOp(boolean isTransposed) {
+ return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
+ }
+
+ /**
+ * Flips the cuBLAS trans value
+ *
+ * @param trans
+ * can be CUBLAS_OP_T or CUBLAS_OP_N
+ * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T
+ */
+ private static int reverseCublasOp(int trans) {
+ return trans == cublasOperation.CUBLAS_OP_T ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
+ }
+
+ /**
+ * Flips the cuSPARSE trans value
+ *
+ * @param trans
+ * can be CUSPARSE_OPERATION_NON_TRANSPOSE or
+ * CUSPARSE_OPERATION_TRANSPOSE
+ * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is
+ * CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE
+ */
+ private static int reverseCusparseOp(int trans) {
+ return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
+ }
+}
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index 81bc254..d983716 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,81 @@ public class MatrixMultiplicationOpTest extends GPUTests {
public void matrixMatrixTest1() {
String scriptStr = "O = X %*% Y";
- int[] X1 = { 1, 128, 513, 1024 };
- int[] X2 = { 128, 512, 1024 };
- int[] Y2 = { 1, 128, 513, 1024 };
+ int[] X1 = { 1, 128, 1024 };
+ int[] X2 = { 1, 128, 1024 };
+ int[] Y2 = { 1, 128, 1024 };
+ double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
+ for (int x2 = 0; x2 < X2.length; x2++) {
+ int y1 = x2;
+ for (int y2 = 0; y2 < Y2.length; y2++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void commonCaseMLMatrixMatrixTest1() {
+ String scriptStr = "O = X %*% Y";
+
+ int[] X1 = { 1000000 };
+ int[] X2 = { 1000 };
+ int[] Y2 = { 1, 20 };
+ double[] SX = { 0.0, 0.03, 0.3 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
+ for (int x2 = 0; x2 < X2.length; x2++) {
+ int y1 = x2;
+ for (int y2 = 0; y2 < Y2.length; y2++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void commonCaseDLMatrixMatrixTest1() {
+ String scriptStr = "O = X %*% Y";
+
+ int[] X1 = { 100 };
+ int[] X2 = { 600, 900 };
+ int[] Y2 = { 205800 };
+ double[] SX = { 0.0, 0.03, 0.3 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
+ for (int x2 = 0; x2 < X2.length; x2++) {
+ int y1 = x2;
+ for (int y2 = 0; y2 < Y2.length; y2++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void commonCaseDLMatrixMatrixTest2() {
+ String scriptStr = "O = X %*% Y";
+
+ int[] X1 = { 64 };
+ int[] X2 = { 196608 };
+ int[] Y2 = { 512 };
double[] SX = { 0.0, 0.03, 0.3, 0.9 };
double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -74,9 +146,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
public void matrixMatrixTest2() {
String scriptStr = "O = X %*% t(Y)";
- int[] X1 = { 1, 128, 513, 1024 };
- int[] X2 = { 128, 512, 1024 };
- int[] Y1 = { 1, 128, 513, 1024 };
+ int[] X1 = { 1, 128, 1024 };
+ int[] X2 = { 1, 128, 1024 };
+ int[] Y1 = { 1, 128, 1024 };
double[] SX = { 0.0, 0.03, 0.3, 0.9 };
double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -98,9 +170,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
public void matrixMatrixTest3() {
String scriptStr = "O = t(X) %*% Y";
- int[] X1 = { 1, 128, 513, 1024 };
- int[] X2 = { 128, 512, 1024 };
- int[] Y2 = { 1, 128, 513, 1024 };
+ int[] X1 = { 1, 128, 1024 };
+ int[] X2 = { 1, 128, 1024 };
+ int[] Y2 = { 1, 128, 1024 };
double[] SX = { 0.0, 0.03, 0.3, 0.9 };
double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -122,9 +194,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
public void matrixMatrixTest4() {
String scriptStr = "O = t(X) %*% t(Y)";
- int[] X1 = { 1, 128, 513, 1024 };
- int[] X2 = { 128, 512, 1024 };
- int[] Y1 = { 1, 128, 513, 1024 };
+ int[] X1 = { 1, 128, 1024 };
+ int[] X2 = { 1, 128, 1024 };
+ int[] Y1 = { 1, 128, 1024 };
double[] SX = { 0.0, 0.03, 0.3, 0.9 };
double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -146,7 +218,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
public void transposeSelfMatrixMultiply() {
String scriptStr = "O = t(X) %*% X";
- int[] sizes = { 1, 128, 512, 1024, 2049 };
+ int[] sizes = { 1, 128, 1024 };
double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
for (int i = 0; i < sizes.length; i++) {