You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/20 20:29:38 UTC
systemml git commit: [SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion

Repository: systemml
Updated Branches:
  refs/heads/master 323dd72a8 -> 6de8f051d


[SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion

1. The goal of this PR is not to improve performance (for example: by considering the cost of sparse-to-dense vs FLOPs required given a memory budget) but instead to minimize sparse-to-dense conversion in the GPU matrix multiplication operator.

2. If matmult uses unnecessary sparse-to-dense conversion, then we run into
  risk of one of the two situations:
- In best case some of the matmult won't be pushed to GPU under worst-case memory budget.
- On other hand, if these conversion are not accounted for, they may cause OOMs.

3. Every operator (except dense-sparse matrix multiplication) uses only memory allocated to input and output matrices.

4. Since there is no CuSPARSE kernel for dense-sparse matrix multiplication operator, we either have to transpose the output after performing sparse-dense matrix multiplication or perform dense-dense matrix multiplication after converting the sparse input to dense format.

Closes #686.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6de8f051
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6de8f051
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6de8f051

Branch: refs/heads/master
Commit: 6de8f051daaefdab403c0edd3c7beb30c9619033
Parents: 323dd72
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Oct 20 12:29:20 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Oct 20 13:29:20 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  30 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |   3 +-
 .../instructions/gpu/GPUInstruction.java        |   1 +
 .../instructions/gpu/context/CSRPointer.java    |   7 +-
 .../instructions/gpu/context/GPUObject.java     |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 541 +------------------
 .../runtime/matrix/data/LibMatrixCuMatMult.java | 480 ++++++++++++++++
 .../test/gpu/MatrixMultiplicationOpTest.java    |  98 +++-
 8 files changed, 581 insertions(+), 581 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index cfa99a4..d733d6a 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -371,40 +371,16 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 		double ret = 0;
 
 		if (isGPUEnabled()) {
-			// In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse
-			// When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2)
-			// When dense block is converted to sparse and a sparse MM takes place, we need (dim1 * dim2 * 2)
-
 			Hop in1 = _input.get(0);
 			Hop in2 = _input.get(1);
 			double in1Sparsity = OptimizerUtils.getSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
 			double in2Sparsity = OptimizerUtils.getSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-
 			boolean in1Sparse = in1Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
 			boolean in2Sparse = in2Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
-
-			boolean in1UltraSparse = in1Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-			boolean in2UltraSparse = in2Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-
-			// For Matmult X * Y, if X is sparse, Y is dense, X is converted to dense
-			// If X is ultrasparse, Y is converted to sparse
-			if (in1Sparse ^ in2Sparse) { // one sparse, one dense
-				if (in1Sparse) {
-					if (in1UltraSparse) {
-						ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-					} else {
-						ret += OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
-					}
-				} else if (in2Sparse) {
-					if (in2UltraSparse) {
-						ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
-					} else {
-						ret += OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-					}
-				}
-
+			if(in1Sparse && !in2Sparse) {
+				// Only in sparse-dense cases, we need additional memory budget for GPU
+				ret += OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0);
 			}
-
 		}
 
 		//account for potential final dense-sparse transformation (worst-case sparse representation)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index dfc000f..29a1ead 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
@@ -94,7 +95,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction {
 		int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
 
 		ec.setMetaData(_output.getName(), rlen, clen);
-		LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+		LibMatrixCuMatMult.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
 		
 		//release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ed2a4a2..9dd163d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -78,6 +78,7 @@ public abstract class GPUInstruction extends Instruction {
 	public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = 	"Mdmdm";	// time spent in matrix mult of dense matrices
 	public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = 	"Msmdv";	// time spent in matrix mult of sparse matrix and dense vector
 	public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm";  // time spent in matrix mult of sparse matrices
+	public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = "Msmdm";  // time spent in matrix mult of sparse matrices
 	public final static String MISC_TIMER_SYRK_LIB = 												"Msyrk"; 	// time spent in symmetric rank-k update
 
 	// Other BLAS instructions

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 5a6e21c..7176a9c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -37,7 +37,9 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
 import jcuda.Sizeof;
@@ -495,11 +497,13 @@ public class CSRPointer {
 	 * @param cublasHandle   a valid {@link cublasHandle}
 	 * @param rows           number of rows in this CSR matrix
 	 * @param cols           number of columns in this CSR matrix
+	 * @param instName          name of the invoking instruction to record{@link Statistics}.
 	 * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
-			int cols) throws DMLRuntimeException {
+			int cols, String instName) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
 		long size = ((long) rows) * getDoubleSizeOf((long) cols);
@@ -512,6 +516,7 @@ public class CSRPointer {
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
 		}
+		if (GPUStatistics.DISPLAY_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
 		return A;
 	}
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index feb34bc..06327db 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -458,7 +458,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Expected cusparse to be initialized");
 		int rows = toIntExact(mat.getNumRows());
 		int cols = toIntExact(mat.getNumColumns());
-		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
+		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index f4a00ab..7e25299 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -22,18 +22,11 @@ package org.apache.sysml.runtime.matrix.data;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
 import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -175,11 +168,11 @@ public class LibMatrixCUDA {
 	}
 
 
-	private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
+	protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
 		return gCtx.getCusparseHandle();
 	}
 
-	private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
+	protected static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getCublasHandle();
 	}
 
@@ -410,7 +403,7 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		if(isInSparseFormat(gCtx, left)) {
 			// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
-			matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
+			LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
 			return;
 		}
 
@@ -481,534 +474,6 @@ public class LibMatrixCUDA {
 	//******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
 	//********************************************************************/
 
-	//********************************************************************/
-	//***************** MATRIX MULTIPLY Functions ************************/
-	//********************************************************************/
-
-	/**
-	 * Matrix multiply on GPU
-	 * Examines sparsity and shapes and routes call to appropriate method
-	 * from cuBLAS or cuSparse
-	 * C = op(A) x op(B)
-	 * <p>
-	 * Memory Requirements -
-	 * Both dense - inputs, output, no intermediate
-	 * Both sparse - inputs, output, no intermediate
-	 * One sparse, one dense - inputs, output, intermediates - (input_dim1 * input_dim2) OR (input_dim1 * input_dim2 + input in sparse format)
-	 *
-	 * @param ec                Current {@link ExecutionContext} instance
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param left              Matrix A
-	 * @param right             Matrix B
-	 * @param outputName        Name of the output matrix C (in code generated after LOP layer)
-	 * @param isLeftTransposed  op for A, transposed or not
-	 * @param isRightTransposed op for B, tranposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 * @return output of matrix multiply
-	 */
-	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-		if (ec.getGPUContext(0) != gCtx)
-			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
-		}
-		if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated())
-			throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
-
-		boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse();
-		boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
-
-		MatrixObject output = ec.getMatrixObject(outputName);
-
-		long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
-		long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
-
-		if (bothDense) {		// Dense C = Dense A * Dense B
-			// For both dense, do cuBLAS
-			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix
-			denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-		else if (bothSparse){	// Sparse C = Sparse A * Sparse B
-			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-			bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-		else {	// Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
-			// Convert the dense to sparse and use the cusparseDcsrgemm routine
-			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-			eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-
-		return output;
-	}
-
-	/**
-	 * One of the matrices is sparse, the other dense
-	 * C = op(A) x op(B)
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            allocated output object for C on host to which GPU output will be attached
-	 * @param left              Matrix A on host
-	 * @param right             Matrix B on host
-	 * @param isLeftTransposed  op for A, tranposed or not
-	 * @param isRightTransposed op for B, transposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
-		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
-		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
-		int k = toInt(isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
-		int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-
-		if (left.getGPUObject(gCtx).isSparse()) {
-			// Left sparse, right dense
-			sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
-		} else {
-			// Left dense, right sparse
-			denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
-		}
-	}
-
-	/**
-	 * C = op(A) * op(B) where A is dense and B is sparse
-	 * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
-	 * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param left              {@link MatrixObject} of A
-	 * @param right             {@link MatrixObject} of B
-	 * @param output            {@link MatrixObject} of the output matrix C
-	 * @param isLeftTransposed  whether matrix A needs to be transposed
-	 * @param isRightTransposed whether matrix B needs to be transposed
-	 * @param m                 ?
-	 * @param n                 ?
-	 * @param k                 ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
-					throws DMLRuntimeException {
-		// right sparse, left dense
-		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		Pointer ADense = getDensePointer(gCtx, left, instName);
-		if (B.isUltraSparse(k, n)){
-			if(LOG.isTraceEnabled()) {
-				LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
-			}
-
-			// Convert left to CSR and do cuSparse matmul
-			int rowsA = (int)left.getNumRows();
-			int colsA = (int)left.getNumColumns();
-
-			long t0=0,t1=0, t2=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
-			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			A.deallocate();
-			gCtx.cudaFreeHelper(AT);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-		} else {
-			if(LOG.isTraceEnabled()) {
-				LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
-			}
-			// Convert right to dense and do a cuBlas matmul
-			// BDenseTransposed is a column major matrix
-			// Note the arguments to denseDenseMatmult to accommodate for this.
-			long t0=0, t1=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
-			if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-			Pointer C = getDensePointer(gCtx, output, instName);
-			denseDenseMatmult(gCtx, instName, C,
-					toInt(left.getNumRows()), toInt(left.getNumColumns()),
-					toInt(right.getNumColumns()), toInt(right.getNumRows()),
-					isLeftTransposed, !isRightTransposed,
-					ADense, BDenseTransposed);
-
-			gCtx.cudaFreeHelper(instName, BDenseTransposed);
-		}
-	}
-
-	/**
-	 * * C = op(A) * op(B) where A is sparse and B is dense
-	 * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
-	 * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            the output matrix object
-	 * @param left              matrix A
-	 * @param right             matrix B
-	 * @param isLeftTransposed  if A needs to be transposed
-	 * @param isRightTransposed if B needs to be transposed
-	 * @param m                 ?
-	 * @param n                 ?
-	 * @param k                 ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
-					throws DMLRuntimeException {
-		CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		Pointer BDense = getDensePointer(gCtx, right, instName);
-
-		if (n == 1){
-			// Sparse Matrix - Dense Vector multiply
-			sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns());
-
-		} else {
-
-			long t0=0, t1=0, t2=0;
-			// Sparse Matrix Dense Matrix multiply
-			if (A.isUltraSparse(m, k)){
-				if(LOG.isTraceEnabled()) {
-					LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
-				}
-				// Convert right to CSR and do cuSparse matmul
-				int rowsB = (int)right.getNumRows();
-				int colsB = (int)right.getNumColumns();
-
-				if (DMLScript.STATISTICS) t0 = System.nanoTime();
-				Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-				CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
-
-				sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-				B.deallocate();
-				gCtx.cudaFreeHelper(BT);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-			} else {
-				if(LOG.isTraceEnabled()) {
-					LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
-				}
-				// Convert left to dense and do a cuBlas matmul
-				// ADenseTransposed is a column major matrix
-				// Note the arguments to denseDenseMatmult to accommodate for this.
-				if (DMLScript.STATISTICS) t0 = System.nanoTime();
-				Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-				boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
-				if (allocated && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-
-				Pointer C = getDensePointer(gCtx, output, instName);
-				denseDenseMatmult(gCtx, instName, C,
-						toInt(left.getNumColumns()), toInt(left.getNumRows()),
-						toInt(right.getNumRows()), toInt(right.getNumColumns()),
-						!isLeftTransposed, isRightTransposed,
-						ADenseTransposed, BDense);
-
-				gCtx.cudaFreeHelper(instName, ADenseTransposed);
-			}
-		}
-	}
-
-	/**
-	 * C = op(A) x B
-	 * A is a sparse matrix, B is a dense vector
-	 *
-	 * @param gCtx         a valid {@link GPUContext}
-	 * @param instName     the invoking instruction's name for record {@link Statistics}.
-	 * @param output       allocated output on the host, to which the GPU output C will be attached
-	 * @param A            sparse matrix A on the GPU
-	 * @param B_dense      dense matrix/vector B on the GPU
-	 * @param isATranposed op for A, tranposed or not
-	 * @param m            number of rows in A (not op(A))
-	 * @param k            number of cols in A or number of rows in B (not op(A) or op(B))
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed,
-			int m, int k) throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
-		}
-		int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-		long size = m * Sizeof.DOUBLE;
-		if (isATranposed){
-			size = k * Sizeof.DOUBLE;
-			transA = CUSPARSE_OPERATION_TRANSPOSE;
-		}
-		Pointer C_dense = gCtx.allocate(instName, (int)size);
-		long t1=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
-		//cudaDeviceSynchronize; 	// Since cusparseDcsrmv is asynchronously executed
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
-
-		output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
-	}
-
-	/**
-	 * Sparse C = Sparse op(A) * Sparse op(B)
-	 * Reroutes call to sparse matrix-vector mult if needed
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            ?
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param left              ?
-	 * @param right             ?
-	 * @param isLeftTransposed  ?
-	 * @param isRightTransposed ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
-		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
-		int k = toInt(isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
-		int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-		CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-
-		// TODO if (m == 1) {	// Vector-matrix multiplication
-
-		if (!isRightTransposed && right.getNumColumns() == 1){ 	// Matrix-Vector multiplication
-			sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
-		} else {												// Matrix-Matrix multiplication
-			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-		}
-	}
-
-	/**
-	 * Does a sparse matrix-vector multiply.
-	 * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1.
-	 *
-	 * @param gCtx         a valid {@link GPUContext}
-	 * @param instName     the invoking instruction's name for record {@link Statistics}.
-	 * @param output       allocated output object C to which the GPU output matrix will be attached
-	 * @param isATranposed if A is to be transposed or not (the op in op(A))
-	 * @param m            number of rows in A (not op(A))
-	 * @param n            number of cols in A (not op(A))
-	 * @param k            number of rows in B, (cols in B is assumed to be 1)
-	 * @param A            left sparse matrix on GPU
-	 * @param B            right sparse vector on GPU
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
-			CSRPointer A, CSRPointer B) throws DMLRuntimeException {
-		long t0=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-		sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k);
-	}
-
-	/**
-	 * Does a sparse-sparse Matrix multiply
-	 * C = op(A) x op(B), A, B are sparse matrices
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param A                 left sparse matrix on GPU
-	 * @param B                 right sparse matrix on GPU
-	 * @param output            allocated output object on host to which the GPU output matrix will be attached
-	 * @param isLeftTransposed  op for A - to be transposed or not
-	 * @param isRightTransposed op for B
-	 * @param m                 number of rows in op(A)
-	 * @param n                 number of cols in op(B)
-	 * @param k                 number of cols in op(A) or rows in op(B)
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
-		}
-
-		int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-		int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-
-		long t0=0, t1=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
-
-		output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-
-		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
-				A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
-				B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
-				C.descr, C.val, C.rowPtr, C.colInd);
-		//cudaDeviceSynchronize;
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
-	}
-
-	/**
-	 * Dense dense matrix multiply
-	 * C = op(A) * op(B), A and B are dense matrices
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param output            output object C on host with GPU data allocated
-	 * @param left              left matrix A (in row-major order)
-	 * @param right             right matrix B (in row-major order)
-	 * @param isLeftTransposed  op for A, transposed or not
-	 * @param isRightTransposed op for B, transposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
-		Pointer leftPtr = getDensePointer(gCtx, left, instName);
-		Pointer rightPtr = getDensePointer(gCtx, right, instName);
-
-		int leftRows = toInt(left.getNumRows());
-		int leftCols = toInt(left.getNumColumns());
-		int rightRows = toInt(right.getNumRows());
-		int rightCols = toInt(right.getNumColumns());
-		Pointer C = getDensePointer(gCtx, output, instName);
-		denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed,
-				leftPtr, rightPtr);
-	}
-
-	/**
-	 * Dense-dense matrix multiply
-	 * C = op(A) * op(B), A and B are dense matrices
-	 * On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
-	 * What we have as input is t(A) and t(B), t(X) = transpose of X.
-	 * We do t(B) %*% t(A) to get t(C);
-	 * If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
-	 * What we really want is t(C). This we already have as the result of t(B) %*% t(A).
-	 *
-	 * @param gCtx               a valid {@link GPUContext}
-	 * @param instName           name of the invoking instruction to record{@link Statistics}.
-	 * @param output             output allocated on GPU in column major format
-	 * @param leftRows1          number of rows in A
-	 * @param leftCols1          number of cols in A
-	 * @param rightRows1         number of rows in B
-	 * @param rightCols1         number of cols in B
-	 * @param isLeftTransposed1  op for A, transposed or not
-	 * @param isRightTransposed1 op for B, transposed or not
-	 * @param leftPtr            A allocated on the GPU in row-major format
-	 * @param rightPtr           B allocated on the GPU in row-major format
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
-			int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
-					throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
-		}
-
-		Pointer A = rightPtr;
-		Pointer B = leftPtr;
-
-		// To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
-		int leftRows = rightCols1;
-		int leftCols = rightRows1;
-		int rightRows = leftCols1;
-		int rightCols = leftRows1;
-
-		boolean isLeftTransposed = isRightTransposed1;
-		boolean isRightTransposed = isLeftTransposed1;
-
-		// Note: the dimensions are swapped
-		int m = isLeftTransposed ? leftCols : leftRows ;
-		int n = isRightTransposed ? rightRows : rightCols;
-		int k = isLeftTransposed ?  leftRows : leftCols;
-		int k1 = isRightTransposed ?  rightCols : rightRows;
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-		double[] one = { 1 };
-		double[] zero = { 0 };
-
-		//int lda = leftRows;
-		//int ldb = leftCols;
-		int lda = isLeftTransposed ?  k : m;
-		int ldb = isRightTransposed ? n : k;
-		int ldc = m;
-
-		int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
-		int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
-
-		long t0=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		Pointer C = output;
-		if (m == 1 && n == 1){
-			// Vector product
-			LOG.debug(" GPU Dense-dense Vector Product");
-			double[] result = {0};
-			JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
-			// By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
-			// This means that scalar values passed are on host (as opposed to on device).
-			// The result is copied from the host back to the device so that the rest of
-			// infrastructure can treat it uniformly.
-			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
-		} else if (m == 1) {
-			// Vector-matrix multiply
-			LOG.debug(" GPU Dense Vector-Matrix Multiply");
-			transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
-			JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
-		} else if (n == 1){
-			// Matrix-vector multiply
-			LOG.debug(" GPU Dense Matrix-Vector Multiply");
-			JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
-		} else {
-			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-			JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
-		}
-	}
-
-	//********************************************************************/
-	//***************** END OF MATRIX MULTIPLY Functions *****************/
-	//********************************************************************/
-
 
 	//********************************************************************/
 	//****************  UNARY AGGREGATE Functions ************************/

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
new file mode 100644
index 0000000..21a2a35
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -0,0 +1,480 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCuMatMult extends LibMatrixCUDA {
+
+	private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
+
+	private static class CuMatMultParameters {
+		/*
+		 * For the operation, C = op(A) %*% op(B), the below parameters are used
+		 * to invoke the corresponding kernels in CuBLAS and CuSPARSE.
+		 * 
+		 * All the below values have to be valid or else this class has to throw
+		 * an exception. No special values like -1 for unknowns allowed.
+		 */
+		public int m; // number of rows of matrix op(A) and C.
+		public int n; // number of columns of matrix op(B) and C.
+		public int k; // number of columns of op(A) and rows of op(B).
+		public int lda; // leading dimension of two-dimensional array used to
+						// store the matrix A.
+		public int ldb; // leading dimension of two-dimensional array used to
+						// store matrix B.
+		public int ldc; // leading dimension of a two-dimensional array used to
+						// store the matrix C.
+		public long leftNumRows; // number of rows of A
+		public long leftNumCols; // number of cols of A
+		public long rightNumRows; // number of rows of B
+		public long rightNumCols; // number of cols of B
+		private boolean isLeftTransposed; // is op(A) = t(A)
+		private boolean isRightTransposed; // is op(B) = t(B)
+
+		public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1,
+				boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
+			leftNumRows = leftNumRows1;
+			leftNumCols = leftNumCols1;
+			rightNumRows = rightNumRows1;
+			rightNumCols = rightNumCols1;
+			isLeftTransposed = isLeftTransposed1;
+			isRightTransposed = isRightTransposed1;
+			setDimensions();
+		}
+
+		public void rowToColumnMajor() throws DMLRuntimeException {
+			// To compensate for the input matrices being in row-major format
+			// instead of column-major (the way cublas expects)
+			isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed);
+			rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols);
+			rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows);
+			setDimensions();
+		}
+
+		private void validate() throws DMLRuntimeException {
+			int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows);
+			if (k != k1)
+				throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + ","
+						+ leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " "
+						+ isRightTransposed);
+		}
+
+		private void setDimensions() throws DMLRuntimeException {
+			// Validate the dimensions
+			m = toInt(isLeftTransposed ? leftNumCols : leftNumRows);
+			n = toInt(isRightTransposed ? rightNumRows : rightNumCols);
+			k = toInt(isLeftTransposed ? leftNumRows : leftNumCols);
+			lda = isLeftTransposed ? k : m;
+			ldb = isRightTransposed ? n : k;
+			ldc = m;
+			if (m == -1 || n == -1 || k == -1)
+				throw new DMLRuntimeException("Incorrect dimensions");
+		}
+	}
+
+	/**
+	 * Matrix multiply on GPU Examines sparsity and shapes and routes call to
+	 * appropriate method from cuBLAS or cuSparse C = op(A) x op(B)
+	 *
+	 * The user is expected to call
+	 * ec.releaseMatrixOutputForGPUInstruction(outputName);
+	 *
+	 * @param ec
+	 *            Current {@link ExecutionContext} instance
+	 * @param gCtx
+	 *            a valid {@link GPUContext}
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param left
+	 *            Matrix A
+	 * @param right
+	 *            Matrix B
+	 * @param outputName
+	 *            Name of the output matrix C (in code generated after LOP
+	 *            layer)
+	 * @param isLeftTransposed
+	 *            op for A, transposed or not
+	 * @param isRightTransposed
+	 *            op for B, tranposed or not
+	 * @throws DMLRuntimeException
+	 *             if DMLRuntimeException occurs
+	 * @return output of matrix multiply
+	 */
+	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left,
+			MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed)
+			throws DMLRuntimeException {
+		boolean isM1Sparse = isInSparseFormat(gCtx, left);
+		boolean isM2Sparse = isInSparseFormat(gCtx, right);
+		MatrixObject output = ec.getMatrixObject(outputName);
+		long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
+		long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
+
+		CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(),
+				right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);
+
+		if (isM1Sparse && isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// sparse-sparse matrix multiplication
+			params.validate();
+			int transa = cusparseOp(isLeftTransposed);
+			int transb = cusparseOp(isRightTransposed);
+
+			// Step 1: Allocate output => sparse format
+			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
+					params.m, params.n, params.k);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+						System.nanoTime() - t0);
+
+			// Step 3: Invoke the kernel
+			long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
+					C.val, C.rowPtr, C.colInd);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB,
+						System.nanoTime() - t1);
+			output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+			// -------------------------------------------------------------------------------------
+		} else if (!isM1Sparse && isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// dense-sparse matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			Pointer A = getDensePointer(gCtx, left, instName);
+			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params);
+			// -------------------------------------------------------------------------------------
+		} else if (isM1Sparse && !isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// sparse-dense matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			Pointer B = getDensePointer(gCtx, right, instName);
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			sparseDenseMatMult(gCtx, instName, C, A, B, left.getNumRows(), left.getNumColumns(), right.getNumRows(),
+					right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed);
+			// -------------------------------------------------------------------------------------
+		} else {
+			// -------------------------------------------------------------------------------------
+			// dense-dense matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			Pointer A = getDensePointer(gCtx, left, instName);
+			Pointer B = getDensePointer(gCtx, right, instName);
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params);
+			// -------------------------------------------------------------------------------------
+		}
+		return output;
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+	 * allocated in dense row-major format and A is sparse.
+	 * 
+	 * Other than input and output, this method requires additional memory =
+	 * outRLen * outCLen * Sizeof.DOUBLE
+	 * 
+	 * @param gCtx
+	 *            a valid {@link GPUContext}
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param leftNumRows
+	 *            number of rows of A
+	 * @param leftNumColumns
+	 *            number of cols of A
+	 * @param rightNumRows
+	 *            number of rows of B
+	 * @param rightNumColumns
+	 *            number of cols of B
+	 * @param outRLen
+	 *            number of rows of C
+	 * @param outCLen
+	 *            number of cols of C
+	 * @param isLeftTransposed
+	 *            is op(A) = t(A)
+	 * @param isRightTransposed
+	 *            is op(B) = t(B)
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
+			long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+		// t(C) = t(B) %*% t(A)
+		Pointer output = null;
+		if (outRLen != 1 && outCLen != 1) {
+			output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+		} else {
+			// no transpose required for vector output
+			output = C;
+		}
+		CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
+				leftNumColumns, !isRightTransposed, !isLeftTransposed);
+		denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
+		if (outRLen != 1 && outCLen != 1) {
+			// Transpose: C = t(output)
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+					toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
+					toInt(outRLen), C, toInt(outCLen));
+			if (!DMLScript.EAGER_CUDA_FREE)
+				JCuda.cudaDeviceSynchronize();
+			gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
+						- t0);
+		}
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+	 * allocated in dense row-major format and A is sparse.
+	 * 
+	 * @param handle
+	 *            cusparse handle
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param param
+	 *            BLAS parameters
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B,
+			CuMatMultParameters param) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB;
+		// Ignoring sparse vector dense matrix multiplication and dot product
+		boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed)
+				|| (param.leftNumCols == 1 && param.isLeftTransposed);
+		if (isVector) {
+			LOG.debug(" GPU Sparse-Dense Matrix Vector ");
+			int m = toInt(param.rightNumRows);
+			int n = toInt(param.rightNumCols);
+			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+			JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+					zero(), C);
+			kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
+		} else {
+			int m = toInt(param.rightNumRows);
+			int k = toInt(param.rightNumCols);
+			param.rowToColumnMajor();
+			param.validate();
+			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+			int transb = cusparseOp(param.isRightTransposed);
+			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
+			JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
+		}
+		if (GPUStatistics.DISPLAY_STATISTICS)
+			GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuBLAS kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes A, B and C
+	 * are allocated in dense format. The caller is expected to invoke
+	 * params.rowToColumnMajor().
+	 * 
+	 * @param handle
+	 *            cublas handle
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param param
+	 *            BLAS parameters
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B,
+			CuMatMultParameters param) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		String kernel = null;
+		param.rowToColumnMajor();
+		param.validate();
+		int transa = cublasOp(param.isLeftTransposed);
+		int transb = cublasOp(param.isRightTransposed);
+		B = swap(A, A = B);
+		if (param.m == 1 && param.n == 1) {
+			// Vector product
+			LOG.debug(" GPU Dense-dense Vector Product");
+			double[] result = { 0 };
+			JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+			// By default in CuBlas V2, cublas pointer mode is set to
+			// CUBLAS_POINTER_MODE_HOST.
+			// This means that scalar values passed are on host (as opposed to
+			// on device).
+			// The result is copied from the host back to the device so that the
+			// rest of
+			// infrastructure can treat it uniformly.
+			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
+		} else if (param.m == 1) {
+			// Vector-matrix multiply
+			LOG.debug(" GPU Dense Vector-Matrix Multiply");
+			transb = reverseCublasOp(transb);
+			int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
+			int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
+			JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
+		} else if (param.n == 1) {
+			// Matrix-vector multiply
+			LOG.debug(" GPU Dense Matrix-Vector Multiply");
+			int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
+			int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
+			JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
+		} else {
+			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
+			JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+					zero(), C, param.ldc);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
+		}
+		if (GPUStatistics.DISPLAY_STATISTICS)
+			GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+	}
+
+	// Convenient methods to swap two values
+	// Usage: y = swap(x, x=y);
+	private static long swap(long x, long y) {
+		return x;
+	}
+
+	private static boolean swap(boolean x, boolean y) {
+		return x;
+	}
+
+	private static Pointer swap(Pointer x, Pointer y) {
+		return x;
+	}
+
+	/**
+	 * Convenient wrapper to return appropriate cuSPARSE trans value
+	 * 
+	 * @param isTransposed
+	 *            is op(input) = t(input)
+	 * @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE
+	 */
+	private static int cusparseOp(boolean isTransposed) {
+		return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+	}
+
+	/**
+	 * Convenient wrapper to return appropriate cuBLAS trans value
+	 * 
+	 * @param isTransposed
+	 *            is op(input) = t(input)
+	 * @return CUBLAS_OP_T or CUBLAS_OP_N
+	 */
+	private static int cublasOp(boolean isTransposed) {
+		return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
+	}
+
+	/**
+	 * Flips the cuBLAS trans value
+	 * 
+	 * @param trans
+	 *            can be CUBLAS_OP_T or CUBLAS_OP_N
+	 * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T
+	 */
+	private static int reverseCublasOp(int trans) {
+		return trans == cublasOperation.CUBLAS_OP_T ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
+	}
+
+	/**
+	 * Flips the cuSPARSE trans value
+	 * 
+	 * @param trans
+	 *            can be CUSPARSE_OPERATION_NON_TRANSPOSE or
+	 *            CUSPARSE_OPERATION_TRANSPOSE
+	 * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is
+	 *         CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE
+	 */
+	private static int reverseCusparseOp(int trans) {
+		return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index 81bc254..d983716 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,81 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y2 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y2 = { 1, 128, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseMLMatrixMatrixTest1() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 1000000 };
+		int[] X2 = { 1000 };
+		int[] Y2 = { 1, 20 };
+		double[] SX = { 0.0, 0.03, 0.3 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseDLMatrixMatrixTest1() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 100 };
+		int[] X2 = { 600, 900  };
+		int[] Y2 = { 205800 };
+		double[] SX = { 0.0, 0.03, 0.3 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseDLMatrixMatrixTest2() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 64 };
+		int[] X2 = { 196608   };
+		int[] Y2 = { 512 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,9 +146,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest2() {
 		String scriptStr = "O = X %*% t(Y)";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y1 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y1 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -98,9 +170,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest3() {
 		String scriptStr = "O = t(X) %*% Y";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y2 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y2 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +194,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest4() {
 		String scriptStr = "O = t(X) %*% t(Y)";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y1 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y1 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -146,7 +218,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void transposeSelfMatrixMultiply() {
 		String scriptStr = "O = t(X) %*% X";
 
-		int[] sizes = { 1, 128, 512, 1024, 2049 };
+		int[] sizes = { 1, 128, 1024 };
 		double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
 
 		for (int i = 0; i < sizes.length; i++) {