Posted to commits@systemml.apache.org by re...@apache.org on 2017/11/03 18:02:29 UTC

[27/50] [abbrv] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index 21a2a35..d962027 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -23,13 +23,6 @@ import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcublas.cublasOperation;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -43,6 +36,11 @@ import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.runtime.JCuda;
+
 public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
@@ -175,7 +173,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 			// Step 3: Invoke the kernel
 			long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
 					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
 					C.val, C.rowPtr, C.colInd);
 			if (GPUStatistics.DISPLAY_STATISTICS)
@@ -239,7 +237,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	 * allocated in dense row-major format and A is sparse.
 	 * 
 	 * Other than input and output, this method requires additional memory =
-	 * outRLen * outCLen * Sizeof.DOUBLE
+	 * outRLen * outCLen * sizeOfDataType
 	 * 
 	 * @param gCtx
 	 *            a valid {@link GPUContext}
@@ -276,7 +274,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		// t(C) = t(B) %*% t(A)
 		Pointer output = null;
 		if (outRLen != 1 && outCLen != 1) {
-			output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+			output = gCtx.allocate(outRLen * outCLen * sizeOfDataType);
 		} else {
 			// no transpose required for vector output
 			output = C;
@@ -287,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		if (outRLen != 1 && outCLen != 1) {
 			// Transpose: C = t(output)
 			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+			cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
 					toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
 					toInt(outRLen), C, toInt(outCLen));
 			if (!DMLScript.EAGER_CUDA_FREE)
@@ -331,7 +329,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int m = toInt(param.rightNumRows);
 			int n = toInt(param.rightNumCols);
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
-			JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
 					zero(), C);
 			kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
@@ -342,7 +340,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
 			int transb = cusparseOp(param.isRightTransposed);
 			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
-			JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
 					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
 		}
 		if (GPUStatistics.DISPLAY_STATISTICS)
@@ -383,7 +381,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// Vector product
 			LOG.debug(" GPU Dense-dense Vector Product");
 			double[] result = { 0 };
-			JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+			cudaSupportFunctions.cublasdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
 			// By default in CuBlas V2, cublas pointer mode is set to
 			// CUBLAS_POINTER_MODE_HOST.
 			// This means that scalar values passed are on host (as opposed to
@@ -391,7 +389,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// The result is copied from the host back to the device so that the
 			// rest of
 			// infrastructure can treat it uniformly.
-			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			cudaMemcpy(C, Pointer.to(result), 1 * sizeOfDataType, cudaMemcpyHostToDevice);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
 		} else if (param.m == 1) {
 			// Vector-matrix multiply
@@ -399,18 +397,18 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			transb = reverseCublasOp(transb);
 			int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
 			int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
-			JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
 		} else if (param.n == 1) {
 			// Matrix-vector multiply
 			LOG.debug(" GPU Dense Matrix-Vector Multiply");
 			int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
 			int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
-			JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
 			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-			JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+			cudaSupportFunctions.cublasgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
 					zero(), C, param.ldc);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
 		}
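
The pattern in this diff replaces each hard-coded double-precision call (JCublas2.cublasDgemm, JCusparse.cusparseDcsrgemm, ...) with a call through the cudaSupportFunctions dispatch object, so a single call site serves both precisions. Below is a minimal sketch of that dispatch idea, assuming a DoublePrecisionCudaSupportFunctions counterpart to the single-precision class added later in this mail; the interface and class names in the sketch are illustrative only, the real interface is CudaSupportFunctions.

import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;

interface GemmDispatchSketch {
	// Callers stay precision-agnostic; the implementation picks the S- or D-suffixed entry point.
	int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k,
			Pointer alpha, Pointer A, int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc);
}

class DoubleGemmSketch implements GemmDispatchSketch {
	@Override
	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k,
			Pointer alpha, Pointer A, int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
		// Double precision routes to the D-suffixed cuBLAS routine.
		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
	}
}

class SingleGemmSketch implements GemmDispatchSketch {
	@Override
	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k,
			Pointer alpha, Pointer A, int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
		// Single precision routes to the S-suffixed routine, mirroring SinglePrecisionCudaSupportFunctions below.
		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
	}
}

Note that the scalar helpers one()/zero() and the sizeOfDataType constant used in the diff must likewise be sized to the active data type (Sizeof.FLOAT vs Sizeof.DOUBLE).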

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8ee6f8d..c023890 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3852,8 +3852,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * @param ru row upper
 	 * @param cl column lower
 	 * @param cu column upper
-	 * @param ret ?
-	 * @return matrix block
+	 * @param deep whether to perform a deep copy
+	 * @param ret output matrix block
+	 * @return the output matrix block
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) 
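
For reference, a hedged usage sketch of this method: block-level indices are taken here to be 0-based and inclusive (unlike 1-based DML), and deep to request a deep copy of the underlying arrays; both points are read off the javadoc and surrounding conventions, not verified against the implementation.

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;

class SliceSketch {
	// Slice out the first column of a block, reusing a preallocated output block.
	static MatrixBlock firstColumn(MatrixBlock in) throws DMLRuntimeException {
		return in.sliceOperations(0, in.getNumRows() - 1, 0, 0, /*deep*/ false, new MatrixBlock());
	}
}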

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
new file mode 100644
index 0000000..128bb39
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.JCusolverDn;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
+	
+	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
+
+	@Override
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgemm(handle, transA, transB, m, n, k,
+				descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
+				descrB, nnzB, csrValB, csrRowPtrB, csrColIndB,
+				descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+
+	@Override
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
+			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+			Pointer y) {
+		return JCusparse.cusparseScsrmv(handle, transA, m, n, nnz, alpha, 
+				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+	}
+	
+	@Override
+	public int cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, Pointer alpha, cusparseMatDescr descrA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCusparse.cusparseScsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA,
+				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
+		return JCublas2.cublasSdot(handle, n, x, incx, y, incy, result);
+	}
+
+	@Override
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
+			int incx, Pointer beta, Pointer y, int incy) {
+		return JCublas2.cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+	}
+
+	@Override
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
+			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
+			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		return JCusparse.cusparseScsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+	}
+
+	@Override
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
+		return JCublas2.cublasSaxpy(handle, n, alpha, x, incx, y, incy);
+	}
+
+	@Override
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, Pointer alpha,
+			Pointer A, int lda, Pointer B, int ldb) {
+		return JCublas2.cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+	}
+
+	@Override
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork) {
+		return JCusolverDn.cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
+	}
+
+	@Override
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU,
+			Pointer Workspace, int Lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
+	}
+	
+	@Override
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
+			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+	}
+
+	@Override
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+			Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgeam(handle, m, n, alpha, descrA, nnzA, 
+				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
+				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+
+	@Override
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
+			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		return JCusparse.cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+	}
+	
+	@Override
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		return JCusparse.cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+	}
+	
+	@Override
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		return JCusparse.cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
+	}
+	
+	@Override
+	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
+		LOG.debug("Potential OOM: Allocated additional space in deviceToHost");
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
+			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
+			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			// TODO: Perform conversion on GPU using double2float and float2double kernels
+			float [] floatData = new float[dest.length];
+			cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
+			for(int i = 0; i < dest.length; i++) {
+				dest[i] = floatData[i];
+			}
+		}
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
+	}
+
+	@Override
+	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException {
+		LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
+		// TODO: Perform conversion on GPU using double2float and float2double kernels
+		long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
+			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			float [] floatData = new float[src.length];
+			for(int i = 0; i < src.length; i++) {
+				floatData[i] = (float) src[i];
+			}
+			cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
+		}
+		
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+	}
+}
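
How the runtime chooses between the two implementations is outside this part of the diff; below is a hedged sketch, assuming the sysml.gpu.dataType property (set by the tests further down) drives the choice and that a DoublePrecisionCudaSupportFunctions counterpart exists in the same package.

import org.apache.sysml.runtime.matrix.data.CudaSupportFunctions;
import org.apache.sysml.runtime.matrix.data.DoublePrecisionCudaSupportFunctions; // assumed counterpart
import org.apache.sysml.runtime.matrix.data.SinglePrecisionCudaSupportFunctions;

class SupportFunctionsFactorySketch {
	static CudaSupportFunctions forDataType(String dataType) {
		// "float" selects the S-suffixed wrappers plus the float<->double transfer
		// conversions above; anything else keeps the double-precision path.
		return "float".equalsIgnoreCase(dataType)
				? new SinglePrecisionCudaSupportFunctions()
				: new DoublePrecisionCudaSupportFunctions();
	}
}

The design cost is visible in deviceToHost/hostToDevice above: host-side matrices remain double[], so the single-precision path pays a conversion per transfer, either on the device via the double2float/float2double kernels or through a host-side float[] staging array.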

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index b4e4b62..d7d1ad5 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -51,9 +51,14 @@ public abstract class GPUTests extends AutomatedTestBase {
 	
 	protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext";
 	protected static SparkSession spark;
-	protected final double THRESHOLD = 1e-9;    // for relative error
+	protected final double DOUBLE_PRECISION_THRESHOLD = 1e-9;    // for relative error
 	private static final boolean PRINT_MAT_ERROR = false;
 	
+	// Use "double" here until lower precision is supported on CP.
+	private final static String DATA_TYPE = "double";  
+	protected final double SINGLE_PRECISION_THRESHOLD = 1e-3;    // for relative error
+	
+	
 	@BeforeClass
 	public static void beforeClass() {
 		spark = createSystemMLSparkSession("GPUTests", "local");
@@ -70,7 +75,9 @@ public abstract class GPUTests extends AutomatedTestBase {
 	 * @return a valid threshold
 	 */
 	protected double getTHRESHOLD() {
-		return THRESHOLD;
+		if(DATA_TYPE.equals("double")) return DOUBLE_PRECISION_THRESHOLD;
+		else if(DATA_TYPE.equals("float")) return SINGLE_PRECISION_THRESHOLD;
+		else throw new RuntimeException("Unsupported datatype: " + DATA_TYPE);
 	}
 
 	@After
@@ -228,7 +235,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	}
 
 	/**
-	 * Asserts that the values in two matrices are in {@link UnaryOpTests#THRESHOLD} of each other
+	 * Asserts that the values in two matrices are within {@link UnaryOpTests#DOUBLE_PRECISION_THRESHOLD} of each other
 	 *
 	 * @param expected expected matrix
 	 * @param actual   actual matrix
@@ -251,11 +258,15 @@ public abstract class GPUTests extends AutomatedTestBase {
 					double actualDouble = actualMB.quickGetValue(i, j);
 					if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
 						double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+						double absoluteError = Math.abs(expectedDouble - actualDouble);
 						Formatter format = new Formatter();
 						format.format(
 								"Relative error (%f) is more than threshold (%f). Expected = %f, Actual = %f, differed at [%d, %d]",
 								relativeError, getTHRESHOLD(), expectedDouble, actualDouble, i, j);
-						Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						if(DATA_TYPE.equals("double"))
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						else
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD() || absoluteError < getTHRESHOLD());
 						format.close();
 					} else {
 						Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
@@ -313,6 +324,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
 			List<String> outStrs) {
 		MLContext gpuMLC = new MLContext(spark);
+		gpuMLC.setConfigProperty("sysml.gpu.dataType", DATA_TYPE);
 		gpuMLC.setGPU(true);
 		gpuMLC.setForceGPU(true);
 		gpuMLC.setStatistics(true);
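
The relative-or-absolute acceptance above matters for single precision: when the expected value is near zero, float rounding and cancellation can push the relative error toward 1 even though the absolute deviation is negligible. A self-contained sketch of the accepted condition, using the thresholds from GPUTests:

class ToleranceSketch {
	static boolean close(double expected, double actual, double threshold) {
		double rel = Math.abs((expected - actual) / expected);
		double abs = Math.abs(expected - actual);
		// Double precision effectively uses only the relative bound (1e-9);
		// single precision accepts either bound at 1e-3.
		return rel < threshold || abs < threshold;
	}

	public static void main(String[] args) {
		// expected 1e-6 vs actual 0.0: relative error = 1.0, absolute error = 1e-6
		System.out.println(close(1e-6, 0.0, 1e-3)); // true, via the absolute bound
	}
}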

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index d983716..cbc3563 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1, 128, 1024 };
-		int[] X2 = { 1, 128, 1024 };
-		int[] Y2 = { 1, 128, 1024 };
+		int[] X1 = { 1, 121 };
+		int[] X2 = { 1, 123 };
+		int[] Y2 = { 1, 122 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,8 +74,8 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseMLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1000000 };
-		int[] X2 = { 1000 };
+		int[] X1 = { 5000 };
+		int[] X2 = { 50 };
 		int[] Y2 = { 1, 20 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -98,9 +98,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 100 };
-		int[] X2 = { 600, 900  };
-		int[] Y2 = { 205800 };
+		int[] X1 = { 32 };
+		int[] X2 = { 60, 90  };
+		int[] Y2 = { 2058 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +122,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest2() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 64 };
-		int[] X2 = { 196608   };
-		int[] Y2 = { 512 };
+		int[] X1 = { 32 };
+		int[] X2 = { 1966   };
+		int[] Y2 = { 256 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };