Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/26 02:29:02 UTC

[1/4] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Repository: systemml
Updated Branches:
  refs/heads/master 881caa9ba -> abbffc55e


http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index 21a2a35..d962027 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -23,13 +23,6 @@ import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcublas.cublasOperation;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -43,6 +36,11 @@ import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.runtime.JCuda;
+
 public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
@@ -175,7 +173,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 			// Step 3: Invoke the kernel
 			long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
 					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
 					C.val, C.rowPtr, C.colInd);
 			if (GPUStatistics.DISPLAY_STATISTICS)
@@ -239,7 +237,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	 * allocated in dense row-major format and A is sparse.
 	 * 
 	 * Other than input and output, this method requires additional memory =
-	 * outRLen * outCLen * Sizeof.DOUBLE
+	 * outRLen * outCLen * sizeOfDataType
 	 * 
 	 * @param gCtx
 	 *            a valid {@link GPUContext}
@@ -276,7 +274,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		// t(C) = t(B) %*% t(A)
 		Pointer output = null;
 		if (outRLen != 1 && outCLen != 1) {
-			output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+			output = gCtx.allocate(outRLen * outCLen * sizeOfDataType);
 		} else {
 			// no transpose required for vector output
 			output = C;
@@ -287,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		if (outRLen != 1 && outCLen != 1) {
 			// Transpose: C = t(output)
 			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+			cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
 					toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
 					toInt(outRLen), C, toInt(outCLen));
 			if (!DMLScript.EAGER_CUDA_FREE)
@@ -331,7 +329,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int m = toInt(param.rightNumRows);
 			int n = toInt(param.rightNumCols);
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
-			JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
 					zero(), C);
 			kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
@@ -342,7 +340,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
 			int transb = cusparseOp(param.isRightTransposed);
 			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
-			JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
 					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
 		}
 		if (GPUStatistics.DISPLAY_STATISTICS)
@@ -383,7 +381,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// Vector product
 			LOG.debug(" GPU Dense-dense Vector Product");
 			double[] result = { 0 };
-			JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+			cudaSupportFunctions.cublasdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
 			// By default in CuBlas V2, cublas pointer mode is set to
 			// CUBLAS_POINTER_MODE_HOST.
 			// This means that scalar values passed are on host (as opposed to
@@ -391,7 +389,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// The result is copied from the host back to the device so that the
 			// rest of
 			// infrastructure can treat it uniformly.
-			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			cudaMemcpy(C, Pointer.to(result), 1 * sizeOfDataType, cudaMemcpyHostToDevice);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
 		} else if (param.m == 1) {
 			// Vector-matrix multiply
@@ -399,18 +397,18 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			transb = reverseCublasOp(transb);
 			int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
 			int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
-			JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
 		} else if (param.n == 1) {
 			// Matrix-vector multiply
 			LOG.debug(" GPU Dense Matrix-Vector Multiply");
 			int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
 			int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
-			JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
 			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-			JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+			cudaSupportFunctions.cublasgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
 					zero(), C, param.ldc);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
 		}
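
The change in this file is mechanical: every hard-coded double-precision JCuda call (cusparseDcsrgemm, cublasDgeam, cusparseDcsrmv, cusparseDcsrmm2, cublasDdot, cublasDgemv, cublasDgemm) is routed through cudaSupportFunctions, which dispatches to the S- or D-variant of the library routine according to the configured precision. As a minimal sketch (the class name below is illustrative and not part of this patch), the double-precision side of that dispatch for GEMM would look as follows; the single-precision counterpart, SinglePrecisionCudaSupportFunctions, appears later in this message.

import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;

public class DoublePrecisionGemmSketch {
	// The double-precision implementation simply delegates to the D-variant,
	// mirroring the S-variant delegation in SinglePrecisionCudaSupportFunctions.
	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k,
			Pointer alpha, Pointer A, int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
	}
}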

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8ee6f8d..c023890 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3852,8 +3852,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * @param ru row upper
 	 * @param cl column lower
 	 * @param cu column upper
-	 * @param ret ?
-	 * @return matrix block
+	 * @param deep should perform deep copy
+	 * @param ret output matrix block
+	 * @return output matrix block
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) 

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
new file mode 100644
index 0000000..128bb39
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.JCusolverDn;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
+	
+	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
+
+	@Override
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgemm(handle, transA,  transB,  m,  n,  k,
+				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
+				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
+				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	}
+
+	@Override
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
+			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+			Pointer y) {
+		return JCusparse.cusparseScsrmv(handle, transA, m, n, nnz, alpha, 
+				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+	}
+	
+	@Override
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
+			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
+		return JCusparse.cusparseScsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
+				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
+		return JCublas2.cublasSdot(handle, n, x, incx, y, incy, result);
+	}
+
+	@Override
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
+			int incx, Pointer beta, Pointer y, int incy) {
+		return JCublas2.cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+	}
+
+	@Override
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
+			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
+			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		return JCusparse.cusparseScsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+	}
+
+	@Override
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
+		return JCublas2.cublasSaxpy(handle, n, alpha, x, incx, y, incy);
+	}
+
+	@Override
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, Pointer alpha,
+			Pointer A, int lda, Pointer B, int ldb) {
+		return JCublas2.cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+	}
+
+	@Override
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork) {
+		return JCusolverDn.cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
+	}
+
+	@Override
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU,
+			Pointer Workspace, int Lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
+	}
+	
+	@Override
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
+			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+	}
+
+	@Override
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+			Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgeam(handle, m, n, alpha, descrA, nnzA, 
+				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
+				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+
+	@Override
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
+			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		return JCusparse.cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+	}
+	
+	@Override
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		return JCusparse.cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+	}
+	
+	@Override
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		return JCusparse.cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
+	}
+	
+	@Override
+	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		LOG.debug("Potential OOM: Allocated additional space in deviceToHost");
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
+			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
+			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			// TODO: Perform conversion on GPU using double2float and float2double kernels
+			float [] floatData = new float[dest.length];
+			cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
+			for(int i = 0; i < dest.length; i++) {
+				dest[i] = floatData[i];
+			}
+		}
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
+	}
+
+	@Override
+	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException {
+		LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
+		// TODO: Perform conversion on GPU using double2float and float2double kernels
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
+			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			float [] floatData = new float[src.length];
+			for(int i = 0; i < src.length; i++) {
+				floatData[i] = (float) src[i];
+			}
+			cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
+		}
+		
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index b4e4b62..d7d1ad5 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -51,9 +51,14 @@ public abstract class GPUTests extends AutomatedTestBase {
 	
 	protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext";
 	protected static SparkSession spark;
-	protected final double THRESHOLD = 1e-9;    // for relative error
+	protected final double DOUBLE_PRECISION_THRESHOLD = 1e-9;    // for relative error
 	private static final boolean PRINT_MAT_ERROR = false;
 	
+	// We will use this flag until lower precision is supported on CP. 
+	private final static String DATA_TYPE = "double";  
+	protected final double SINGLE_PRECISION_THRESHOLD = 1e-3;    // for relative error
+	
+	
 	@BeforeClass
 	public static void beforeClass() {
 		spark = createSystemMLSparkSession("GPUTests", "local");
@@ -70,7 +75,9 @@ public abstract class GPUTests extends AutomatedTestBase {
 	 * @return a valid threshold
 	 */
 	protected double getTHRESHOLD() {
-		return THRESHOLD;
+		if(DATA_TYPE.equals("double"))  return DOUBLE_PRECISION_THRESHOLD;
+		else if(DATA_TYPE.equals("float"))  return SINGLE_PRECISION_THRESHOLD;
+		else throw new RuntimeException("Unsupported datatype:" + DATA_TYPE);
 	}
 
 	@After
@@ -228,7 +235,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	}
 
 	/**
-	 * Asserts that the values in two matrices are in {@link UnaryOpTests#THRESHOLD} of each other
+	 * Asserts that the values in two matrices are in {@link UnaryOpTests#DOUBLE_PRECISION_THRESHOLD} of each other
 	 *
 	 * @param expected expected matrix
 	 * @param actual   actual matrix
@@ -251,11 +258,15 @@ public abstract class GPUTests extends AutomatedTestBase {
 					double actualDouble = actualMB.quickGetValue(i, j);
 					if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
 						double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+						double absoluteError = Math.abs(expectedDouble - actualDouble);
 						Formatter format = new Formatter();
 						format.format(
 								"Relative error(%f) is more than threshold (%f). Expected = %f, Actual = %f, differed at [%d, %d]",
 								relativeError, getTHRESHOLD(), expectedDouble, actualDouble, i, j);
-						Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						if(DATA_TYPE.equals("double"))
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						else
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD() || absoluteError < getTHRESHOLD());
 						format.close();
 					} else {
 						Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
@@ -313,6 +324,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
 			List<String> outStrs) {
 		MLContext gpuMLC = new MLContext(spark);
+		gpuMLC.setConfigProperty("sysml.gpu.dataType", DATA_TYPE);
 		gpuMLC.setGPU(true);
 		gpuMLC.setForceGPU(true);
 		gpuMLC.setStatistics(true);
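
Note on the assertion change above: for the single-precision data type the comparison passes when either the relative or the absolute error is below the 1e-3 threshold, because relative error is not meaningful when the expected value is close to zero. A standalone sketch of that comparison (the method name is illustrative and does not exist in the test code):

	// Mirrors the single-precision branch of the matrix-equality assertion above.
	static boolean closeEnoughSinglePrecision(double expected, double actual, double threshold) {
		double absoluteError = Math.abs(expected - actual);
		double relativeError = Math.abs(absoluteError / expected);
		return relativeError < threshold || absoluteError < threshold;
	}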

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index d983716..cbc3563 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1, 128, 1024 };
-		int[] X2 = { 1, 128, 1024 };
-		int[] Y2 = { 1, 128, 1024 };
+		int[] X1 = { 1, 121 };
+		int[] X2 = { 1, 123 };
+		int[] Y2 = { 1, 122 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,8 +74,8 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseMLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1000000 };
-		int[] X2 = { 1000 };
+		int[] X1 = { 5000 };
+		int[] X2 = { 50 };
 		int[] Y2 = { 1, 20 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -98,9 +98,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 100 };
-		int[] X2 = { 600, 900  };
-		int[] Y2 = { 205800 };
+		int[] X1 = { 32 };
+		int[] X2 = { 60, 90  };
+		int[] Y2 = { 2058 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +122,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest2() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 64 };
-		int[] X2 = { 196608   };
-		int[] Y2 = { 512 };
+		int[] X1 = { 32 };
+		int[] X2 = { 1966   };
+		int[] Y2 = { 256 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 


[2/4] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index ba447cf..4da874e 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -163,6 +163,7 @@ public class DMLScript
 	public static boolean           ENABLE_DEBUG_MODE   = DMLOptions.defaultOptions.debug;       // debug mode
 	public static ExplainType       EXPLAIN             = DMLOptions.defaultOptions.explainType; // explain type
 	public static String            DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script
+	public static String            FLOATING_POINT_PRECISION = "double"; 							// data type to use internally
 
 	/**
 	 * Global variable indicating the script type (DML or PYDML). Can be used

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index a49ffda..51ab6a1 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -81,6 +81,10 @@ public class ScriptExecutorUtils {
 		DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
 		DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
 		DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);
+		if(DMLScript.USE_ACCELERATOR) {
+			DMLScript.FLOATING_POINT_PRECISION = dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
+			org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision();
+		}
 
 		boolean exceptionThrown = false;
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 0b73ab0..e8bde56 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -92,6 +92,7 @@ public class DMLConfig
 	// Fraction of available memory to use. The available memory is computer when the GPUContext is created
 	// to handle the tradeoff on calling cudaMemGetInfo too often.
 	public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor";
+	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
 
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
@@ -139,6 +140,7 @@ public class DMLConfig
 		_defaultVals.put(AVAILABLE_GPUS,         "-1");
 		_defaultVals.put(SYNCHRONIZE_GPU,        "true" );
 		_defaultVals.put(EAGER_CUDA_FREE,        "false" );
+		_defaultVals.put(FLOATING_POINT_PRECISION,        	 "double" );
 	}
 	
 	public DMLConfig()
@@ -421,7 +423,7 @@ public class DMLConfig
 				COMPRESSED_LINALG, 
 				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
 				EXTRA_GPU_STATS, EXTRA_DNN_STATS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
-				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE
+				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION
 		}; 
 		
 		StringBuilder sb = new StringBuilder();
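
The new sysml.floating.point.precision property defaults to "double", so single precision must be requested explicitly. A minimal usage sketch through MLContext, assuming this DMLConfig key is the one to set and mirroring the setConfigProperty/setGPU calls in the GPUTests changes earlier in this thread (the example class name is illustrative):

import org.apache.spark.sql.SparkSession;
import org.apache.sysml.api.mlcontext.MLContext;

public class SinglePrecisionGPUExample {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
				.appName("SinglePrecisionGPUExample").master("local").getOrCreate();
		MLContext ml = new MLContext(spark);
		ml.setGPU(true);       // enable the GPU backend
		ml.setForceGPU(true);  // force GPU execution where an operator supports it
		// Request FP32 kernels and CUDA library calls; "double" remains the default.
		ml.setConfigProperty("sysml.floating.point.precision", "single");
		// ... construct and execute a Script as usual ...
	}
}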

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 5297e61..c7ffdb1 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -404,7 +404,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
                 LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
                 throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
             } else if (gObj != null){
-                copiedFromGPU = gObj.acquireHostRead();
+                copiedFromGPU = gObj.acquireHostRead(null);
                 if( _data == null )
                     getCache();
             }
@@ -793,7 +793,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
                 LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
                 throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
             } else if (gObj != null){
-                copiedFromGPU = gObj.acquireHostRead();
+                copiedFromGPU = gObj.acquireHostRead(null);
                 if( _data == null )
                     getCache();
             }

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7176a9c..53f1a19 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -20,7 +20,6 @@
 package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
 import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
 import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
 import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
@@ -38,6 +37,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
@@ -112,8 +112,8 @@ public class CSRPointer {
 		allocateMatDescrPointer();
 	}
 
-	private static long getDoubleSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.DOUBLE);
+	private static long getDataTypeSizeOf(long numElems) {
+		return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
 	}
 
 	//  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
@@ -121,7 +121,7 @@ public class CSRPointer {
 	//  }
 
 	private static long getIntSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.INT);
+		return numElems * ((long) Sizeof.INT);
 	}
 
 	//  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
@@ -163,7 +163,7 @@ public class CSRPointer {
 	 * @return size estimate
 	 */
 	public static long estimateSize(long nnz2, long rows) {
-		long sizeofValArray = getDoubleSizeOf(nnz2);
+		long sizeofValArray = getDataTypeSizeOf(nnz2);
 		long sizeofRowPtrArray = getIntSizeOf(rows + 1);
 		long sizeofColIndArray = getIntSizeOf(nnz2);
 		long sizeofDescr = getIntSizeOf(4);
@@ -181,6 +181,7 @@ public class CSRPointer {
 	/**
 	 * Static method to copy a CSR sparse matrix from Host to Device
 	 *
+	 * @param gCtx GPUContext
 	 * @param dest   [input] destination location (on GPU)
 	 * @param rows   number of rows
 	 * @param nnz    number of non-zeroes
@@ -189,7 +190,7 @@ public class CSRPointer {
 	 * @param values double array of non zero values
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) throws DMLRuntimeException {
+	public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) throws DMLRuntimeException {
 		CSRPointer r = dest;
 		long t0 = 0;
 		if (DMLScript.STATISTICS)
@@ -200,15 +201,15 @@ public class CSRPointer {
 		if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1));
 		if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz);
 		if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz);
+		LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
 		cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
 		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-		cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
 		if (DMLScript.STATISTICS)
 			GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
 		if (DMLScript.STATISTICS)
 			GPUStatistics.cudaToDevCount.add(3);
 	}
-
+	
 	/**
 	 * Static method to copy a CSR sparse matrix from Device to host
 	 *
@@ -217,20 +218,12 @@ public class CSRPointer {
 	 * @param nnz    [input] number of non-zeroes
 	 * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
 	 * @param colInd [output] pre-allocated integer array of column indices of size nnz
-	 * @param values [output] pre-allocated double array of values of size nnz
+	 * @throws DMLRuntimeException if error
 	 */
-	public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+	public static void copyPtrToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd) throws DMLRuntimeException {
 		CSRPointer r = src;
-		long t0 = 0;
-		if (DMLScript.STATISTICS)
-			t0 = System.nanoTime();
 		cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
 		cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
-		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevCount.add(3);
 	}
 
 	/**
@@ -305,9 +298,9 @@ public class CSRPointer {
 			// with no memory allocated on the GPU.
 			return r;
 		}
-		gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+		gCtx.ensureFreeSpace(getDataTypeSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
 		// increment the cudaCount by 1 for the allocation of all 3 arrays
-		r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
 		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
 		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
 		return r;
@@ -410,7 +403,7 @@ public class CSRPointer {
 			throws DMLRuntimeException {
 		LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
 		// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-		C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
 		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
 	}
 
@@ -441,13 +434,14 @@ public class CSRPointer {
 		that.gpuContext.ensureFreeSpace(totalSize);
 
 		that.nnz = me.nnz;
-		that.val = allocate(that.nnz * Sizeof.DOUBLE);
-		that.rowPtr = allocate(rows * Sizeof.DOUBLE);
-		that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
+		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType);
+		// TODO: Nakul ... can you please double-check whether the below was a bug or intentional ?
+		that.rowPtr = allocate(rows * Sizeof.INT);
+		that.colInd = allocate(that.nnz * Sizeof.INT);
 
-		cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice);
 
 		return that;
 	}
@@ -506,12 +500,12 @@ public class CSRPointer {
 		long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
-		long size = ((long) rows) * getDoubleSizeOf((long) cols);
+		long size = ((long) rows) * getDataTypeSizeOf((long) cols);
 		Pointer A = allocate(size);
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
-			cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
 			//cudaDeviceSynchronize;
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 55cb95f..dd776bc 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -24,8 +24,6 @@ import static jcuda.jcudnn.JCudnn.cudnnCreate;
 import static jcuda.jcudnn.JCudnn.cudnnDestroy;
 import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
 import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
 import static jcuda.jcusparse.JCusparse.cusparseCreate;
 import static jcuda.jcusparse.JCusparse.cusparseDestroy;
 import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
@@ -63,7 +61,6 @@ import jcuda.Pointer;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.runtime.JCuda;
 import jcuda.runtime.cudaDeviceProp;
@@ -107,10 +104,6 @@ public class GPUContext {
 	 */
 	private cusolverDnHandle cusolverDnHandle;
 	/**
-	 * cusolverSpHandle for invoking solve() function on sparse matrices on the GPU
-	 */
-	private cusolverSpHandle cusolverSpHandle;
-	/**
 	 * to launch custom CUDA kernel, specific to the active GPU for this GPUContext
 	 */
 	private JCudaKernels kernels;
@@ -233,12 +226,7 @@ public class GPUContext {
 			cusolverDnHandle = new cusolverDnHandle();
 			cusolverDnCreate(cusolverDnHandle);
 		}
-
-		if (cusolverSpHandle == null) {
-			cusolverSpHandle = new cusolverSpHandle();
-			cusolverSpCreate(cusolverSpHandle);
-		}
-
+		
 		if (kernels == null) {
 			kernels = new JCudaKernels();
 		}
@@ -578,7 +566,7 @@ public class GPUContext {
 								+ "). Allocated GPU objects:" + allocatedGPUObjects.toString());
 			}
 			if (toBeRemoved.dirty) {
-				toBeRemoved.copyFromDeviceToHost();
+				toBeRemoved.copyFromDeviceToHost(instructionName);
 			}
 			toBeRemoved.clearData(true);
 		}
@@ -754,15 +742,6 @@ public class GPUContext {
 	}
 
 	/**
-	 * Returns cusolverSpHandle for invoking solve() function on sparse matrices on the GPU.
-	 *
-	 * @return cusolverSpHandle for current thread
-	 */
-	public cusolverSpHandle getCusolverSpHandle() {
-		return cusolverSpHandle;
-	}
-
-	/**
 	 * Returns utility class used to launch custom CUDA kernel, specific to the active GPU for this GPUContext.
 	 *
 	 * @return {@link JCudaKernels} for current thread
@@ -801,14 +780,10 @@ public class GPUContext {
 		if (cusolverDnHandle != null)
 			cusolverDnDestroy(cusolverDnHandle);
 
-		if (cusolverSpHandle != null)
-			cusolverSpDestroy(cusolverSpHandle);
-
 		cudnnHandle = null;
 		cublasHandle = null;
 		cusparseHandle = null;
 		cusolverDnHandle = null;
-		cusolverSpHandle = null;
 	}
 
 	/**
@@ -827,7 +802,7 @@ public class GPUContext {
 			if (o.isDirty()) {
 				LOG.warn("Attempted to free GPU Memory when a block[" + o
 						+ "] is still on GPU memory, copying it back to host.");
-				o.acquireHostRead();
+				o.acquireHostRead(null);
 			}
 			o.clearData(true);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 06327db..35dfd58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -19,14 +19,10 @@
 package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
-import static jcuda.jcusparse.JCusparse.cusparseDnnz;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -47,9 +43,6 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
@@ -126,7 +119,7 @@ public class GPUObject {
 			if (me.jcudaDenseMatrixPtr != null) {
 				long rows = me.mat.getNumRows();
 				long cols = me.mat.getNumColumns();
-				long size = rows * cols * Sizeof.DOUBLE;
+				long size = rows * cols * LibMatrixCUDA.sizeOfDataType;
 				me.gpuContext.ensureFreeSpace((int) size);
 				that.jcudaDenseMatrixPtr = allocate(size);
 				cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
@@ -181,13 +174,13 @@ public class GPUObject {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
 		}
-		Pointer alpha = Pointer.to(new double[] { 1.0 });
-		Pointer beta = Pointer.to(new double[] { 0.0 });
+		Pointer alpha = LibMatrixCUDA.one();
+		Pointer beta = LibMatrixCUDA.zero();
 		Pointer A = densePtr;
-		Pointer C = gCtx.allocate(((long) m) * getDoubleSizeOf(n));
+		Pointer C = gCtx.allocate(((long) m) * getDatatypeSizeOf(n));
 
 		// Transpose the matrix to get a dense matrix
-		JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
+		LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
 				lda, C, ldc);
 		return C;
 	}
@@ -217,7 +210,7 @@ public class GPUObject {
 		nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
 
 		// Output is in dense vector format, convert it to CSR
-		cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
+		LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
 				nnzPerRowPtr, nnzTotalDevHostPtr);
 		//cudaDeviceSynchronize();
 		int[] nnzC = { -1 };
@@ -241,7 +234,7 @@ public class GPUObject {
 		}
 
 		CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
-		cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
+		LibMatrixCUDA.cudaSupportFunctions.cusparsedense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
 				C.colInd);
 		//cudaDeviceSynchronize();
 
@@ -252,31 +245,6 @@ public class GPUObject {
 	}
 
 	/**
-	 * Gets the double array from GPU memory onto host memory and returns string.
-	 *
-	 * @param A    Pointer to memory on device (GPU), assumed to point to a double array
-	 * @param rows rows in matrix A
-	 * @param cols columns in matrix A
-	 * @return the debug string
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
-		StringBuffer sb = new StringBuffer();
-		int len = toIntExact(rows * cols);
-		double[] tmp = new double[len];
-		cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
-		int k = 0;
-		for (int i = 0; i < rows; i++) {
-			for (int j = 0; j < cols; j++) {
-				sb.append(tmp[k]).append(' ');
-				k++;
-			}
-			sb.append('\n');
-		}
-		return sb.toString();
-	}
-
-	/**
 	 * Convenience method to directly examine the Sparse matrix on GPU
 	 *
 	 * @return CSR (compressed sparse row) pointer
@@ -287,7 +255,7 @@ public class GPUObject {
 
 	/**
 	 * Convenience method to directly set the sparse matrix on GPU
-	 * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
+	 * Needed for operations like cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)
 	 *
 	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
 	 * @throws DMLRuntimeException ?
@@ -475,8 +443,8 @@ public class GPUObject {
 		return isSparse;
 	}
 	
-	private static long getDoubleSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.DOUBLE);
+	private static long getDatatypeSizeOf(long numElems) {
+		return numElems * LibMatrixCUDA.sizeOfDataType;
 	}
 
 	private static long getIntSizeOf(long numElems) {
@@ -524,7 +492,7 @@ public class GPUObject {
 		long rows = mat.getNumRows();
 		long cols = mat.getNumColumns();
 		int numElems = toIntExact(rows * cols);
-		long size = getDoubleSizeOf(numElems);
+		long size = getDatatypeSizeOf(numElems);
 		setDenseMatrixCudaPointer(allocate(size));
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
@@ -609,10 +577,11 @@ public class GPUObject {
 	/**
 	 * if the data is allocated on the GPU and is dirty, it is copied back to the host memory
 	 *
+	 * @param instName name of the instruction
 	 * @return true if a copy to host happened, false otherwise
 	 * @throws CacheException ?
 	 */
-	public boolean acquireHostRead() throws CacheException {
+	public boolean acquireHostRead(String instName) throws CacheException {
 		boolean copied = false;
 		try {
 			if(LOG.isTraceEnabled()) {
@@ -623,7 +592,7 @@ public class GPUObject {
 					LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
 						+ getGPUContext());
 				}
-				copyFromDeviceToHost();
+				copyFromDeviceToHost(instName);
 				copied = true;
 			}
 		} catch (DMLRuntimeException e) {
@@ -728,7 +697,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix");
 		if(cols <= 0)
 			throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix;");
-		long size = getDoubleSizeOf(rows * cols);
+		long size = getDatatypeSizeOf(rows * cols);
 		Pointer tmp = allocate(size);
 		setDenseMatrixCudaPointer(tmp);
 	}
@@ -774,7 +743,7 @@ public class GPUObject {
 		if (LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
 			GPUSize = CSRPointer.estimateSize(nnz, rlen);
 		} else {
-			GPUSize = getDoubleSizeOf(rlen * clen);
+			GPUSize = getDatatypeSizeOf(rlen * clen);
 		}
 		return GPUSize;
 	}
@@ -858,7 +827,7 @@ public class GPUObject {
 
 			if (copyToDevice) {
 				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
+				CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
 						values);
 				if(GPUStatistics.DISPLAY_STATISTICS) 
 					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
@@ -877,18 +846,14 @@ public class GPUObject {
 				// Minor optimization: No need to allocate empty error for CPU 
 				// data = new double[tmp.getNumRows() * tmp.getNumColumns()];
 				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				cudaMemset(getJcudaDenseMatrixPtr(), 0, getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()));
+				cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
 				if(GPUStatistics.DISPLAY_STATISTICS) 
 					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1);
 			}
 			else {
 				// Copy dense block
 				// H2D now only measures the time taken to do 
-				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data),
-						getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice);
-				if(GPUStatistics.DISPLAY_STATISTICS) 
-					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+				LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode);
 			}
 		}
 
@@ -907,7 +872,7 @@ public class GPUObject {
 		return (int) l;
 	}
 
-	protected void copyFromDeviceToHost() throws DMLRuntimeException {
+	protected void copyFromDeviceToHost(String instName) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -921,11 +886,7 @@ public class GPUObject {
 				start = System.nanoTime();
 			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
 			tmp.allocateDenseBlock();
-			double[] data = tmp.getDenseBlock();
-
-			cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length),
-					cudaMemcpyDeviceToHost);
-
+			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaDenseMatrixPtr(), tmp.getDenseBlock(), instName);
 			tmp.recomputeNonZeros();
 			mat.acquireModify(tmp);
 			mat.release();
@@ -951,10 +912,16 @@ public class GPUObject {
 				int rows = toIntExact(mat.getNumRows());
 				int cols = toIntExact(mat.getNumColumns());
 				int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+				double[] values = new double[nnz];
+				LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName);
 				int[] rowPtr = new int[rows + 1];
 				int[] colInd = new int[nnz];
-				double[] values = new double[nnz];
-				CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values);
+				long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+				CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
+				if (DMLScript.STATISTICS)
+					GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
+				if (DMLScript.STATISTICS)
+					GPUStatistics.cudaFromDevCount.add(3);
 
 				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
 				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index e1894ae..d22110d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -29,6 +29,7 @@ import java.util.HashMap;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 
 import jcuda.Pointer;
 import jcuda.driver.CUfunction;
@@ -72,11 +73,17 @@ public class JCudaKernels {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public void launchKernel(String name, ExecutionConfig config, Object... arguments) throws DMLRuntimeException {
+		name = name + LibMatrixCUDA.customKernelSuffix;
 		CUfunction function = kernels.get(name);
+		
 		if (function == null) {
 			// caching functions into hashmap reduces the lookup overhead
 			function = new CUfunction();
-			checkResult(cuModuleGetFunction(function, module, name));
+			try {
+				checkResult(cuModuleGetFunction(function, module, name));
+			} catch(jcuda.CudaException e) {
+				throw new DMLRuntimeException("Error finding the custom kernel:" + name, e);
+			}
 		}
 
 		// Setup parameters
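
The functional change here is the suffix appended to the kernel name before lookup, which selects the single- or double-precision instantiation of the templatized kernels in SystemML.cu (see the design note in CudaSupportFunctions below). A rough sketch of the idea only; the helper name kernelSuffixFor is illustrative, and the actual field LibMatrixCUDA.customKernelSuffix is defined outside this excerpt:

	// Sketch, not the actual implementation: derive the kernel-name suffix from the precision.
	static String kernelSuffixFor(String floatingPointPrecision) {
		return "double".equals(floatingPointPrecision) ? "d" : "f";
	}
	// launchKernel("matrix_atan", ...) then resolves to the compiled kernel
	// "matrix_atand" for double precision or "matrix_atanf" for single precision.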

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
new file mode 100644
index 0000000..2b6c039
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.Pointer;
+
+/**
+ * DESIGN DOCUMENTATION FOR SUPPORTING LOWER PRECISION:
+ * 1. SystemML.cu has been templatized in the following way to support different data types:
+ * - Similar to CuBLAS and CuSPARSE, the global kernels carry the datatype specification in their name (for example: f for float
+ * and d for double). Unlike CuBLAS and CuSPARSE, these are added as suffixes to keep the engine simple.
+ * - Each global kernel with a datatype suffix invokes a corresponding templatized kernel (without the suffix) that contains the core logic.
+ * - The suffix is appended in the launchKernel method of JCudaKernels before invocation.
+ * For example:
+ * <code>
+ * template &lt; typename T &gt;
+ * __device__ void matrix_atan(T *A, T *C, unsigned int size) {
+ *     int index = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (index &lt; size){
+ *         C[index] = atan(A[index]);
+ *     }
+ * }
+ * extern "C" __global__ void matrix_atand(double *A, double *C, unsigned int size) {
+ * 	matrix_atan(A, C, size);
+ * }
+ * extern "C" __global__ void matrix_atanf(float *A, float *C, unsigned int size) {
+ * 	matrix_atan(A, C, size);
+ * } 
+ * </code>
+ * 
+ * 2. The CUDA library calls (such as CuBLAS, CuSPARSE, etc) go through this interface.
+ * The naming and parameters of the methods in this class are consistent with that of CUDA library to simplify development.
+ * 
+ * 3. During SystemML initialization, the appropriate class implementing the CudaSupportFunctions interface is set based on the configuration property sysml.dataType.
+ */
+public interface CudaSupportFunctions {
+	public static boolean PERFORM_CONVERSION_ON_DEVICE = true;
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k, 
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, 
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, 
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC);
+	public int	cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, 
+			int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
+	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cublasdot(cublasHandle handle, int n, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy, jcuda.Pointer result);
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer x, int incx, jcuda.Pointer beta, jcuda.Pointer y, int incy);
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, jcuda.Pointer csrVal, jcuda.Pointer csrRowPtr, jcuda.Pointer csrColInd, jcuda.Pointer cscVal, jcuda.Pointer cscRowInd, jcuda.Pointer cscColPtr, int copyValues, int idxBase);
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cublasaxpy(cublasHandle handle, int n, jcuda.Pointer alpha, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy);
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb);
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork);
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU, Pointer Workspace, int Lwork, Pointer devInfo);
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda, Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo);
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, jcuda.Pointer alpha, cusparseMatDescr descrA, int nnzA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer beta, cusparseMatDescr descrB, int nnzB, jcuda.Pointer csrValB, jcuda.Pointer csrRowPtrB, jcuda.Pointer csrColIndB, cusparseMatDescr descrC, jcuda.Pointer csrValC, jcuda.Pointer csrRowPtrC, jcuda.Pointer csrColIndC);
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda) ;
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRow, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA);
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRowCol, jcuda.Pointer nnzTotalDevHostPtr);
+	public void deviceToHost(GPUContext gCtx, Pointer src, double [] dest, String instName) throws DMLRuntimeException;
+	public void hostToDevice(GPUContext gCtx, double [] src,  Pointer dest, String instName) throws DMLRuntimeException;
+	
+}
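
The single-precision counterpart, SinglePrecisionCudaSupportFunctions (added elsewhere in this patch), follows the same one-to-one delegation pattern but targets the S-variants of the library routines. A minimal sketch of the expected shape for a single method, not the actual file:

    public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
    	@Override
    	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A,
    			int lda, Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
    		// Identical signature; only the delegation target changes (cublasDgeam -> cublasSgeam).
    		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
    	}
    	// ... the remaining methods apply the same D -> S substitution ...
    }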

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
new file mode 100644
index 0000000..78b4de0
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.JCusolverDn;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions {
+
+	@Override
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseDcsrgemm(handle, transA,  transB,  m,  n,  k,
+				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
+				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
+				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	}
+	
+	@Override
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		return JCublas2.cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+	}
+	
+	@Override
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
+			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+			Pointer y) {
+		return JCusparse.cusparseDcsrmv(handle, transA, m, n, nnz, alpha, 
+				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+	}
+	
+	@Override
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
+			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
+		return JCusparse.cusparseDcsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
+				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	}
+	
+	@Override
+	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
+		return JCublas2.cublasDdot(handle, n, x, incx, y, incy, result);
+	}
+	
+	@Override
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
+			int incx, Pointer beta, Pointer y, int incy) {
+		return JCublas2.cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+	}
+	
+	@Override
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
+			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	}
+	
+	@Override
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
+			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		return JCusparse.cusparseDcsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+	}
+	
+	@Override
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+	}
+	
+	@Override
+	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
+		return JCublas2.cublasDaxpy(handle, n, alpha, x, incx, y, incy);
+	}
+	
+	@Override
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, Pointer alpha,
+			Pointer A, int lda, Pointer B, int ldb) {
+		return JCublas2.cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+	}
+
+	@Override
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork) {
+		return JCusolverDn.cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
+	}
+	
+	@Override
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU,
+			Pointer Workspace, int Lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
+	}
+
+	@Override
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
+			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+	}
+	
+	@Override
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+			Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseDcsrgeam(handle, m, n, alpha, descrA, nnzA, 
+				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
+				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+	
+	@Override
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
+			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		return JCusparse.cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+	}
+
+	@Override
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		return JCusparse.cusparseDdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+	}
+
+	@Override
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		return JCusparse.cusparseDnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
+	}
+
+	@Override
+	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		cudaMemcpy(Pointer.to(dest), src, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
+	}
+
+	@Override
+	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		cudaMemcpy(dest, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+	}
+}
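
Note that deviceToHost and hostToDevice keep double[] as the host-side type regardless of precision, so a single-precision implementation has to widen or narrow the data; PERFORM_CONVERSION_ON_DEVICE indicates this conversion can instead be done on the device using the double2float/float2double kernels added in this patch. A host-side sketch of the widening path, purely illustrative and not the actual implementation:

    @Override
    public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
    	// Copy the raw float data, then widen each element into the caller's double[] buffer.
    	float[] floatData = new float[dest.length];
    	cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
    	for(int i = 0; i < dest.length; i++)
    		dest[i] = floatData[i];
    }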

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 7e25299..eb17e69 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -21,12 +21,13 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -80,14 +81,11 @@ import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
 import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
 import jcuda.jcublas.cublasDiagType;
 import jcuda.jcublas.cublasFillMode;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcublas.cublasOperation;
 import jcuda.jcublas.cublasSideMode;
-import jcuda.jcusolver.JCusolverDn;
-import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseAction;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseIndexBase;
@@ -100,6 +98,34 @@ import jcuda.jcusparse.cusparseIndexBase;
 public class LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
+	
+	protected static int CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+	// The below variables are used in CSRPointer, GPUObjects, etc.
+	public static CudaSupportFunctions cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
+	public static int sizeOfDataType = jcuda.Sizeof.DOUBLE;
+	public static String customKernelSuffix = "_d";
+	
+	/**
+	 * Sets the internal state based on DMLScript.FLOATING_POINT_PRECISION
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void resetFloatingPointPrecision() throws DMLRuntimeException {
+		if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("double")) {
+			LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+			LibMatrixCUDA.cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
+			LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.DOUBLE;
+			LibMatrixCUDA.customKernelSuffix = "_d";
+		}
+		else if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("single")) {
+			LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT;
+			LibMatrixCUDA.cudaSupportFunctions = new SinglePrecisionCudaSupportFunctions();
+			LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.FLOAT;
+			LibMatrixCUDA.customKernelSuffix = "_f";
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported floating point precision: " + DMLScript.FLOATING_POINT_PRECISION);
+		}
+	}
 
 	// Assume Compute Capability 3.0
 	// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
@@ -110,7 +136,7 @@ public class LibMatrixCUDA {
 	
 	// From CuDNN 5.1 documentation:
 	// The total size of a tensor including the potential padding between dimensions is limited to 2 Giga-elements of type datatype.
-	protected static long maxNumDoublesOfCuDNNTensor = 2000000000;
+	protected static long maxNumElementsOfCuDNNTensor = 2000000000;
 
 	//********************************************************************/
 	//***************************** UTILS ********************************/
@@ -179,7 +205,18 @@ public class LibMatrixCUDA {
 	protected static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getKernels();
 	}
-
+	
+	public static Pointer double2float(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
+		getCudaKernels(gCtx).launchKernel("double2float", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+				A, ret, numElems);
+		return ret;
+	}
+	
+	public static Pointer float2double(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
+		getCudaKernels(gCtx).launchKernel("float2double", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+				A, ret, numElems);
+		return ret;
+	}
 
 	//********************************************************************/
 	//************************ End of UTILS ******************************/
@@ -191,13 +228,15 @@ public class LibMatrixCUDA {
 
 	private static Pointer _one;
 	private static Pointer _zero;
+	private static int oldDataTypeSize;
 	/**
 	 * Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer one() {
-		if(_one == null) {
-			_one = pointerTo(1.0);
+	public static Pointer one() {
+		if(_one == null || oldDataTypeSize != sizeOfDataType) {
+			_one = dataTypePointerTo(1.0);
+			oldDataTypeSize = sizeOfDataType;
 		}
 		return _one;
 	}
@@ -205,9 +244,10 @@ public class LibMatrixCUDA {
 	 * Convenience method to get a pointer to value '0.0f' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer zero() {
-		if(_zero == null) {
-			_zero = pointerTo(0.0f);
+	public static Pointer zero() {
+		if(_zero == null  || oldDataTypeSize != sizeOfDataType) {
+			_zero = dataTypePointerTo(0.0);
+			oldDataTypeSize = sizeOfDataType;
 		}
 		return _zero;
 	}
@@ -242,8 +282,16 @@ public class LibMatrixCUDA {
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
 	
-	protected static Pointer pointerTo(double value) {
-		return Pointer.to(new double[] { value });
+	protected static Pointer dataTypePointerTo(double value) {
+		if(sizeOfDataType == Sizeof.DOUBLE) {
+			return Pointer.to(new double[] { value });
+		}
+		else if(sizeOfDataType == Sizeof.FLOAT) {
+			return Pointer.to(new float[] { (float) value });
+		}
+		else {
+			throw new RuntimeException("Unsupported datatype with size " + sizeOfDataType);
+		}
 	}
 	
 
@@ -434,7 +482,7 @@ public class LibMatrixCUDA {
 		long t0=0, t1=0;
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
+		cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
@@ -630,7 +678,7 @@ public class LibMatrixCUDA {
 		}
 		case OP_PLUS_SQ : {
 			// Calculate the squares in a temporary object tmp
-			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
 
 			squareMatrix(gCtx, instName, in, tmp, rlen, clen);
 			// Then do the sum on the temporary object and free it
@@ -729,8 +777,8 @@ public class LibMatrixCUDA {
 		}
 		case OP_VARIANCE : {
 			// Temporary GPU array for
-			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
-			Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
+			Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
 
 			switch(reductionDirection) {
 
@@ -758,7 +806,7 @@ public class LibMatrixCUDA {
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
+				Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
 				reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
@@ -776,7 +824,7 @@ public class LibMatrixCUDA {
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
+				Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
 				reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
@@ -847,9 +895,9 @@ public class LibMatrixCUDA {
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
+		Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
 
-		long t1=0,t2=0,t3=0;
+		long t1=0,t2=0;
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
@@ -867,11 +915,7 @@ public class LibMatrixCUDA {
 			s = (s + (threads*2-1)) / (threads*2);
 		}
 		double[] result = {-1f};
-
-		if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-		cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
-
+		cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName);
 		gCtx.cudaFreeHelper(instName, tempOut);
 		return result[0];
 	}
@@ -946,7 +990,7 @@ public class LibMatrixCUDA {
 		int blocks = (n + (threads * 2 - 1)) / (threads * 2);
 		blocks = Math.min(MAX_BLOCKS, blocks);
 
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *= 2;
 		}
@@ -965,7 +1009,7 @@ public class LibMatrixCUDA {
 		final int MAX_THREADS = getMaxThreads(gCtx);
 		int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
 		int blocks = rows;
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
@@ -979,7 +1023,7 @@ public class LibMatrixCUDA {
 		int threads = Math.min(cols, MAX_THREADS);
 		int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS);
 		if (cols % MAX_THREADS != 0) blocks++;
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
@@ -1475,7 +1519,7 @@ public class LibMatrixCUDA {
 	private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		int size = rlen * clen * Sizeof.DOUBLE;
+		int size = rlen * clen * sizeOfDataType;
 		cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0);
 	}
@@ -1538,8 +1582,8 @@ public class LibMatrixCUDA {
 			LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
 		}
 
-		Pointer alphaPtr = pointerTo(alpha);
-		Pointer betaPtr = pointerTo(beta);
+		Pointer alphaPtr = dataTypePointerTo(alpha);
+		Pointer betaPtr = dataTypePointerTo(beta);
 		int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 		int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 
@@ -1584,7 +1628,7 @@ public class LibMatrixCUDA {
 				int nnz = (int)A.nnz;
 				CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
 				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-				cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
+				cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
 			} else {
 				// General case (cusparse does not support accept the transpose operator for dgeam)
 				// TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a tranpose
@@ -1604,7 +1648,7 @@ public class LibMatrixCUDA {
 				//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
 				if (GPUStatistics.DISPLAY_STATISTICS)
 					t0 = System.nanoTime();
-				JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
+				cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
 						B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
 				//cudaDeviceSynchronize;
 				if (GPUStatistics.DISPLAY_STATISTICS)
@@ -1635,7 +1679,7 @@ public class LibMatrixCUDA {
 			Pointer C = getDensePointer(gCtx, out, instName);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			JCublas2.cublasDgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
+			cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
 		}
 	}
@@ -1673,7 +1717,7 @@ public class LibMatrixCUDA {
 	//******************* End of Re-org Functions ************************/
 	//********************************************************************/
 
-	static int toInt(long num) throws DMLRuntimeException {
+	public static int toInt(long num) throws DMLRuntimeException {
 		if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
 			throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
 		}
@@ -1751,8 +1795,8 @@ public class LibMatrixCUDA {
 		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		long retClen = cu - cl + 1;
 		if (inClen == retClen) {
-			cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * Sizeof.DOUBLE), (ru - rl + 1) * inClen
-					* Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+			cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen
+					* sizeOfDataType, cudaMemcpyDeviceToDevice);
 		} else {
 			long retRlen = ru - rl + 1;
 			getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)),
@@ -2255,17 +2299,17 @@ public class LibMatrixCUDA {
 
 			// Matrix-Matrix daxpy
 			long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
-			Pointer alphaPtr = pointerTo(constant);
+			Pointer alphaPtr = dataTypePointerTo(constant);
 			// C <- A + alpha*B
 			// becomes
 			// C <- A
 			// C <- alpha*B + C
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			cudaMemcpy(C, A, n*((long)jcuda.Sizeof.DOUBLE), cudaMemcpyDeviceToDevice);
+			cudaMemcpy(C, A, n*((long)sizeOfDataType), cudaMemcpyDeviceToDevice);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			JCublas2.cublasDaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1);
+			cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
 		}
 		else {
@@ -2353,15 +2397,15 @@ public class LibMatrixCUDA {
 		// step 3: query working space of geqrf and ormqr
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		int[] lwork = {0};
-		JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
+		cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
 
 		// step 4: compute QR factorization
-		Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
-		Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
+		Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
+		Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
 		Pointer devInfo = gCtx.allocate(Sizeof.INT);
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
+		cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
 
 		int[] qrError = {-1};
@@ -2372,7 +2416,7 @@ public class LibMatrixCUDA {
 
 		// step 5: compute Q^T*B
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
+		cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
 		cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
 		if (qrError[0] != 0) {
@@ -2381,9 +2425,9 @@ public class LibMatrixCUDA {
 
 		// step 6: compute x = R \ Q^T*B
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
+		cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(),
 			cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
-			n, 1, pointerTo(1.0), A, m, b, m);
+			n, 1, dataTypePointerTo(1.0), A, m, b, m);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
@@ -2393,7 +2437,7 @@ public class LibMatrixCUDA {
 		// TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
 		// There is an avoidable copy happening here
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
 
 		gCtx.cudaFreeHelper(instName, work);
 		gCtx.cudaFreeHelper(instName, tau);
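
Host-side scalar arguments (alpha/beta) must also match the active precision, which is why pointerTo(double) was generalized to dataTypePointerTo and why one()/zero() re-materialize their cached pointers when the data type size changes. An illustrative fragment of the resulting call pattern (values and variables are placeholders, assumed to be in scope):

    // dataTypePointerTo materializes the scalar as double[]{2.0} or float[]{2.0f}
    // depending on sizeOfDataType, so it always matches the matrices passed to the library call.
    Pointer alphaPtr = dataTypePointerTo(2.0);
    Pointer betaPtr = dataTypePointerTo(0.0);
    cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_N, CUBLAS_OP_N,
    		m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);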

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index bb74aa2..7fd766c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -30,13 +30,11 @@ import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
 import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.CudaException;
 import jcuda.Pointer;
-import jcuda.Sizeof;
 import jcuda.jcudnn.cudnnActivationDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnHandle;
@@ -131,7 +129,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 		
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2d
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -155,7 +153,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
 						for(int n = 0; n < N; n++) {
 							// Perform one-input all-channel conv2d
-							cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*Sizeof.DOUBLE), algo);
+							cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*sizeOfDataType), algo);
 						}
 					}
 				}
@@ -180,7 +178,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4) throws DMLRuntimeException {
 		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
-				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
 				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "]. Output dimension:  [" + dim3 + "," + dim4 + "].");
 	}
 
@@ -197,7 +195,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4, long dim5, long dim6) throws DMLRuntimeException {
 		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
-				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
 				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "], [" + dim3 + "," + dim4 + "]. Output dimension: [" + dim5 + "," + dim6 + "]");
 	}
 
@@ -270,7 +268,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 		
 		
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
 			double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
@@ -292,10 +290,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 						LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
 						// Perform one-input conv2dBackwardFilter
-						Pointer tempdwPointer = gCtx.allocate(KCRS*Sizeof.DOUBLE);
+						Pointer tempdwPointer = gCtx.allocate(KCRS*sizeOfDataType);
 						for(int n = 0; n < N; n++) {
 							long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-							cudaMemset(tempdwPointer, 0, KCRS*Sizeof.DOUBLE);
+							cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
 							if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
 							// Perform one-input conv2dBackwardFilter
 							cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, algo);
@@ -376,7 +374,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
@@ -398,7 +396,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				else {
 					try(LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
 						for(int n = 0; n < N; n++) {
-							cudnnConv2dBackwardData(gCtx, instName, doutFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*CHW*Sizeof.DOUBLE), algo);
+							cudnnConv2dBackwardData(gCtx, instName, doutFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*CHW*sizeOfDataType), algo);
 						}
 					}
 				}
@@ -468,7 +466,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
@@ -479,7 +477,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			else {
 				LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 				for(int n = 0; n < N; n++) {
-					cudnnMaxpooling(gCtx, instName, imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+					cudnnMaxpooling(gCtx, instName, imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*sizeOfDataType), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				imgFetcher.close();
 			}
@@ -545,7 +543,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
@@ -560,7 +558,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
 				for(int n = 0; n < N; n++) {
 					cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
-							dx.withByteOffset(n*CHW*Sizeof.DOUBLE), 
+							dx.withByteOffset(n*CHW*sizeOfDataType), 
 							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				// Deallocate temporary array to hold one element of input
@@ -591,7 +589,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			
 			// Calling PoolForward first, y is one of the inputs for poolBackward
 			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-			long numBytes = N*C*P*Q*Sizeof.DOUBLE;
+			long numBytes = N*C*P*Q*sizeOfDataType;
 			y = gCtx.allocate(numBytes);
 			
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
@@ -668,7 +666,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
 		long t0=0;
-		if(N*CHW >= maxNumDoublesOfCuDNNTensor) {
+		if(N*CHW >= maxNumElementsOfCuDNNTensor) {
 			if(LOG.isTraceEnabled()) {
 				LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
 			}
@@ -684,7 +682,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		else {
 			cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 			cudnnCreateTensorDescriptor(tensorDescriptor);
-			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, toInt(N), 1, 1, toInt(CHW));
+			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
 			cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 		}
@@ -701,7 +699,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) throws DMLRuntimeException {
 		long numElems = image.getNumRows()*image.getNumColumns();
-		if(numElems > maxNumDoublesOfCuDNNTensor) {
+		if(numElems > maxNumElementsOfCuDNNTensor) {
 			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
 		}
 		return getDensePointer(gCtx, image, instName);
@@ -717,4 +715,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index f49433d..ee22541 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -40,7 +40,6 @@ import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 
 /**
@@ -255,14 +254,14 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
 		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 		cudnnCreateTensorDescriptor(tensorDescriptor);
-		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
 		return tensorDescriptor;
 	}
 	
 	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
 		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
 		cudnnCreateFilterDescriptor(filterDesc);
-		cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, R, S);
+		cudnnSetFilter4dDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, K, C, R, S);
 		return filterDesc;
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index 581607e..5121c87 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -20,8 +20,6 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.Pointer;
-import jcuda.Sizeof;
-
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -32,7 +30,7 @@ import org.apache.sysml.utils.GPUStatistics;
 /**
  * Performs a slice operation: out = in[(n+1):(n+1), 1:numColumns]
  */
-public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
+public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java.lang.AutoCloseable {
 	GPUContext gCtx; String instName; int numColumns; boolean isInputInSparseFormat; 
 	Object inPointer; // can be either CSRPointer or Pointer 
 	Pointer outPointer;
@@ -50,7 +48,7 @@ public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
 		numColumns = LibMatrixCUDA.toInt(image.getNumColumns());
 		isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, image);
 		inPointer = isInputInSparseFormat ? LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
-		outPointer = gCtx.allocate(numColumns*Sizeof.DOUBLE);
+		outPointer = gCtx.allocate(numColumns*sizeOfDataType);
 	}
 	/**
 	 * Copy the nth row and return the dense pointer
@@ -62,7 +60,7 @@ public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
 		if(isInputInSparseFormat) {
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			cudaMemset(outPointer, 0, numColumns*Sizeof.DOUBLE);
+			cudaMemset(outPointer, 0, numColumns*sizeOfDataType);
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 			if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
 			LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns);

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
index f817bd5..d4b213f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
@@ -24,7 +24,6 @@ import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
@@ -141,7 +140,7 @@ public class LibMatrixCuDNNPoolingDescriptors implements java.lang.AutoCloseable
 	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
 		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 		cudnnCreateTensorDescriptor(tensorDescriptor);
-		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
 		return tensorDescriptor;
 	}
 	


[3/4] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 73b057e..d382fc5 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-	// .globl	slice_sparse_dense_row
+	// .globl	double2float_f
 .func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
 (
 	.param .b64 __internal_trig_reduction_slowpathd_param_0,
@@ -23,20 +23,97 @@
 	.param .b64 __internal_accurate_pow_param_1
 )
 ;
-.extern .shared .align 8 .b8 sdata[];
+.extern .shared .align 1 .b8 my_sdata[];
+.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
-.visible .entry slice_sparse_dense_row(
-	.param .u64 slice_sparse_dense_row_param_0,
-	.param .u64 slice_sparse_dense_row_param_1,
-	.param .u64 slice_sparse_dense_row_param_2,
-	.param .u64 slice_sparse_dense_row_param_3,
-	.param .u32 slice_sparse_dense_row_param_4,
-	.param .u32 slice_sparse_dense_row_param_5,
-	.param .u32 slice_sparse_dense_row_param_6,
-	.param .u32 slice_sparse_dense_row_param_7,
-	.param .u32 slice_sparse_dense_row_param_8
+.visible .entry double2float_f(
+	.param .u64 double2float_f_param_0,
+	.param .u64 double2float_f_param_1,
+	.param .u32 double2float_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [double2float_f_param_0];
+	ld.param.u64 	%rd2, [double2float_f_param_1];
+	ld.param.u32 	%r2, [double2float_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rn.f32.f64	%f1, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB0_2:
+	ret;
+}
+
+	// .globl	float2double_f
+.visible .entry float2double_f(
+	.param .u64 float2double_f_param_0,
+	.param .u64 float2double_f_param_1,
+	.param .u32 float2double_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [float2double_f_param_0];
+	ld.param.u64 	%rd2, [float2double_f_param_1];
+	ld.param.u32 	%r2, [float2double_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB1_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd1;
+
+BB1_2:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_d
+.visible .entry slice_sparse_dense_row_d(
+	.param .u64 slice_sparse_dense_row_d_param_0,
+	.param .u64 slice_sparse_dense_row_d_param_1,
+	.param .u64 slice_sparse_dense_row_d_param_2,
+	.param .u64 slice_sparse_dense_row_d_param_3,
+	.param .u32 slice_sparse_dense_row_d_param_4,
+	.param .u32 slice_sparse_dense_row_d_param_5,
+	.param .u32 slice_sparse_dense_row_d_param_6,
+	.param .u32 slice_sparse_dense_row_d_param_7,
+	.param .u32 slice_sparse_dense_row_d_param_8
 )
 {
 	.reg .pred 	%p<7>;
@@ -45,22 +122,22 @@
 	.reg .b64 	%rd<23>;
 
 
-	ld.param.u64 	%rd9, [slice_sparse_dense_row_param_0];
-	ld.param.u64 	%rd10, [slice_sparse_dense_row_param_1];
-	ld.param.u64 	%rd11, [slice_sparse_dense_row_param_2];
-	ld.param.u64 	%rd12, [slice_sparse_dense_row_param_3];
-	ld.param.u32 	%r15, [slice_sparse_dense_row_param_4];
-	ld.param.u32 	%r16, [slice_sparse_dense_row_param_5];
-	ld.param.u32 	%r12, [slice_sparse_dense_row_param_6];
-	ld.param.u32 	%r13, [slice_sparse_dense_row_param_7];
-	ld.param.u32 	%r14, [slice_sparse_dense_row_param_8];
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_d_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_d_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_d_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_d_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_d_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_d_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_d_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_d_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_d_param_8];
 	mov.u32 	%r17, %ntid.x;
 	mov.u32 	%r18, %ctaid.x;
 	mov.u32 	%r19, %tid.x;
 	mad.lo.s32 	%r1, %r17, %r18, %r19;
 	add.s32 	%r2, %r1, %r15;
 	setp.gt.s32	%p1, %r2, %r16;
-	@%p1 bra 	BB0_6;
+	@%p1 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd13, %rd10;
 	mul.wide.s32 	%rd14, %r2, 4;
@@ -68,7 +145,7 @@
 	ld.global.u32 	%r23, [%rd1];
 	ld.global.u32 	%r22, [%rd1+4];
 	setp.ge.s32	%p2, %r23, %r22;
-	@%p2 bra 	BB0_6;
+	@%p2 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
 	cvta.to.global.u64 	%rd15, %rd9;
@@ -80,12 +157,12 @@
 	mul.wide.s32 	%rd18, %r23, 4;
 	add.s64 	%rd21, %rd16, %rd18;
 
-BB0_3:
+BB2_3:
 	ld.global.u32 	%r8, [%rd21];
 	setp.lt.s32	%p3, %r8, %r12;
 	setp.gt.s32	%p4, %r8, %r13;
 	or.pred  	%p5, %p3, %p4;
-	@%p5 bra 	BB0_5;
+	@%p5 bra 	BB2_5;
 
 	ld.global.f64 	%fd1, [%rd22];
 	add.s32 	%r21, %r5, %r8;
@@ -94,28 +171,106 @@ BB0_3:
 	st.global.f64 	[%rd20], %fd1;
 	ld.global.u32 	%r22, [%rd1+4];
 
-BB0_5:
+BB2_5:
 	add.s64 	%rd22, %rd22, 8;
 	add.s64 	%rd21, %rd21, 4;
 	add.s32 	%r23, %r23, 1;
 	setp.lt.s32	%p6, %r23, %r22;
-	@%p6 bra 	BB0_3;
+	@%p6 bra 	BB2_3;
+
+BB2_6:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_f
+.visible .entry slice_sparse_dense_row_f(
+	.param .u64 slice_sparse_dense_row_f_param_0,
+	.param .u64 slice_sparse_dense_row_f_param_1,
+	.param .u64 slice_sparse_dense_row_f_param_2,
+	.param .u64 slice_sparse_dense_row_f_param_3,
+	.param .u32 slice_sparse_dense_row_f_param_4,
+	.param .u32 slice_sparse_dense_row_f_param_5,
+	.param .u32 slice_sparse_dense_row_f_param_6,
+	.param .u32 slice_sparse_dense_row_f_param_7,
+	.param .u32 slice_sparse_dense_row_f_param_8
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_f_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_f_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_f_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_f_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_f_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_f_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_f_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_f_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_f_param_8];
+	mov.u32 	%r17, %ntid.x;
+	mov.u32 	%r18, %ctaid.x;
+	mov.u32 	%r19, %tid.x;
+	mad.lo.s32 	%r1, %r17, %r18, %r19;
+	add.s32 	%r2, %r1, %r15;
+	setp.gt.s32	%p1, %r2, %r16;
+	@%p1 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd13, %rd10;
+	mul.wide.s32 	%rd14, %r2, 4;
+	add.s64 	%rd1, %rd13, %rd14;
+	ld.global.u32 	%r23, [%rd1];
+	ld.global.u32 	%r22, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r22;
+	@%p2 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd2, %rd12;
+	cvta.to.global.u64 	%rd15, %rd9;
+	cvta.to.global.u64 	%rd16, %rd11;
+	mul.lo.s32 	%r20, %r1, %r14;
+	sub.s32 	%r5, %r20, %r12;
+	mul.wide.s32 	%rd17, %r23, 4;
+	add.s64 	%rd21, %rd15, %rd17;
+	add.s64 	%rd20, %rd16, %rd17;
+
+BB3_3:
+	ld.global.u32 	%r8, [%rd20];
+	setp.lt.s32	%p3, %r8, %r12;
+	setp.gt.s32	%p4, %r8, %r13;
+	or.pred  	%p5, %p3, %p4;
+	@%p5 bra 	BB3_5;
+
+	ld.global.f32 	%f1, [%rd21];
+	add.s32 	%r21, %r5, %r8;
+	mul.wide.s32 	%rd18, %r21, 4;
+	add.s64 	%rd19, %rd2, %rd18;
+	st.global.f32 	[%rd19], %f1;
+	ld.global.u32 	%r22, [%rd1+4];
+
+BB3_5:
+	add.s64 	%rd21, %rd21, 4;
+	add.s64 	%rd20, %rd20, 4;
+	add.s32 	%r23, %r23, 1;
+	setp.lt.s32	%p6, %r23, %r22;
+	@%p6 bra 	BB3_3;
 
-BB0_6:
+BB3_6:
 	ret;
 }
 
-	// .globl	slice_sparse_dense_nnz
-.visible .entry slice_sparse_dense_nnz(
-	.param .u64 slice_sparse_dense_nnz_param_0,
-	.param .u64 slice_sparse_dense_nnz_param_1,
-	.param .u64 slice_sparse_dense_nnz_param_2,
-	.param .u64 slice_sparse_dense_nnz_param_3,
-	.param .u32 slice_sparse_dense_nnz_param_4,
-	.param .u32 slice_sparse_dense_nnz_param_5,
-	.param .u32 slice_sparse_dense_nnz_param_6,
-	.param .u32 slice_sparse_dense_nnz_param_7,
-	.param .u32 slice_sparse_dense_nnz_param_8
+	// .globl	slice_sparse_dense_nnz_d
+.visible .entry slice_sparse_dense_nnz_d(
+	.param .u64 slice_sparse_dense_nnz_d_param_0,
+	.param .u64 slice_sparse_dense_nnz_d_param_1,
+	.param .u64 slice_sparse_dense_nnz_d_param_2,
+	.param .u64 slice_sparse_dense_nnz_d_param_3,
+	.param .u32 slice_sparse_dense_nnz_d_param_4,
+	.param .u32 slice_sparse_dense_nnz_d_param_5,
+	.param .u32 slice_sparse_dense_nnz_d_param_6,
+	.param .u32 slice_sparse_dense_nnz_d_param_7,
+	.param .u32 slice_sparse_dense_nnz_d_param_8
 )
 {
 	.reg .pred 	%p<6>;
@@ -124,15 +279,15 @@ BB0_6:
 	.reg .b64 	%rd<22>;
 
 
-	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_param_0];
-	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_param_1];
-	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_param_2];
-	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_param_3];
-	ld.param.u32 	%r5, [slice_sparse_dense_nnz_param_4];
-	ld.param.u32 	%r9, [slice_sparse_dense_nnz_param_5];
-	ld.param.u32 	%r6, [slice_sparse_dense_nnz_param_6];
-	ld.param.u32 	%r7, [slice_sparse_dense_nnz_param_7];
-	ld.param.u32 	%r8, [slice_sparse_dense_nnz_param_8];
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_d_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_d_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_d_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_d_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_d_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_d_param_8];
 	mov.u32 	%r10, %ntid.x;
 	mov.u32 	%r11, %ctaid.x;
 	mov.u32 	%r12, %tid.x;
@@ -146,7 +301,7 @@ BB0_6:
 	add.s64 	%rd12, %rd1, %rd11;
 	ld.global.u32 	%r15, [%rd12+4];
 	setp.ge.s32	%p1, %r1, %r15;
-	@%p1 bra 	BB1_5;
+	@%p1 bra 	BB4_5;
 
 	cvta.to.global.u64 	%rd2, %rd7;
 	cvta.to.global.u64 	%rd3, %rd5;
@@ -158,11 +313,11 @@ BB0_6:
 	setp.lt.s32	%p2, %r2, %r6;
 	setp.gt.s32	%p3, %r2, %r7;
 	or.pred  	%p4, %p2, %p3;
-	@%p4 bra 	BB1_5;
+	@%p4 bra 	BB4_5;
 
 	mov.u32 	%r21, %r5;
 
-BB1_3:
+BB4_3:
 	mov.u32 	%r3, %r21;
 	add.s32 	%r4, %r3, 1;
 	mul.wide.s32 	%rd16, %r4, 4;
@@ -170,7 +325,7 @@ BB1_3:
 	ld.global.u32 	%r16, [%rd17];
 	setp.le.s32	%p5, %r16, %r1;
 	mov.u32 	%r21, %r4;
-	@%p5 bra 	BB1_3;
+	@%p5 bra 	BB4_3;
 
 	shl.b64 	%rd18, %rd4, 3;
 	add.s64 	%rd19, %rd3, %rd18;
@@ -183,21 +338,103 @@ BB1_3:
 	add.s64 	%rd21, %rd2, %rd20;
 	st.global.f64 	[%rd21], %fd1;
 
-BB1_5:
+BB4_5:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_nnz_f
+.visible .entry slice_sparse_dense_nnz_f(
+	.param .u64 slice_sparse_dense_nnz_f_param_0,
+	.param .u64 slice_sparse_dense_nnz_f_param_1,
+	.param .u64 slice_sparse_dense_nnz_f_param_2,
+	.param .u64 slice_sparse_dense_nnz_f_param_3,
+	.param .u32 slice_sparse_dense_nnz_f_param_4,
+	.param .u32 slice_sparse_dense_nnz_f_param_5,
+	.param .u32 slice_sparse_dense_nnz_f_param_6,
+	.param .u32 slice_sparse_dense_nnz_f_param_7,
+	.param .u32 slice_sparse_dense_nnz_f_param_8
+)
+{
+	.reg .pred 	%p<6>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<22>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_f_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_f_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_f_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_f_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_f_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_f_param_8];
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %ctaid.x;
+	mov.u32 	%r12, %tid.x;
+	mad.lo.s32 	%r13, %r10, %r11, %r12;
+	cvta.to.global.u64 	%rd1, %rd8;
+	mul.wide.s32 	%rd9, %r5, 4;
+	add.s64 	%rd10, %rd1, %rd9;
+	ld.global.u32 	%r14, [%rd10];
+	add.s32 	%r1, %r13, %r14;
+	mul.wide.s32 	%rd11, %r9, 4;
+	add.s64 	%rd12, %rd1, %rd11;
+	ld.global.u32 	%r15, [%rd12+4];
+	setp.ge.s32	%p1, %r1, %r15;
+	@%p1 bra 	BB5_5;
+
+	cvta.to.global.u64 	%rd2, %rd7;
+	cvta.to.global.u64 	%rd3, %rd5;
+	cvta.to.global.u64 	%rd13, %rd6;
+	cvt.s64.s32	%rd4, %r1;
+	mul.wide.s32 	%rd14, %r1, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	ld.global.u32 	%r2, [%rd15];
+	setp.lt.s32	%p2, %r2, %r6;
+	setp.gt.s32	%p3, %r2, %r7;
+	or.pred  	%p4, %p2, %p3;
+	@%p4 bra 	BB5_5;
+
+	mov.u32 	%r21, %r5;
+
+BB5_3:
+	mov.u32 	%r3, %r21;
+	add.s32 	%r4, %r3, 1;
+	mul.wide.s32 	%rd16, %r4, 4;
+	add.s64 	%rd17, %rd1, %rd16;
+	ld.global.u32 	%r16, [%rd17];
+	setp.le.s32	%p5, %r16, %r1;
+	mov.u32 	%r21, %r4;
+	@%p5 bra 	BB5_3;
+
+	shl.b64 	%rd18, %rd4, 2;
+	add.s64 	%rd19, %rd3, %rd18;
+	ld.global.f32 	%f1, [%rd19];
+	sub.s32 	%r17, %r3, %r5;
+	mul.lo.s32 	%r18, %r17, %r8;
+	sub.s32 	%r19, %r18, %r6;
+	add.s32 	%r20, %r19, %r2;
+	mul.wide.s32 	%rd20, %r20, 4;
+	add.s64 	%rd21, %rd2, %rd20;
+	st.global.f32 	[%rd21], %f1;
+
+BB5_5:
 	ret;
 }
 
-	// .globl	slice_dense_dense
-.visible .entry slice_dense_dense(
-	.param .u64 slice_dense_dense_param_0,
-	.param .u64 slice_dense_dense_param_1,
-	.param .u32 slice_dense_dense_param_2,
-	.param .u32 slice_dense_dense_param_3,
-	.param .u32 slice_dense_dense_param_4,
-	.param .u32 slice_dense_dense_param_5,
-	.param .u32 slice_dense_dense_param_6,
-	.param .u32 slice_dense_dense_param_7,
-	.param .u32 slice_dense_dense_param_8
+	// .globl	slice_dense_dense_d
+.visible .entry slice_dense_dense_d(
+	.param .u64 slice_dense_dense_d_param_0,
+	.param .u64 slice_dense_dense_d_param_1,
+	.param .u32 slice_dense_dense_d_param_2,
+	.param .u32 slice_dense_dense_d_param_3,
+	.param .u32 slice_dense_dense_d_param_4,
+	.param .u32 slice_dense_dense_d_param_5,
+	.param .u32 slice_dense_dense_d_param_6,
+	.param .u32 slice_dense_dense_d_param_7,
+	.param .u32 slice_dense_dense_d_param_8
 )
 {
 	.reg .pred 	%p<4>;
@@ -206,13 +443,13 @@ BB1_5:
 	.reg .b64 	%rd<9>;
 
 
-	ld.param.u64 	%rd1, [slice_dense_dense_param_0];
-	ld.param.u64 	%rd2, [slice_dense_dense_param_1];
-	ld.param.u32 	%r3, [slice_dense_dense_param_2];
-	ld.param.u32 	%r4, [slice_dense_dense_param_4];
-	ld.param.u32 	%r5, [slice_dense_dense_param_6];
-	ld.param.u32 	%r7, [slice_dense_dense_param_7];
-	ld.param.u32 	%r6, [slice_dense_dense_param_8];
+	ld.param.u64 	%rd1, [slice_dense_dense_d_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_d_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_d_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_d_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_d_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_d_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_d_param_8];
 	mov.u32 	%r8, %ctaid.x;
 	mov.u32 	%r9, %ntid.x;
 	mov.u32 	%r10, %tid.x;
@@ -221,10 +458,10 @@ BB1_5:
 	setp.lt.s32	%p1, %r2, %r7;
 	setp.gt.s32	%p2, %r6, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB2_2;
-	bra.uni 	BB2_1;
+	@!%p3 bra 	BB6_2;
+	bra.uni 	BB6_1;
 
-BB2_1:
+BB6_1:
 	rem.s32 	%r11, %r1, %r6;
 	cvta.to.global.u64 	%rd3, %rd1;
 	add.s32 	%r12, %r2, %r3;
@@ -238,15 +475,70 @@ BB2_1:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd1;
 
-BB2_2:
+BB6_2:
+	ret;
+}
+
+	// .globl	slice_dense_dense_f
+.visible .entry slice_dense_dense_f(
+	.param .u64 slice_dense_dense_f_param_0,
+	.param .u64 slice_dense_dense_f_param_1,
+	.param .u32 slice_dense_dense_f_param_2,
+	.param .u32 slice_dense_dense_f_param_3,
+	.param .u32 slice_dense_dense_f_param_4,
+	.param .u32 slice_dense_dense_f_param_5,
+	.param .u32 slice_dense_dense_f_param_6,
+	.param .u32 slice_dense_dense_f_param_7,
+	.param .u32 slice_dense_dense_f_param_8
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<15>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [slice_dense_dense_f_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_f_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_f_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_f_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_f_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_f_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_f_param_8];
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r8, %r10;
+	div.s32 	%r2, %r1, %r6;
+	setp.lt.s32	%p1, %r2, %r7;
+	setp.gt.s32	%p2, %r6, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB7_2;
+	bra.uni 	BB7_1;
+
+BB7_1:
+	rem.s32 	%r11, %r1, %r6;
+	cvta.to.global.u64 	%rd3, %rd1;
+	add.s32 	%r12, %r2, %r3;
+	add.s32 	%r13, %r11, %r4;
+	mad.lo.s32 	%r14, %r12, %r5, %r13;
+	mul.wide.s32 	%rd4, %r14, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB7_2:
 	ret;
 }
 
-	// .globl	copy_u2l_dense
-.visible .entry copy_u2l_dense(
-	.param .u64 copy_u2l_dense_param_0,
-	.param .u32 copy_u2l_dense_param_1,
-	.param .u32 copy_u2l_dense_param_2
+	// .globl	copy_u2l_dense_d
+.visible .entry copy_u2l_dense_d(
+	.param .u64 copy_u2l_dense_d_param_0,
+	.param .u32 copy_u2l_dense_d_param_1,
+	.param .u32 copy_u2l_dense_d_param_2
 )
 {
 	.reg .pred 	%p<4>;
@@ -255,9 +547,9 @@ BB2_2:
 	.reg .b64 	%rd<7>;
 
 
-	ld.param.u64 	%rd1, [copy_u2l_dense_param_0];
-	ld.param.u32 	%r3, [copy_u2l_dense_param_1];
-	ld.param.u32 	%r4, [copy_u2l_dense_param_2];
+	ld.param.u64 	%rd1, [copy_u2l_dense_d_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_d_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_d_param_2];
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %ctaid.x;
 	mov.u32 	%r7, %tid.x;
@@ -268,10 +560,10 @@ BB2_2:
 	setp.gt.s32	%p1, %r9, %r8;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB3_2;
-	bra.uni 	BB3_1;
+	@!%p3 bra 	BB8_2;
+	bra.uni 	BB8_1;
 
-BB3_1:
+BB8_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
@@ -280,16 +572,58 @@ BB3_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB3_2:
+BB8_2:
+	ret;
+}
+
+	// .globl	copy_u2l_dense_f
+.visible .entry copy_u2l_dense_f(
+	.param .u64 copy_u2l_dense_f_param_0,
+	.param .u32 copy_u2l_dense_f_param_1,
+	.param .u32 copy_u2l_dense_f_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<10>;
+	.reg .b64 	%rd<7>;
+
+
+	ld.param.u64 	%rd1, [copy_u2l_dense_f_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_f_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_f_param_2];
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %ctaid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r6, %r7;
+	div.s32 	%r8, %r1, %r3;
+	rem.s32 	%r9, %r1, %r3;
+	mad.lo.s32 	%r2, %r9, %r3, %r8;
+	setp.gt.s32	%p1, %r9, %r8;
+	setp.lt.s32	%p2, %r2, %r4;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB9_2;
+	bra.uni 	BB9_1;
+
+BB9_1:
+	cvta.to.global.u64 	%rd2, %rd1;
+	mul.wide.s32 	%rd3, %r1, 4;
+	add.s64 	%rd4, %rd2, %rd3;
+	ld.global.f32 	%f1, [%rd4];
+	mul.wide.s32 	%rd5, %r2, 4;
+	add.s64 	%rd6, %rd2, %rd5;
+	st.global.f32 	[%rd6], %f1;
+
+BB9_2:
 	ret;
 }
 
-	// .globl	relu
-.visible .entry relu(
-	.param .u64 relu_param_0,
-	.param .u64 relu_param_1,
-	.param .u32 relu_param_2,
-	.param .u32 relu_param_3
+	// .globl	relu_d
+.visible .entry relu_d(
+	.param .u64 relu_d_param_0,
+	.param .u64 relu_d_param_1,
+	.param .u32 relu_d_param_2,
+	.param .u32 relu_d_param_3
 )
 {
 	.reg .pred 	%p<4>;
@@ -298,10 +632,10 @@ BB3_2:
 	.reg .b64 	%rd<8>;
 
 
-	ld.param.u64 	%rd1, [relu_param_0];
-	ld.param.u64 	%rd2, [relu_param_1];
-	ld.param.u32 	%r2, [relu_param_2];
-	ld.param.u32 	%r3, [relu_param_3];
+	ld.param.u64 	%rd1, [relu_d_param_0];
+	ld.param.u64 	%rd2, [relu_d_param_1];
+	ld.param.u32 	%r2, [relu_d_param_2];
+	ld.param.u32 	%r3, [relu_d_param_3];
 	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
@@ -310,10 +644,10 @@ BB3_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB4_2;
-	bra.uni 	BB4_1;
+	@!%p3 bra 	BB10_2;
+	bra.uni 	BB10_1;
 
-BB4_1:
+BB10_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -324,17 +658,64 @@ BB4_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB4_2:
+BB10_2:
+	ret;
+}
+
+	// .globl	relu_f
+.visible .entry relu_f(
+	.param .u64 relu_f_param_0,
+	.param .u64 relu_f_param_1,
+	.param .u32 relu_f_param_2,
+	.param .u32 relu_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [relu_f_param_0];
+	ld.param.u64 	%rd2, [relu_f_param_1];
+	ld.param.u32 	%r2, [relu_f_param_2];
+	ld.param.u32 	%r3, [relu_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB11_2;
+	bra.uni 	BB11_1;
+
+BB11_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	mov.f64 	%fd2, 0d0000000000000000;
+	max.f64 	%fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f2, %fd3;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f32 	[%rd7], %f2;
+
+BB11_2:
 	ret;
 }
 
-	// .globl	relu_backward
-.visible .entry relu_backward(
-	.param .u64 relu_backward_param_0,
-	.param .u64 relu_backward_param_1,
-	.param .u64 relu_backward_param_2,
-	.param .u32 relu_backward_param_3,
-	.param .u32 relu_backward_param_4
+	// .globl	relu_backward_d
+.visible .entry relu_backward_d(
+	.param .u64 relu_backward_d_param_0,
+	.param .u64 relu_backward_d_param_1,
+	.param .u64 relu_backward_d_param_2,
+	.param .u32 relu_backward_d_param_3,
+	.param .u32 relu_backward_d_param_4
 )
 {
 	.reg .pred 	%p<5>;
@@ -343,11 +724,11 @@ BB4_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd2, [relu_backward_param_0];
-	ld.param.u64 	%rd3, [relu_backward_param_1];
-	ld.param.u64 	%rd4, [relu_backward_param_2];
-	ld.param.u32 	%r2, [relu_backward_param_3];
-	ld.param.u32 	%r3, [relu_backward_param_4];
+	ld.param.u64 	%rd2, [relu_backward_d_param_0];
+	ld.param.u64 	%rd3, [relu_backward_d_param_1];
+	ld.param.u64 	%rd4, [relu_backward_d_param_2];
+	ld.param.u32 	%r2, [relu_backward_d_param_3];
+	ld.param.u32 	%r3, [relu_backward_d_param_4];
 	mov.u32 	%r4, %ntid.x;
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %tid.x;
@@ -356,10 +737,10 @@ BB4_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB5_4;
-	bra.uni 	BB5_1;
+	@!%p3 bra 	BB12_4;
+	bra.uni 	BB12_1;
 
-BB5_1:
+BB12_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	cvt.s64.s32	%rd1, %r1;
 	mul.wide.s32 	%rd6, %r1, 8;
@@ -367,42 +748,98 @@ BB5_1:
 	ld.global.f64 	%fd4, [%rd7];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB5_3;
+	@%p4 bra 	BB12_3;
 
 	cvta.to.global.u64 	%rd8, %rd3;
 	shl.b64 	%rd9, %rd1, 3;
 	add.s64 	%rd10, %rd8, %rd9;
 	ld.global.f64 	%fd5, [%rd10];
 
-BB5_3:
+BB12_3:
 	cvta.to.global.u64 	%rd11, %rd4;
 	shl.b64 	%rd12, %rd1, 3;
 	add.s64 	%rd13, %rd11, %rd12;
 	st.global.f64 	[%rd13], %fd5;
 
-BB5_4:
+BB12_4:
 	ret;
 }
 
-	// .globl	inplace_add
-.visible .entry inplace_add(
-	.param .u64 inplace_add_param_0,
-	.param .u64 inplace_add_param_1,
-	.param .u32 inplace_add_param_2,
-	.param .u32 inplace_add_param_3
+	// .globl	relu_backward_f
+.visible .entry relu_backward_f(
+	.param .u64 relu_backward_f_param_0,
+	.param .u64 relu_backward_f_param_1,
+	.param .u64 relu_backward_f_param_2,
+	.param .u32 relu_backward_f_param_3,
+	.param .u32 relu_backward_f_param_4
 )
 {
-	.reg .pred 	%p<4>;
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
 	.reg .b32 	%r<8>;
-	.reg .f64 	%fd<4>;
-	.reg .b64 	%rd<8>;
+	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd1, [inplace_add_param_0];
-	ld.param.u64 	%rd2, [inplace_add_param_1];
-	ld.param.u32 	%r2, [inplace_add_param_2];
-	ld.param.u32 	%r3, [inplace_add_param_3];
-	mov.u32 	%r4, %ctaid.x;
+	ld.param.u64 	%rd2, [relu_backward_f_param_0];
+	ld.param.u64 	%rd3, [relu_backward_f_param_1];
+	ld.param.u64 	%rd4, [relu_backward_f_param_2];
+	ld.param.u32 	%r2, [relu_backward_f_param_3];
+	ld.param.u32 	%r3, [relu_backward_f_param_4];
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r5, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB13_4;
+	bra.uni 	BB13_1;
+
+BB13_1:
+	cvta.to.global.u64 	%rd5, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd6, %r1, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f4, [%rd7];
+	mov.f32 	%f5, 0f00000000;
+	setp.leu.f32	%p4, %f4, 0f00000000;
+	@%p4 bra 	BB13_3;
+
+	cvta.to.global.u64 	%rd8, %rd3;
+	shl.b64 	%rd9, %rd1, 2;
+	add.s64 	%rd10, %rd8, %rd9;
+	ld.global.f32 	%f5, [%rd10];
+
+BB13_3:
+	cvta.to.global.u64 	%rd11, %rd4;
+	shl.b64 	%rd12, %rd1, 2;
+	add.s64 	%rd13, %rd11, %rd12;
+	st.global.f32 	[%rd13], %f5;
+
+BB13_4:
+	ret;
+}
+
+	// .globl	inplace_add_d
+.visible .entry inplace_add_d(
+	.param .u64 inplace_add_d_param_0,
+	.param .u64 inplace_add_d_param_1,
+	.param .u32 inplace_add_d_param_2,
+	.param .u32 inplace_add_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_d_param_0];
+	ld.param.u64 	%rd2, [inplace_add_d_param_1];
+	ld.param.u32 	%r2, [inplace_add_d_param_2];
+	ld.param.u32 	%r3, [inplace_add_d_param_3];
+	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
 	mad.lo.s32 	%r1, %r5, %r4, %r6;
@@ -410,10 +847,10 @@ BB5_4:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB6_2;
-	bra.uni 	BB6_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB6_1:
+BB14_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -424,18 +861,62 @@ BB6_1:
 	add.f64 	%fd3, %fd2, %fd1;
 	st.global.f64 	[%rd7], %fd3;
 
-BB6_2:
+BB14_2:
+	ret;
+}
+
+	// .globl	inplace_add_f
+.visible .entry inplace_add_f(
+	.param .u64 inplace_add_f_param_0,
+	.param .u64 inplace_add_f_param_1,
+	.param .u32 inplace_add_f_param_2,
+	.param .u32 inplace_add_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<8>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_f_param_0];
+	ld.param.u64 	%rd2, [inplace_add_f_param_1];
+	ld.param.u32 	%r2, [inplace_add_f_param_2];
+	ld.param.u32 	%r3, [inplace_add_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB15_2;
+	bra.uni 	BB15_1;
+
+BB15_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	ld.global.f32 	%f1, [%rd7];
+	ld.global.f32 	%f2, [%rd5];
+	add.f32 	%f3, %f2, %f1;
+	st.global.f32 	[%rd7], %f3;
+
+BB15_2:
 	ret;
 }
 
-	// .globl	bias_add
-.visible .entry bias_add(
-	.param .u64 bias_add_param_0,
-	.param .u64 bias_add_param_1,
-	.param .u64 bias_add_param_2,
-	.param .u32 bias_add_param_3,
-	.param .u32 bias_add_param_4,
-	.param .u32 bias_add_param_5
+	// .globl	bias_add_d
+.visible .entry bias_add_d(
+	.param .u64 bias_add_d_param_0,
+	.param .u64 bias_add_d_param_1,
+	.param .u64 bias_add_d_param_2,
+	.param .u32 bias_add_d_param_3,
+	.param .u32 bias_add_d_param_4,
+	.param .u32 bias_add_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -444,12 +925,12 @@ BB6_2:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_add_param_0];
-	ld.param.u64 	%rd2, [bias_add_param_1];
-	ld.param.u64 	%rd3, [bias_add_param_2];
-	ld.param.u32 	%r4, [bias_add_param_3];
-	ld.param.u32 	%r2, [bias_add_param_4];
-	ld.param.u32 	%r3, [bias_add_param_5];
+	ld.param.u64 	%rd1, [bias_add_d_param_0];
+	ld.param.u64 	%rd2, [bias_add_d_param_1];
+	ld.param.u64 	%rd3, [bias_add_d_param_2];
+	ld.param.u32 	%r4, [bias_add_d_param_3];
+	ld.param.u32 	%r2, [bias_add_d_param_4];
+	ld.param.u32 	%r3, [bias_add_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -458,10 +939,10 @@ BB6_2:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB7_2;
-	bra.uni 	BB7_1;
+	@!%p3 bra 	BB16_2;
+	bra.uni 	BB16_1;
 
-BB7_1:
+BB16_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -477,20 +958,73 @@ BB7_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB7_2:
+BB16_2:
+	ret;
+}
+
+	// .globl	bias_add_f
+.visible .entry bias_add_f(
+	.param .u64 bias_add_f_param_0,
+	.param .u64 bias_add_f_param_1,
+	.param .u64 bias_add_f_param_2,
+	.param .u32 bias_add_f_param_3,
+	.param .u32 bias_add_f_param_4,
+	.param .u32 bias_add_f_param_5
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
+
+
+	ld.param.u64 	%rd1, [bias_add_f_param_0];
+	ld.param.u64 	%rd2, [bias_add_f_param_1];
+	ld.param.u64 	%rd3, [bias_add_f_param_2];
+	ld.param.u32 	%r4, [bias_add_f_param_3];
+	ld.param.u32 	%r2, [bias_add_f_param_4];
+	ld.param.u32 	%r3, [bias_add_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB17_2;
+	bra.uni 	BB17_1;
+
+BB17_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	add.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
+
+BB17_2:
 	ret;
 }
 
-	// .globl	daxpy_matrix_vector
-.visible .entry daxpy_matrix_vector(
-	.param .u64 daxpy_matrix_vector_param_0,
-	.param .u64 daxpy_matrix_vector_param_1,
-	.param .f64 daxpy_matrix_vector_param_2,
-	.param .u64 daxpy_matrix_vector_param_3,
-	.param .u32 daxpy_matrix_vector_param_4,
-	.param .u32 daxpy_matrix_vector_param_5,
-	.param .u32 daxpy_matrix_vector_param_6,
-	.param .u32 daxpy_matrix_vector_param_7
+	// .globl	daxpy_matrix_vector_d
+.visible .entry daxpy_matrix_vector_d(
+	.param .u64 daxpy_matrix_vector_d_param_0,
+	.param .u64 daxpy_matrix_vector_d_param_1,
+	.param .f64 daxpy_matrix_vector_d_param_2,
+	.param .u64 daxpy_matrix_vector_d_param_3,
+	.param .u32 daxpy_matrix_vector_d_param_4,
+	.param .u32 daxpy_matrix_vector_d_param_5,
+	.param .u32 daxpy_matrix_vector_d_param_6,
+	.param .u32 daxpy_matrix_vector_d_param_7
 )
 {
 	.reg .pred 	%p<5>;
@@ -499,13 +1033,13 @@ BB7_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd3, [daxpy_matrix_vector_param_0];
-	ld.param.u64 	%rd5, [daxpy_matrix_vector_param_1];
-	ld.param.f64 	%fd2, [daxpy_matrix_vector_param_2];
-	ld.param.u64 	%rd4, [daxpy_matrix_vector_param_3];
-	ld.param.u32 	%r5, [daxpy_matrix_vector_param_4];
-	ld.param.u32 	%r3, [daxpy_matrix_vector_param_5];
-	ld.param.u32 	%r4, [daxpy_matrix_vector_param_6];
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_d_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_d_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_d_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_d_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_d_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_d_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_d_param_6];
 	cvta.to.global.u64 	%rd1, %rd5;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %ctaid.x;
@@ -516,10 +1050,10 @@ BB7_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB8_4;
-	bra.uni 	BB8_1;
+	@!%p3 bra 	BB18_4;
+	bra.uni 	BB18_1;
 
-BB8_1:
+BB18_1:
 	cvta.to.global.u64 	%rd6, %rd4;
 	mad.lo.s32 	%r10, %r1, %r3, %r2;
 	cvta.to.global.u64 	%rd7, %rd3;
@@ -528,36 +1062,111 @@ BB8_1:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd2, %rd6, %rd8;
 	setp.eq.s32	%p4, %r4, 1;
-	@%p4 bra 	BB8_3;
-	bra.uni 	BB8_2;
+	@%p4 bra 	BB18_3;
+	bra.uni 	BB18_2;
 
-BB8_3:
+BB18_3:
 	mul.wide.s32 	%rd12, %r2, 8;
 	add.s64 	%rd13, %rd1, %rd12;
 	ld.global.f64 	%fd5, [%rd13];
 	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd6;
-	bra.uni 	BB8_4;
+	bra.uni 	BB18_4;
 
-BB8_2:
+BB18_2:
 	mul.wide.s32 	%rd10, %r1, 8;
 	add.s64 	%rd11, %rd1, %rd10;
 	ld.global.f64 	%fd3, [%rd11];
 	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd4;
 
-BB8_4:
+BB18_4:
+	ret;
+}
+
+	// .globl	daxpy_matrix_vector_f
+.visible .entry daxpy_matrix_vector_f(
+	.param .u64 daxpy_matrix_vector_f_param_0,
+	.param .u64 daxpy_matrix_vector_f_param_1,
+	.param .f64 daxpy_matrix_vector_f_param_2,
+	.param .u64 daxpy_matrix_vector_f_param_3,
+	.param .u32 daxpy_matrix_vector_f_param_4,
+	.param .u32 daxpy_matrix_vector_f_param_5,
+	.param .u32 daxpy_matrix_vector_f_param_6,
+	.param .u32 daxpy_matrix_vector_f_param_7
+)
+{
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<7>;
+	.reg .b64 	%rd<14>;
+
+
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_f_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_f_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_f_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_f_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_f_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_f_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_f_param_6];
+	cvta.to.global.u64 	%rd1, %rd5;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r9, %r6, %r7, %r8;
+	div.s32 	%r1, %r9, %r3;
+	rem.s32 	%r2, %r9, %r3;
+	setp.lt.s32	%p1, %r1, %r5;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB19_4;
+	bra.uni 	BB19_1;
+
+BB19_1:
+	cvta.to.global.u64 	%rd6, %rd4;
+	mad.lo.s32 	%r10, %r1, %r3, %r2;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	cvt.f64.f32	%fd1, %f1;
+	add.s64 	%rd2, %rd6, %rd8;
+	setp.eq.s32	%p4, %r4, 1;
+	@%p4 bra 	BB19_3;
+	bra.uni 	BB19_2;
+
+BB19_3:
+	mul.wide.s32 	%rd12, %r2, 4;
+	add.s64 	%rd13, %rd1, %rd12;
+	ld.global.f32 	%f4, [%rd13];
+	cvt.f64.f32	%fd5, %f4;
+	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
+	cvt.rn.f32.f64	%f5, %fd6;
+	st.global.f32 	[%rd2], %f5;
+	bra.uni 	BB19_4;
+
+BB19_2:
+	mul.wide.s32 	%rd10, %r1, 4;
+	add.s64 	%rd11, %rd1, %rd10;
+	ld.global.f32 	%f2, [%rd11];
+	cvt.f64.f32	%fd3, %f2;
+	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f3, %fd4;
+	st.global.f32 	[%rd2], %f3;
+
+BB19_4:
 	ret;
 }
 
-	// .globl	bias_multiply
-.visible .entry bias_multiply(
-	.param .u64 bias_multiply_param_0,
-	.param .u64 bias_multiply_param_1,
-	.param .u64 bias_multiply_param_2,
-	.param .u32 bias_multiply_param_3,
-	.param .u32 bias_multiply_param_4,
-	.param .u32 bias_multiply_param_5
+	// .globl	bias_multiply_d
+.visible .entry bias_multiply_d(
+	.param .u64 bias_multiply_d_param_0,
+	.param .u64 bias_multiply_d_param_1,
+	.param .u64 bias_multiply_d_param_2,
+	.param .u32 bias_multiply_d_param_3,
+	.param .u32 bias_multiply_d_param_4,
+	.param .u32 bias_multiply_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -566,12 +1175,12 @@ BB8_4:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_multiply_param_0];
-	ld.param.u64 	%rd2, [bias_multiply_param_1];
-	ld.param.u64 	%rd3, [bias_multiply_param_2];
-	ld.param.u32 	%r4, [bias_multiply_param_3];
-	ld.param.u32 	%r2, [bias_multiply_param_4];
-	ld.param.u32 	%r3, [bias_multiply_param_5];
+	ld.param.u64 	%rd1, [bias_multiply_d_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_d_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_d_param_2];
+	ld.param.u32 	%r4, [bias_multiply_d_param_3];
+	ld.param.u32 	%r2, [bias_multiply_d_param_4];
+	ld.param.u32 	%r3, [bias_multiply_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -580,10 +1189,10 @@ BB8_4:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB9_2;
-	bra.uni 	BB9_1;
+	@!%p3 bra 	BB20_2;
+	bra.uni 	BB20_1;
 
-BB9_1:
+BB20_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -599,110 +1208,89 @@ BB9_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB9_2:
+BB20_2:
 	ret;
 }
 
-	// .globl	compare_and_set
-.visible .entry compare_and_set(
-	.param .u64 compare_and_set_param_0,
-	.param .u64 compare_and_set_param_1,
-	.param .u32 compare_and_set_param_2,
-	.param .u32 compare_and_set_param_3,
-	.param .f64 compare_and_set_param_4,
-	.param .f64 compare_and_set_param_5,
-	.param .f64 compare_and_set_param_6,
-	.param .f64 compare_and_set_param_7,
-	.param .f64 compare_and_set_param_8
+	// .globl	bias_multiply_f
+.visible .entry bias_multiply_f(
+	.param .u64 bias_multiply_f_param_0,
+	.param .u64 bias_multiply_f_param_1,
+	.param .u64 bias_multiply_f_param_2,
+	.param .u32 bias_multiply_f_param_3,
+	.param .u32 bias_multiply_f_param_4,
+	.param .u32 bias_multiply_f_param_5
 )
 {
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<10>;
-	.reg .f64 	%fd<9>;
-	.reg .b64 	%rd<8>;
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd2, [compare_and_set_param_0];
-	ld.param.u64 	%rd3, [compare_and_set_param_1];
-	ld.param.u32 	%r2, [compare_and_set_param_2];
-	ld.param.u32 	%r3, [compare_and_set_param_3];
-	ld.param.f64 	%fd2, [compare_and_set_param_4];
-	ld.param.f64 	%fd3, [compare_and_set_param_5];
-	ld.param.f64 	%fd4, [compare_and_set_param_6];
-	ld.param.f64 	%fd5, [compare_and_set_param_7];
-	ld.param.f64 	%fd6, [compare_and_set_param_8];
-	mov.u32 	%r4, %ctaid.x;
-	mov.u32 	%r5, %ntid.x;
-	mov.u32 	%r6, %tid.x;
-	mad.lo.s32 	%r7, %r5, %r4, %r6;
-	div.s32 	%r8, %r7, %r3;
-	rem.s32 	%r9, %r7, %r3;
-	mad.lo.s32 	%r1, %r8, %r3, %r9;
-	setp.lt.s32	%p1, %r8, %r2;
-	setp.gt.s32	%p2, %r3, -1;
+	ld.param.u64 	%rd1, [bias_multiply_f_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_f_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_f_param_2];
+	ld.param.u32 	%r4, [bias_multiply_f_param_3];
+	ld.param.u32 	%r2, [bias_multiply_f_param_4];
+	ld.param.u32 	%r3, [bias_multiply_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB10_6;
-	bra.uni 	BB10_1;
+	@!%p3 bra 	BB21_2;
+	bra.uni 	BB21_1;
 
-BB10_1:
-	cvta.to.global.u64 	%rd4, %rd2;
-	mul.wide.s32 	%rd5, %r1, 8;
+BB21_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
 	add.s64 	%rd6, %rd4, %rd5;
-	ld.global.f64 	%fd1, [%rd6];
-	sub.f64 	%fd7, %fd1, %fd2;
-	abs.f64 	%fd8, %fd7;
-	setp.lt.f64	%p4, %fd8, %fd3;
-	cvta.to.global.u64 	%rd7, %rd3;
-	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB10_5;
-	bra.uni 	BB10_2;
-
-BB10_5:
-	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB10_6;
-
-BB10_2:
-	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB10_4;
-	bra.uni 	BB10_3;
-
-BB10_4:
-	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB10_6;
-
-BB10_3:
-	st.global.f64 	[%rd1], %fd6;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	mul.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
 
-BB10_6:
+BB21_2:
 	ret;
 }
 
-	// .globl	matrix_matrix_cellwise_op
-.visible .entry matrix_matrix_cellwise_op(
-	.param .u64 matrix_matrix_cellwise_op_param_0,
-	.param .u64 matrix_matrix_cellwise_op_param_1,
-	.param .u64 matrix_matrix_cellwise_op_param_2,
-	.param .u32 matrix_matrix_cellwise_op_param_3,
-	.param .u32 matrix_matrix_cellwise_op_param_4,
-	.param .u32 matrix_matrix_cellwise_op_param_5,
-	.param .u32 matrix_matrix_cellwise_op_param_6,
-	.param .u32 matrix_matrix_cellwise_op_param_7
+	// .globl	matrix_matrix_cellwise_op_d
+.visible .entry matrix_matrix_cellwise_op_d(
+	.param .u64 matrix_matrix_cellwise_op_d_param_0,
+	.param .u64 matrix_matrix_cellwise_op_d_param_1,
+	.param .u64 matrix_matrix_cellwise_op_d_param_2,
+	.param .u32 matrix_matrix_cellwise_op_d_param_3,
+	.param .u32 matrix_matrix_cellwise_op_d_param_4,
+	.param .u32 matrix_matrix_cellwise_op_d_param_5,
+	.param .u32 matrix_matrix_cellwise_op_d_param_6,
+	.param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-	.reg .pred 	%p<77>;
-	.reg .b32 	%r<65>;
-	.reg .f64 	%fd<55>;
+	.reg .pred 	%p<73>;
+	.reg .b32 	%r<66>;
+	.reg .f64 	%fd<56>;
 	.reg .b64 	%rd<19>;
 
 
-	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
-	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
-	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
-	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
-	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
-	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
-	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
-	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_d_param_0];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_d_param_1];
+	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_d_param_2];
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_d_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_d_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_d_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_d_param_7];
 	mov.u32 	%r15, %ntid.x;
 	mov.u32 	%r16, %ctaid.x;
 	mov.u32 	%r17, %tid.x;
@@ -712,93 +1300,93 @@ BB10_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB11_73;
-	bra.uni 	BB11_1;
+	@!%p4 bra 	BB22_77;
+	bra.uni 	BB22_1;
 
-BB11_1:
+BB22_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r63, %r1;
-	@%p5 bra 	BB11_5;
+	mov.u32 	%r64, %r1;
+	@%p5 bra 	BB22_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r64, %r3;
-	@%p6 bra 	BB11_4;
+	mov.u32 	%r65, %r3;
+	@%p6 bra 	BB22_4;
 
-	mov.u32 	%r64, %r2;
+	mov.u32 	%r65, %r2;
 
-BB11_4:
-	mov.u32 	%r58, %r64;
-	mov.u32 	%r4, %r58;
-	mov.u32 	%r63, %r4;
+BB22_4:
+	mov.u32 	%r59, %r65;
+	mov.u32 	%r4, %r59;
+	mov.u32 	%r64, %r4;
 
-BB11_5:
-	mov.u32 	%r5, %r63;
+BB22_5:
+	mov.u32 	%r5, %r64;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r61, %r1;
-	@%p7 bra 	BB11_9;
+	mov.u32 	%r62, %r1;
+	@%p7 bra 	BB22_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r62, %r3;
-	@%p8 bra 	BB11_8;
+	mov.u32 	%r63, %r3;
+	@%p8 bra 	BB22_8;
 
-	mov.u32 	%r62, %r2;
+	mov.u32 	%r63, %r2;
 
-BB11_8:
-	mov.u32 	%r61, %r62;
+BB22_8:
+	mov.u32 	%r62, %r63;
 
-BB11_9:
+BB22_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r61, 8;
+	mul.wide.s32 	%rd9, %r62, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
-	@%p9 bra 	BB11_26;
+	@%p9 bra 	BB22_26;
 
 	setp.gt.s32	%p23, %r13, 3;
-	@%p23 bra 	BB11_18;
+	@%p23 bra 	BB22_18;
 
 	setp.gt.s32	%p30, %r13, 1;
-	@%p30 bra 	BB11_15;
+	@%p30 bra 	BB22_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB11_71;
-	bra.uni 	BB11_13;
+	@%p33 bra 	BB22_75;
+	bra.uni 	BB22_13;
 
-BB11_71:
-	add.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_75:
+	add.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_26:
+BB22_26:
 	setp.gt.s32	%p10, %r13, 13;
-	@%p10 bra 	BB11_35;
+	@%p10 bra 	BB22_35;
 
 	setp.gt.s32	%p17, %r13, 10;
-	@%p17 bra 	BB11_31;
+	@%p17 bra 	BB22_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB11_53;
-	bra.uni 	BB11_29;
+	@%p21 bra 	BB22_55;
+	bra.uni 	BB22_29;
 
-BB11_53:
-	setp.eq.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB11_72;
+BB22_55:
+	setp.eq.f64	%p48, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB22_76;
 
-BB11_18:
+BB22_18:
 	setp.gt.s32	%p24, %r13, 5;
-	@%p24 bra 	BB11_22;
+	@%p24 bra 	BB22_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB11_56;
-	bra.uni 	BB11_20;
+	@%p28 bra 	BB22_58;
+	bra.uni 	BB22_20;
 
-BB11_56:
+BB22_58:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -811,7 +1399,7 @@ BB11_56:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p55, %rd1, -9223372036854775808;
+	setp.eq.s64	%p53, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -828,472 +1416,966 @@ BB11_56:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd53, [retval0+0];
+	ld.param.f64	%fd54, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p56, %r8, 0;
-	and.pred  	%p1, %p56, %p55;
-	@!%p1 bra 	BB11_58;
-	bra.uni 	BB11_57;
+	setp.lt.s32	%p54, %r8, 0;
+	and.pred  	%p1, %p54, %p53;
+	@!%p1 bra 	BB22_60;
+	bra.uni 	BB22_59;
 
-BB11_57:
+BB22_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd53;
+	mov.b64 	{%temp, %r33}, %fd54;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd53;
+	mov.b64 	{%r35, %temp}, %fd54;
 	}
-	mov.b64 	%fd53, {%r35, %r34};
+	mov.b64 	%fd54, {%r35, %r34};
 
-BB11_58:
-	mov.f64 	%fd52, %fd53;
-	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
-	@%p57 bra 	BB11_61;
-	bra.uni 	BB11_59;
+BB22_60:
+	mov.f64 	%fd53, %fd54;
+	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
+	@%p55 bra 	BB22_63;
+	bra.uni 	BB22_61;
 
-BB11_61:
-	selp.b32	%r36, %r8, 0, %p55;
+BB22_63:
+	selp.b32	%r36, %r8, 0, %p53;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p61, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p61;
+	setp.lt.s32	%p59, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p59;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd52, {%r39, %r38};
-	bra.uni 	BB11_62;
+	mov.b64 	%fd53, {%r39, %r38};
+	bra.uni 	BB22_64;
 
-BB11_35:
+BB22_35:
 	setp.gt.s32	%p11, %r13, 15;
-	@%p11 bra 	BB11_39;
+	@%p11 bra 	BB22_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB11_50;
-	bra.uni 	BB11_37;
+	@%p15 bra 	BB22_52;
+	bra.uni 	BB22_37;
 
-BB11_50:
+BB22_52:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p47, %r27, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p45, %r27, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB22_76;
 
-BB11_15:
+BB22_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB11_70;
-	bra.uni 	BB11_16;
+	@%p31 bra 	BB22_74;
+	bra.uni 	BB22_16;
 
-BB11_70:
-	mul.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_74:
+	mul.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_31:
+BB22_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB11_52;
+	@%p18 bra 	BB22_54;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB11_51;
-	bra.uni 	BB11_33;
+	@%p19 bra 	BB22_53;
+	bra.uni 	BB22_33;
 
-BB11_51:
-	max.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_53:
+	max.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_22:
+BB22_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB11_55;
+	@%p25 bra 	BB22_57;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB11_54;
-	bra.uni 	BB11_24;
+	@%p26 bra 	BB22_56;
+	bra.uni 	BB22_24;
 
-BB11_54:
-	setp.gt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB11_72;
+BB22_56:
+	setp.gt.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB22_76;
 
-BB11_39:
+BB22_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB11_49;
+	@%p12 bra 	BB22_51;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB11_45;
-	bra.uni 	BB11_41;
+	@%p13 bra 	BB22_46;
+	bra.uni 	BB22_41;
 
-BB11_45:
-	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
-	or.pred  	%p41, %p39, %p40;
-	mov.f64 	%fd54, 0d7FF8000000000000;
-	@%p41 bra 	BB11_72;
+BB22_46:
+	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
+	or.pred  	%p40, %p38, %p39;
+	mov.f64 	%fd55, 0d7FF8000000000000;
+	@%p40 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd39, %fd54;
-	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
-	@%p42 bra 	BB11_72;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd39, %fd55;
+	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
+	@%p41 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r22, %temp}, %fd54;
+	mov.b64 	{%temp, %r22}, %fd55;
 	}
+	and.b32  	%r23, %r22, 2147483647;
+	setp.ne.s32	%p42, %r23, 2146435072;
+	@%p42 bra 	BB22_50;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd54;
+	mov.b64 	{%r24, %temp}, %fd55;
 	}
-	and.b32  	%r24, %r23, 2147483647;
-	setp.ne.s32	%p43, %r24, 2146435072;
-	setp.ne.s32	%p44, %r22, 0;
-	or.pred  	%p45, %p43, %p44;
-	@!%p45 bra 	BB11_72;
-	bra.uni 	BB11_48;
-
-BB11_48:
-	cvt.rmi.f64.f64	%fd40, %fd54;
+	setp.eq.s32	%p43, %r24, 0;
+	@%p43 bra 	BB22_76;
+
+BB22_50:
+	cvt.rmi.f64.f64	%fd40, %fd55;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd54, %fd1, %fd41;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd1, %fd41;
+	bra.uni 	BB22_76;
 
-BB11_13:
+BB22_13:
 	setp.eq.s32	%p34, %r13, 1;
-	@%p34 bra 	BB11_14;
-	bra.uni 	BB11_72;
+	@%p34 bra 	BB22_14;
+	bra.uni 	BB22_76;
 
-BB11_14:
-	sub.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_14:
+	sub.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_29:
+BB22_29:
 	setp.eq.s32	%p22, %r13, 10;
-	@%p22 bra 	BB11_30;
-	bra.uni 	BB11_72;
+	@%p22 bra 	BB22_30;
+	bra.uni 	BB22_76;
 
-BB11_30:
-	setp.neu.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB11_72;
+BB22_30:
+	setp.neu.f64	%p47, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB22_76;
 
-BB11_20:
+BB22_20:
 	setp.eq.s32	%p29, %r13, 5;
-	@%p29 bra 	BB11_21;
-	bra.uni 	BB11_72;
+	@%p29 bra 	BB22_21;
+	bra.uni 	BB22_76;
 
-BB11_21:
-	setp.lt.f64	%p54, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-	bra.uni 	BB11_72;
+BB22_21:
+	setp.lt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB22_76;
 
-BB11_37:
+BB22_37:
 	setp.eq.s32	%p16, %r13, 15;
-	@%p16 bra 	BB11_38;
-	bra.uni 	BB11_72;
+	@%p16 bra 	BB22_38;
+	bra.uni 	BB22_76;
 
-BB11_38:
+BB22_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd54, %fd44, %fd43;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd44, %fd43;
+	bra.uni 	BB22_76;
 
-BB11_16:
+BB22_16:
 	setp.eq.s32	%p32, %r13, 3;
-	@%p32 bra 	BB11_17;
-	bra.uni 	BB11_72;
+	@%p32 bra 	BB22_17;
+	bra.uni 	BB22_76;
 
-BB11_17:
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_17:
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_52:
-	min.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_54:
+	min.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_33:
+BB22_33:
 	setp.eq.s32	%p20, %r13, 13;
-	@%p20 bra 	BB11_34;
-	bra.uni 	BB11_72;
+	@%p20 bra 	BB22_34;
+	bra.uni 	BB22_76;
 
-BB11_34:
+BB22_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p48, %r30, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p46, %r30, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
+	bra.uni 	BB22_76;
 
-BB11_55:
-	setp.le.f64	%p53, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-	bra.uni 	BB11_72;
+BB22_57:
+	setp.gtu.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p51;
+	bra.uni 	BB22_76;
 
-BB11_24:
+BB22_24:
 	setp.eq.s32	%p27, %r13, 8;
-	@%p27 bra 	BB11_25;
-	bra.uni 	BB11_72;
+	@%p27 bra 	BB22_25;
+	bra.uni 	BB22_76;
 
-BB11_25:
-	setp.ge.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB11_72;
+BB22_25:
+	setp.ltu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p49;
+	bra.uni 	BB22_76;
 
-BB11_49:
-	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
+BB22_51:
+	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
-	bra.uni 	BB11_72;
+	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
+	bra.uni 	BB22_76;
 
-BB11_41:
+BB22_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB11_72;
+	@%p14 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd37, %fd54;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd37, %fd55;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB11_72;
+	@%p35 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd54;
+	mov.b64 	{%temp, %r19}, %fd55;
 	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p36, %r20, 2146435072;
+	@%p36 bra 	BB22_45;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd54;
+	mov.b64 	{%r21, %temp}, %fd55;
 	}
-	and.b32  	%r21, %r20, 2147483647;
-	setp.ne.s32	%p36, %r21, 2146435072;
-	setp.ne.s32	%p37, %r19, 0;
-	or.pred  	%p38, %p36, %p37;
-	@!%p38 bra 	BB11_72;
-	bra.uni 	BB11_44;
+	setp.eq.s32	%p37, %r21, 0;
+	@%p37 bra 	BB22_76;
 
-BB11_44:
-	cvt.rmi.f64.f64	%fd54, %fd54;
-	bra.uni 	BB11_72;
+BB22_45:
+	cvt.rmi.f64.f64	%fd55, %fd55;
+	bra.uni 	BB22_76;
 
-BB11_59:
-	setp.gt.s32	%p58, %r8, -1;
-	@%p58 bra 	BB11_62;
+BB22_61:
+	setp.gt.s32	%p56, %r8, -1;
+	@%p56 bra 	BB22_64;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p59, %fd45, %fd2;
-	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
+	setp.neu.f64	%p57, %fd45, %fd2;
+	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
 
-BB11_62:
-	mov.f64 	%fd25, %fd52;
+BB22_64:
+	mov.f64 	%fd25, %fd53;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p62, %r41, 2146435072;
-	mov.f64 	%fd51, %fd25;
-	@%p62 bra 	BB11_69;
+	setp.ne.s32	%p60, %r41, 2146435072;
+	mov.f64 	%fd52, %fd25;
+	@%p60 bra 	BB22_73;
 
-	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd51, %fd26;
-	@%p63 bra 	BB11_69;
+	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
+	mov.f64 	%fd52, %fd26;
+	@%p61 bra 	BB22_73;
 
 	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
-	mov.f64 	%fd50, %fd26;
-	mov.f64 	%fd51, %fd50;
-	@%p64 bra 	BB11_69;
+	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd51, %fd26;
+	mov.f64 	%fd52, %fd51;
+	@%p62 bra 	BB22_73;
+
+	and.b32  	%r42, %r9, 2147483647;
+	setp.ne.s32	%p63, %r42, 2146435072;
+	@%p63 bra 	BB22_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r42, %temp}, %fd2;
+	mov.b64 	{%r43, %temp}, %fd2;
 	}
-	and.b32  	%r43, %r9, 2147483647;
-	setp.eq.s32	%p65, %r43, 2146435072;
-	setp.eq.s32	%p66, %r42, 0;
-	and.pred  	%p67, %p65, %p66;
-	@%p67 bra 	BB11_68;
-	bra.uni 	BB11_66;
-
-BB11_68:
-	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
-	selp.b32	%r51, 2146435072, 0, %p71;
-	xor.b32  	%r52, %r51, 2146435072;
-	setp.lt.s32	%p72, %r9, 0;
-	selp.b32	%r53, %r52, %r51, %p72;
-	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
-	selp.b32	%r54, 1072693248, %r53, %p73;
-	mov.u32 	%r55, 0;
-	mov.b64 	%fd51, {%r55, %r54};
-	bra.uni 	BB11_69;
-
-BB11_66:
+	setp.eq.s32	%p64, %r43, 0;
+	@%p64 bra 	BB22_72;
+
+BB22_69:
+	and.b32  	%r44, %r8, 2147483647;
+	setp.ne.s32	%p65, %r44, 2146435072;
+	mov.f64 	%fd49, %fd25;
+	mov.f64 	%fd52, %fd49;
+	@%p65 bra 	BB22_73;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r44, %temp}, %fd1;
+	mov.b64 	{%r45, %temp}, %fd1;
 	}
-	and.b32  	%r45, %r8, 2147483647;
-	setp.eq.s32	%p68, %r45, 2146435072;
-	setp.eq.s32	%p69, %r44, 0;
-	and.pred  	%p70, %p68, %p69;
-	mov.f64 	%fd51, %fd25;
-	@!%p70 bra 	BB11_69;
-	bra.uni 	BB11_67;
-
-BB11_67:
+	setp.ne.s32	%p66, %r45, 0;
+	mov.f64 	%fd52, %fd25;
+	@%p66 bra 	BB22_73;
+
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	selp.b32	%r48, -1048576, 2146435072, %p1;
-	add.s32 	%r49, %r48, %r47;
-	mov.u32 	%r50, 0;
-	mov.b64 	%fd51, {%r50, %r49};
-
-BB11_69:
-	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
-	or.pred  	%p76, %p75, %p74;
-	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
-
-BB11_72:
+	add.s32 	%r48, %r47, 2146435072;
+	or.b32  	%r49, %r48, -2147483648;
+	selp.b32	%r50, %r49, %r48, %p1;
+	mov.u32 	%r51, 0;
+	mov.b64 	%fd52, {%r51, %r50};
+	bra.uni 	BB22_73;
+
+BB22_72:
+	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
+	selp.b32	%r52, 2146435072, 0, %p67;
+	xor.b32  	%r53, %r52, 2146435072;
+	setp.lt.s32	%p68, %r9, 0;
+	selp.b32	%r54, %r53, %r52, %p68;
+	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
+	selp.b32	%r55, 1072693248, %r54, %p69;
+	mov.u32 	%r56, 0;
+	mov.b64 	%fd52, {%r56, %r55};
+
+BB22_73:
+	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+
+BB22_76:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd54;
+	st.global.f64 	[%rd18], %fd55;
 	bar.sync 	0;
 
-BB11_73:
+BB22_77:
 	ret;
 }
 
-	// .globl	matrix_scalar_op
-.visible .entry matrix_scalar_op(
-	.param .u64 matrix_scalar_op_param_0,
-	.param .f64 matrix_scalar_op_param_1,
-	.param .u64 matrix_scalar_op_param_2,
-	.param .u32 matrix_scalar_op_param_3,
-	.param .u32 matrix_scalar_op_param_4,
-	.param .u32 matrix_scalar_op_param_5
+	// .globl	matrix_matrix_cellwise_op_f
+.visible .entry matrix_matrix_cellwise_op_f(
+	.param .u64 matrix_matrix_cellwise_op_f_param_0,
+	.param .u64 matrix_matrix_cellwise_op_f_param_1,
+	.param .u64 matrix_matrix_cellwise_op_f_param_2,
+	.param .u32 matrix_matrix_cellwise_op_f_param_3,
+	.param .u32 matrix_matrix_cellwise_op_f_param_4,
+	.param .u32 matrix_matrix_cellwise_op_f_param_5,
+	.param .u32 matrix_matrix_cellwise_op_f_param_6,
+	.param .u32 matrix_matrix_cellwise_op_f_param_7
 )
 {
-	.reg .pred 	%p<141>;
-	.reg .b32 	%r<86>;
-	.reg .f64 	%fd<107>;
-	.reg .b64 	%rd<20>;
+	.reg .pred 	%p<76>;
+	.reg .f32 	%f<134>;
+	.reg .b32 	%r<51>;
+	.reg .b64 	%rd<17>;
 
 
-	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
-	ld.param.f64 	%fd68, [matrix_scalar_op_param_1];
-	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
-	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
-	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
-	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
-	mov.u32 	%r9, %ntid.x;
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %tid.x;
-	mad.lo.s32 	%r1, %r9, %r10, %r11;
-	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB12_130;
+	ld.param.u64 	%rd1, [matrix_matrix_cellwise_op_f_param_0];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_f_param_1];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_f_param_2];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_f_param_3];
+	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_4];
+	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_f_param_5];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_f_param_6];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_f_param_7];
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %ctaid.x;
+	mov.u32 	%r15, %tid.x;
+	mad.lo.s32 	%r16, %r13, %r14, %r15;
+	div.s32 	%r1, %r16, %r8;
+	rem.s32 	%r2, %r16, %r8;
+	setp.lt.s32	%p2, %r1, %r12;
+	setp.gt.s32	%p3, %r8, -1;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB23_71;
+	bra.uni 	BB23_1;
 
-	cvta.to.global.u64 	%rd6, %rd5;
-	cvta.to.global.u64 	%rd7, %rd4;
-	mul.wide.s32 	%rd8, %r1, 8;
-	add.s64 	%rd9, %rd7, %rd8;
-	ld.global.f64 	%fd1, [%rd9];
-	add.s64 	%rd1, %rd6, %rd8;
-	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB12_66;
+BB23_1:
+	mad.lo.s32 	%r3, %r1, %r8, %r2;
+	setp.eq.s32	%p5, %r9, 1;
+	mov.u32 	%r49, %r1;
+	@%p5 bra 	BB23_5;
 
-	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p5, %r6, 8;
-	@%p5 bra 	BB12_19;
+	setp.ne.s32	%p6, %r9, 2;
+	mov.u32 	%r50, %r3;
+	@%p6 bra 	BB23_4;
 
-	setp.gt.s32	%p19, %r6, 3;
-	@%p19 bra 	BB12_11;
+	mov.u32 	%r50, %r2;
 
-	setp.gt.s32	%p26, %r6, 1;
-	@%p26 bra 	BB12_8;
+BB23_4:
+	mov.u32 	%r44, %r50;
+	mov.u32 	%r4, %r44;
+	mov.u32 	%r49, %r4;
 
-	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB12_64;
-	bra.uni 	BB12_6;
+BB23_5:
+	mov.u32 	%r5, %r49;
+	setp.eq.s32	%p7, %r10, 1;
+	mov.u32 	%r47, %r1;
+	@%p7 bra 	BB23_9;
 
-BB12_64:
-	add.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+	setp.ne.s32	%p8, %r10, 2;
+	mov.u32 	%r48, %r3;
+	@%p8 bra 	BB23_8;
 
-BB12_66:
-	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p73, %r6, 8;
-	@%p73 bra 	BB12_83;
+	mov.u32 	%r48, %r2;
 
-	setp.gt.s32	%p87, %r6, 3;
-	@%p87 bra 	BB12_75;
+BB23_8:
+	mov.u32 	%r47, %r48;
 
-	setp.gt.s32	%p94, %r6, 1;
-	@%p94 bra 	BB12_72;
+BB23_9:
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvta.to.global.u64 	%rd5, %rd1;
+	mul.wide.s32 	%rd6, %r5, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f1, [%rd7];
+	mul.wide.s32 	%rd8, %r47, 4;
+	add.s64 	%rd9, %rd4, %rd8;
+	ld.global.f32 	%f2, [%rd9];
+	mov.f32 	%f133, 0f7F7FFFFF;
+	setp.gt.s32	%p9, %r11, 8;
+	@%p9 bra 	BB23_26;
 
-	setp.eq.s32	%p97, %r6, 0;
-	@%p97 bra 	BB12_128;
-	bra.uni 	BB12_70;
+	setp.gt.s32	%p23, %r11, 3;
+	@%p23 bra 	BB23_18;
 
-BB12_128:
-	add.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
+	setp.gt.s32	%p30, %r11, 1;
+	@%p30 bra 	BB23_15;
 
-BB12_19:
-	setp.gt.s32	%p6, %r6, 13;
-	@%p6 bra 	BB12_28;
+	setp.eq.s32	%p33, %r11, 0;
+	@%p33 bra 	BB23_69;
+	bra.uni 	BB23_13;
 
-	setp.gt.s32	%p13, %r6, 10;
-	@%p13 bra 	BB12_24;
+BB23_69:
+	add.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB12_46;
-	bra.uni 	BB12_22;
+BB23_26:
+	setp.gt.s32	%p10, %r11, 13;
+	@%p10 bra 	BB23_35;
 
-BB12_46:
-	setp.eq.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB12_65;
+	setp.gt.s32	%p17, %r11, 10;
+	@%p17 bra 	BB23_31;
 
-BB12_83:
-	setp.gt.s32	%p74, %r6, 13;
-	@%p74 bra 	BB12_92;
+	setp.eq.s32	%p21, %r11, 9;
+	@%p21 bra 	BB23_51;
+	bra.uni 	BB23_29;
+
+BB23_51:
+	setp.eq.f32	%p44, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p44;
+	bra.uni 	BB23_70;
+
+BB23_18:
+	setp.gt.s32	%p24, %r11, 5;
+	@%p24 bra 	BB23_22;
+
+	setp.eq.s32	%p28, %r11, 4;
+	@%p28 bra 	BB23_54;
+	bra.uni 	BB23_20;
+
+BB23_54:
+	mul.f32 	%f53, %f2, 0f3F000000;
+	cvt.rzi.f32.f32	%f54, %f53;
+	fma.rn.f32 	%f55, %f54, 0fC0000000, %f2;
+	abs.f32 	%f19, %f55;
+	abs.f32 	%f20, %f1;
+	setp.lt.f32	%p49, %f20, 0f00800000;
+	mul.f32 	%f56, %f20, 0f4B800000;
+	selp.f32	%f57, 0fC3170000, 0fC2FE0000, %p49;
+	selp.f32	%f58, %f56, %f20, %p49;
+	mov.b32 	 %r23, %f58;
+	and.b32  	%r24, %r23, 8388607;
+	or.b32  	%r25, %r24, 1065353216;
+	mov.b32 	 %f59, %r25;
+	shr.u32 	%r26, %r23, 23;
+	cvt.rn.f32.u32	%f60, %r26;
+	add.f32 	%f61, %f57, %f60;
+	setp.gt.f32	%p50, %f59, 0f3FB504F3;
+	mul.f32 	%f62, %f59, 0f3F000000;
+	add.f32 	%f63, %f61, 0f3F800000;
+	selp.f32	%f64, %f62, %f59, %p50;
+	selp.f32	%f65, %f63, %f61, %p50;
+	add.f32 	%f66, %f64, 0fBF800000;
+	add.f32 	%f50, %f64, 0f3F800000;
+	// inline asm
+	rcp.approx.ftz.f32 %f49,%f50;
+	// inline asm
+	add.f32 	%f67, %f66, %f66;
+	mul.f32 	%f68, %f49, %f67;
+	mul.f32 	%f69, %f68, %f68;
+	mov.f32 	%f70, 0f3C4CAF63;
+	mov.f32 	%f71, 0f3B18F0FE;
+	fma.rn.f32 	%f72, %f71, %f69, %f70;
+	mov.f32 	%f73, 0f3DAAAABD;
+	fma.rn.f32 	%f74, %f72, %f69, %f73;
+	mul.rn.f32 	%f75, %f74, %f69;
+	mul.rn.f32 	%f76, %f75, %f68;
+	sub.f32 	%f77, %f66, %f68;
+	neg.f32 	%f78, %f68;
+	add.f32 	%f79, %f77, %f77;
+	fma.rn.f32 	%f80, %f78, %f66, %f79;
+	mul.rn.f32 	%f81, %f49, %f80;
+	add.f32 	%f82, %f76, %f68;
+	sub.f32 	%f83, %f68, %f82;
+	add.f32 	%f84, %f76, %f83;
+	add.f32 	%f85, %f81, %f84;
+	add.f32 	%f86, %f82, %f85;
+	sub.f32 	%f87, %f82, %f86;
+	add.f32 	%f88, %f85, %f87;
+	mov.f32 	%f89, 0f3F317200;
+	mul.rn.f32 	%f90, %f65, %f89;
+	mov.f32 	%f91, 0f35BFBE8E;
+	mul.rn.f32 	%f92, %f65, %f91;
+	add.f32 	%f93, %f90, %f86;
+	sub.f32 	%f94, %f90, %f93;
+	add.f32 	%f95, %f86, %f94;
+	add.f32 	%f96, %f88, %f95;
+	add.f32 	%f97, %f92, %f96;
+	add.f32 	%f98, %f93, %f97;
+	sub.f32 	%f99, %f93, %f98;
+	add.f32 	%f100, %f97, %f99;
+	abs.f32 	%f21, %f2;
+	setp.gt.f32	%p51, %f21, 0f77F684DF;
+	mul.f32 	%f101, %f2, 0f39000000;
+	selp.f32	%f102, %f101, %f2, %p51;
+	mul.rn.f32 	%f103, %f102, %f98;
+	neg.f32 	%f104, %f103;
+	fma.rn.f32 	%f105, %f102, %f98, %f104;
+	fma.rn.f32 	%f106, %f102, %f100, %f105;
+	mov.f32 	%f107, 0f00000000;
+	fma.rn.f32 	%f108, %f107, %f98, %f106;
+	add.rn.f32 	%f109, %f103, %f108;
+	neg.f32 	%f110, %f109;
+	add.rn.f32 	%f111, %f103, %f110;
+	add.rn.f32 	%f112, %f111, %f108;
+	mov.b32 	 %r27, %f109;
+	setp.eq.s32	%p52, %r27, 1118925336;
+	add.s32 	%r28, %r27, -1;
+	mov.b32 	 %f113, %r28;
+	add.f32 	%f114, %f112, 0f37000000;
+	selp.f32	%f115, %f113, %f109, %p52;
+	selp.f32	%f22, %f114, %f112, %p52;
+	mul.f32 	%f116, %f115, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f117, %f116;
+	mov.f32 	%f118, 0fBF317200;
+	fma.rn.f32 	%f119, %f117, %f118, %f115;
+	mov.f32 	%f120, 0fB5BFBE8E;
+	fma.rn.f32 	%f121, %f117, %f120, %f119;
+	mul.f32 	%f52, %f121, 0f3FB8AA3B;
+	// inline asm
+	ex2.approx.ftz.f32 %f51,%f52;
+	// inline asm
+	add.f32 	%f122, %f117, 0f00000000;
+	ex2.approx.f32 	%f123, %f122;
+	mul.f32 	%f124, %f51, %f123;
+	setp.lt.f32	%p53, %f115, 0fC2D20000;
+	selp.f32	%f125, 0f00000000, %f124, %p53;
+	setp.gt.f32	%p54, %f115, 0f42D20000;
+	selp.f32	%f131, 0f7F800000, %f125, %p54;
+	setp.eq.f32	%p55, %f131, 0f7F800000;
+	@%p55 bra 	BB23_56;
+
+	fma.rn.f32 	%f131, %f131, %f22, %f131;
+
+BB23_56:
+	setp.lt.f32	%p56, %f1, 0f00000000;
+	setp.eq.f32	%p57, %f19, 0f3F800000;
+	and.pred  	%p1, %p56, %p57;
+	mov.b32 	 %r29, %f131;
+	xor.b32  	%r30, %r29, -2147483648;
+	mov.b32 	 %f126, %r30;
+	selp.f32	%f132, %f126, %f131, %p1;
+	setp.eq.f32	%p58, %f1, 0f00000000;
+	@%p58 bra 	BB23_59;
+	bra.uni 	BB23_57;
+
+BB23_59:
+	add.f32 	%f128, %f1, %f1;
+	mov.b32 	 %r31, %f128;
+	selp.b32	%r32, %r31, 0, %p57;
+	or.b32  	%r33, %r32, 2139095040;
+	setp.lt.f32	%p62, %f2, 0f00000000;
+	selp.b32	%r34, %r33, %r32, %p62;
+	mov.b32 	 %f132, %r34;
+	bra.uni 	BB23_60;
 
-	setp.gt.s32	%p81, %r6, 10;
-	@%p81 bra 	BB12_88;
+BB23_35:
+	setp.gt.s32	%p11, %r11, 15;
+	@%p11 bra 	BB23_39;
+
+	setp.eq.s32	%p15, %r11, 14;
+	@%p15 bra 	BB23_48;
+	bra.uni 	BB23_37;
+
+BB23_48:
+	cvt.rni.s64.f32	%rd10, %f1;
+	cvt.rni.s64.f32	%rd11, %f2;
+	cvt.u32.u64	%r17, %rd10;
+	cvt.u32.u64	%r18, %rd11;
+	or.b32  	%r19, %r18, %r17;
+	setp.eq.s32	%p41, %r19, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p41;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p85, %r6, 9;
-	@%p85 bra 	BB12_110;
-	bra.uni 	BB12_86;
+BB23_15:
+	setp.eq.s32	%p31, %r11, 2;
+	@%p31 bra 	BB23_68;
+	bra.uni 	BB23_16;
 
-BB12_110:
-	setp.eq.f64	%p114, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-	bra.uni 	BB12_129;
+BB23_68:
+	mul.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-BB12_11:
-	setp.gt.s32	%p20, %r6, 5;
-	@%p20 bra 	BB12_15;
+BB23_31:
+	setp.eq.s32	%p18, %r11, 11;
+	@%p18 bra 	BB23_50;
 
-	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB12_49;
-	bra.uni 	BB12_13;
+	setp.eq.s32	%p19, %r11, 12;
+	@%p19 bra 	BB23_49;
+	bra.uni 	BB23_33;
 
-BB12_49:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r2}, %fd68;
-	}
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r3}, %fd1;
-	}
-	bfe.u32 	%r24, %r3, 20, 11;
+BB23_49:
+	max.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_22:
+	setp.eq.s32	%p25, %r11, 6;
+	@%p25 bra 	BB23_53;
+
+	setp.eq.s32	%p26, %r11, 7;
+	@%p26 bra 	BB23_52;
+	bra.uni 	BB23_24;
+
+BB23_52:
+	setp.gt.f32	%p46, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p46;
+	bra.uni 	BB23_70;
+
+BB23_39:
+	setp.eq.s32	%p12, %r11, 16;
+	@%p12 bra 	BB23_47;
+
+	setp.eq.s32	%p13, %r11, 17;
+	@%p13 bra 	BB23_44;
+	bra.uni 	BB23_41;
+
+BB23_44:
+	setp.eq.f32	%p36, %f2, 0f00000000;
+	setp.eq.f32	%p37, %f2, 0f80000000;
+	or.pred  	%p38, %p36, %p37;
+	mov.f32 	%f133, 0f7FC00000;
+	@%p38 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f43, %f133;
+	setp.geu.f32	%p39, %f43, 0f7F800000;
+	@%p39 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f44, %f133;
+	mul.f32 	%f45, %f2, %f44;
+	sub.f32 	%f133, %f1, %f45;
+	bra.uni 	BB23_70;
+
+BB23_13:
+	setp.eq.s32	%p34, %r11, 1;
+	@%p34 bra 	BB23_14;
+	bra.uni 	BB23_70;
+
+BB23_14:
+	sub.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_29:
+	setp.eq.s32	%p22, %r11, 10;
+	@%p22 bra 	BB23_30;
+	bra.uni 	BB23_70;
+
+BB23_30:
+	setp.neu.f32	%p43, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p43;
+	bra.uni 	BB23_70;
+
+BB23_20:
+	setp.eq.s32	%p29, %r11, 5;
+	@%p29 bra 	BB23_21;
+	bra.uni 	BB23_70;
+
+BB23_21:
+	setp.lt.f32	%p48, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p48;
+	bra.uni 	BB23_70;
+
+BB23_37:
+	setp.eq.s32	%p16, %r11, 15;
+	@%p16 bra 	BB23_38;
+	bra.uni 	BB23_70;
+
+BB23_38:
+	mul.f32 	%f47, %f1, %f2;
+	mov.f32 	%f48, 0f3F800000;
+	sub.f32 	%f133, %f48, %f47;
+	bra.uni 	BB23_70;
+
+BB23_16:
+	setp.eq.s32	%p32, %r11, 3;
+	@%p32 bra 	BB23_17;
+	bra.uni 	BB23_70;
+
+BB23_17:
+	div.rn.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_50:
+	min.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_33:
+	setp.eq.s32	%p20, %r11, 13;
+	@%p20 bra 	BB23_34;
+	bra.uni 	BB23_70;
+
+BB23_34:
+	cvt.rni.s64.f32	%rd12, %f1;
+	cvt.rni.s64.f32	%rd13, %f2;
+	cvt.u32.u64	%r20, %rd12;
+	cvt.u32.u64	%r21, %rd13;
+	and.b32  	%r22, %r21, %r20;
+	setp.eq.s32	%p42, %r22, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p42;
+	bra.uni 	BB23_70;
+
+BB23_53:
+	setp.gtu.f32	%p47, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p47;
+	bra.uni 	BB23_70;
+
+BB23_24:
+	setp.eq.s32	%p27, %r11, 8;
+	@%p27 bra 	BB23_25;
+	bra.uni 	BB23_70;
+
+BB23_25:
+	setp.ltu.f32	%p45, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p45;
+	bra.uni 	BB23_70;
+
+BB23_47:
+	setp.neu.f32	%p40, %f1, 0f00000000;
+	sub.f32 	%f46, %f1, %f2;
+	selp.f32	%f133, %f46, 0f00000000, %p40;
+	bra.uni 	BB23_70;
+
+BB23_41:
+	setp.ne.s32	%p14, %r11, 18;
+	@%p14 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f41, %f133;
+	setp.geu.f32	%p35, %f41, 0f7F800000;
+	@%p35 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f133, %f133;
+	bra.uni 	BB23_70;
+
+BB23_57:
+	setp.geu.f32	%p59, %f1, 0f00000000;
+	@%p59 bra 	BB23_60;
+
+	cvt.rzi.f32.f32	%f127, %f2;
+	setp.neu.f32	%p60, %f127, %f2;
+	selp.f32	%f132, 0f7FFFFFFF, %f132, %p60;
+
+BB23_60:
+	add.f32 	%f129, %f20, %f21;
+	mov.b32 	 %r35, %f129;
+	setp.lt.s32	%p63, %r35, 2139095040;
+	@%p63 bra 	BB23_67;
+
+	setp.gtu.f32	%p64, %f20, 0f7F800000;
+	setp.gtu.f32	%p65, %f21, 0f7F800000;
+	or.pred  	%p66, %p64, %p65;
+	@%p66 bra 	BB23_66;
+	bra.uni 	BB23_62;
+
+BB23_66:
+	add.f32 	%f132, %f1, %f2;
+	bra.uni 	BB23_67;
+
+BB23_62:
+	setp.eq.f32	%p67, %f21, 0f7F800000;
+	@%p67 bra 	BB23_65;
+	bra.uni 	BB23_63;
+
+BB23_65:
+	setp.gt.f32	%p70, %f20, 0f3F800000;
+	selp.b32	%r39, 2139095040, 0, %p70;
+	xor.b32  	%r40, %r39, 2139095040;
+	setp.lt.f32	%p71, %f2, 0f00000000;
+	selp.b32	%r41, %r40, %r39, %p71;
+	mov.b32 	 %f130, %r41;
+	setp.eq.f32	%p72, %f1, 0fBF800000;
+	selp.f32	%f132, 0f3F800000, %f130, %p72;
+	bra.uni 	BB23_67;
+
+BB23_63:
+	setp.neu.f32	%p68, %f20, 0f7F800000;
+	@%p68 bra 	BB23_67;
+
+	setp.ltu.f32	%p69, %f2, 0f00000000;
+	selp.b32	%r36, 0, 2139095040, %p69;
+	or.b32  	%r37, %r36, -2147483648;
+	selp.b32	%r38, %r37, %r36, %p1;
+	mov.b32 	 %f132, %r38;
+
+BB23_67:
+	setp.eq.f32	%p73, %f2, 0f00000000;
+	setp.eq.f32	%p74, %f1, 0f3F800000;
+	or.pred  	%p75, %p74, %p73;
+	selp.f32	%f133, 0f3F800000, %f132, %p75;
+
+BB23_70:
+	cvta.to.global.u64 	%rd14, %rd3;
+	mul.wide.s32 	%rd15, %r3, 4;
+	add.s64 	%rd16, %rd14, %rd15;
+	st.global.f32 	[%rd16], %f133;
+	bar.sync 	0;
+
+BB23_71:
+	ret;
+}
+
+	// .globl	matrix_scalar_op_d
+.visible .entry matrix_scalar_op_d(
+	.param .u64 matrix_scalar_op_d_param_0,
+	.param .f64 matrix_scalar_op_d_param_1,
+	.param .u64 matrix_scalar_op_d_param_2,
+	.param .u32 matrix_scalar_op_d_param_3,
+	.param .u32 matrix_scalar_op_d_param_4,
+	.param .u32 matrix_scalar_op_d_param_5
+)
+{
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<88>;
+	.reg .f64 	%fd<109>;
+	.reg .b64 	%rd<20>;
+
+
+	ld.param.u64 	%rd4, [matrix_scalar_op_d_param_0];
+	ld.param.f64 	%fd68, [matrix_scalar_op_d_param_1];
+	ld.param.u64 	%rd5, [matrix_scalar_op_d_param_2];
+	ld.param.u32 	%r8, [matrix_scalar_op_d_param_3];
+	ld.param.u32 	%r6, [matrix_scalar_op_d_param_4];
+	ld.param.u32 	%r7, [matrix_scalar_op_d_param_5];
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %ctaid.x;
+	mov.u32 	%r11, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r10, %r11;
+	setp.ge.s32	%p3, %r1, %r8;
+	@%p3 bra 	BB24_138;
+
+	cvta.to.global.u64 	%rd6, %rd5;
+	cvta.to.global.u64 	%rd7, %rd4;
+	mul.wide.s32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f64 	%fd1, [%rd9];
+	add.s64 	%rd1, %rd6, %rd8;
+	setp.eq.s32	%p4, %r7, 0;
+	@%p4 bra 	BB24_70;
+
+	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p5, %r6, 8;
+	@%p5 bra 	BB24_19;
+
+	setp.gt.s32	%p19, %r6, 3;
+	@%p19 bra 	BB24_11;
+
+	setp.gt.s32	%p26, %r6, 1;
+	@%p26 bra 	BB24_8;
+
+	setp.eq.s32	%p29, %r6, 0;
+	@%p29 bra 	BB24_68;
+	bra.uni 	BB24_6;
+
+BB24_68:
+	add.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
+
+BB24_70:
+	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p69, %r6, 8;
+	@%p69 bra 	BB24_87;
+
+	setp.gt.s32	%p83, %r6, 3;
+	@%p83 bra 	BB24_79;
+
+	setp.gt.s32	%p90, %r6, 1;
+	@%p90 bra 	BB24_76;
+
+	setp.eq.s32	%p93, %r6, 0;
+	@%p93 bra 	BB24_136;
+	bra.uni 	BB24_74;
+
+BB24_136:
+	add.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_19:
+	setp.gt.s32	%p6, %r6, 13;
+	@%p6 bra 	BB24_28;
+
+	setp.gt.s32	%p13, %r6, 10;
+	@%p13 bra 	BB24_24;
+
+	setp.eq.s32	%p17, %r6, 9;
+	@%p17 bra 	BB24_48;
+	bra.uni 	BB24_22;
+
+BB24_48:
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB24_69;
+
+BB24_87:
+	setp.gt.s32	%p70, %r6, 13;
+	@%p70 bra 	BB24_96;
+
+	setp.gt.s32	%p77, %r6, 10;
+	@%p77 bra 	BB24_92;
+
+	setp.eq.s32	%p81, %r6, 9;
+	@%p81 bra 	BB24_116;
+	bra.uni 	BB24_90;
+
+BB24_116:
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB24_137;
+
+BB24_11:
+	setp.gt.s32	%p20, %r6, 5;
+	@%p20 bra 	BB24_15;
+
+	setp.eq.s32	%p24, %r6, 4;
+	@%p24 bra 	BB24_51;
+	bra.uni 	BB24_13;
+
+BB24_51:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd68;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd1;
+	}
+	bfe.u32 	%r24, %r3, 20, 11;
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p51, %rd2, -9223372036854775808;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1310,69 +2392,69 @@ BB12_49:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd97, [retval0+0];
+	ld.param.f64	%fd98, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p52, %r2, 0;
-	and.pred  	%p1, %p52, %p51;
-	@!%p1 bra 	BB12_51;
-	bra.uni 	BB12_50;
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;
+	@!%p1 bra 	BB24_53;
+	bra.uni 	BB24_52;
 
-BB12_50:
+BB24_52:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd97;
+	mov.b64 	{%temp, %r26}, %fd98;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd97;
+	mov.b64 	{%r28, %temp}, %fd98;
 	}
-	mov.b64 	%fd97, {%r28, %r27};
+	mov.b64 	%fd98, {%r28, %r27};
 
-BB12_51:
-	mov.f64 	%fd96, %fd97;
-	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
-	@%p53 bra 	BB12_54;
-	bra.uni 	BB12_52;
+BB24_53:
+	mov.f64 	%fd97, %fd98;
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB24_56;
+	bra.uni 	BB24_54;
 
-BB12_54:
-	selp.b32	%r29, %r2, 0, %p51;
+BB24_56:
+	selp.b32	%r29, %r2, 0, %p49;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p57, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p57;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p55;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd96, {%r32, %r31};
-	bra.uni 	BB12_55;
+	mov.b64 	%fd97, {%r32, %r31};
+	bra.uni 	BB24_57;
 
-BB12_28:
+BB24_28:
 	setp.gt.s32	%p7, %r6, 15;
-	@%p7 bra 	BB12_32;
+	@%p7 bra 	BB24_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB12_43;
-	bra.uni 	BB12_30;
+	@%p11 bra 	BB24_45;
+	bra.uni 	BB24_30;
 
-BB12_43:
+BB24_45:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p43, %r20, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p41, %r20, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB24_69;
 
-BB12_75:
-	setp.gt.s32	%p88, %r6, 5;
-	@%p88 bra 	BB12_79;
+BB24_79:
+	setp.gt.s32	%p84, %r6, 5;
+	@%p84 bra 	BB24_83;
 
-	setp.eq.s32	%p92, %r6, 4;
-	@%p92 bra 	BB12_113;
-	bra.uni 	BB12_77;
+	setp.eq.s32	%p88, %r6, 4;
+	@%p88 bra 	BB24_119;
+	bra.uni 	BB24_81;
 
-BB12_113:
+BB24_119:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1381,11 +2463,11 @@ BB12_113:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r61, %r5, 20, 11;
-	add.s32 	%r62, %r61, -1012;
+	bfe.u32 	%r62, %r5, 20, 11;
+	add.s32 	%r63, %r62, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r62;
-	setp.eq.s64	%p119, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r63;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1402,621 +2484,1482 @@ BB12_113:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd105, [retval0+0];
+	ld.param.f64	%fd107, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p120, %r4, 0;
-	and.pred  	%p2, %p120, %p119;
-	@!%p2 bra 	BB12_115;
-	bra.uni 	BB12_114;
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;
+	@!%p2 bra 	BB24_121;
+	bra.uni 	BB24_120;
 
-BB12_114:
+BB24_120:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r63}, %fd105;
+	mov.b64 	{%temp, %r64}, %fd107;
 	}
-	xor.b32  	%r64, %r63, -2147483648;
+	xor.b32  	%r65, %r64, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r65, %temp}, %fd105;
+	mov.b64 	{%r66, %temp}, %fd107;
 	}
-	mov.b64 	%fd105, {%r65, %r64};
-
-BB12_115:
-	mov.f64 	%fd104, %fd105;
-	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
-	@%p121 bra 	BB12_118;
-	bra.uni 	BB12_116;
-
-BB12_118:
-	selp.b32	%r66, %r4, 0, %p119;
-	or.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p125, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p125;
-	mov.u32 	%r69, 0;
-	mov.b64 	%fd104, {%r69, %r68};
-	bra.uni 	BB12_119;
-
-BB12_92:
-	setp.gt.s32	%p75, %r6, 15;
-	@%p75 bra 	BB12_96;
-
-	setp.eq.s32	%p79, %r6, 14;
-	@%p79 bra 	BB12_107;
-	bra.uni 	BB12_94;
-
-BB12_107:
+	mov.b64 	%fd107, {%r66, %r65};
+
+BB24_121:
+	mov.f64 	%fd106, %fd107;
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB24_124;
+	bra.uni 	BB24_122;
+
+BB24_124:
+	selp.b32	%r67, %r4, 0, %p113;
+	or.b32  	%r68, %r67, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r69, %r68, %r67, %p119;
+	mov.u32 	%r70, 0;
+	mov.b64 	%fd106, {%r70, %r69};
+	bra.uni 	BB24_125;
+
+BB24_96:
+	setp.gt.s32	%p71, %r6, 15;
+	@%p71 bra 	BB24_100;
+
+	setp.eq.s32	%p75, %r6, 14;
+	@%p75 bra 	BB24_113;
+	bra.uni 	BB24_98;
+
+BB24_113:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r55, %rd15;
-	cvt.u32.u64	%r56, %rd16;
-	or.b32  	%r57, %r56, %r55;
-	setp.eq.s32	%p111, %r57, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-	bra.uni 	BB12_129;
-
-BB12_8:
+	cvt.u32.u64	%r56, %rd15;
+	cvt.u32.u64	%r57, %rd16;
+	or.b32  	%r58, %r57, %r56;
+	setp.eq.s32	%p105, %r58, 0;
+	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB24_137;
+
+BB24_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB12_63;
-	bra.uni 	BB12_9;
+	@%p27 bra 	BB24_67;
+	bra.uni 	BB24_9;
 
-BB12_63:
-	mul.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+BB24_67:
+	mul.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
 
-BB12_24:
+BB24_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB12_45;
+	@%p14 bra 	BB24_47;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB12_44;
-	bra.uni 	BB12_26;
+	@%p15 bra 	BB24_46;
+	bra.uni 	BB24_26;
 
-BB12_44:
-	max.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_46:
+	max.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_15:
+BB24_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB12_48;
+	@%p21 bra 	BB24_50;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB12_47;
-	bra.uni 	BB12_17;
+	@%p22 bra 	BB24_49;
+	bra.uni 	BB24_17;
 
-BB12_47:
-	setp.lt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB12_65;
+BB24_49:
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB24_69;
 
-BB12_32:
+BB24_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB12_42;
+	@%p8 bra 	BB24_44;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB12_38;
-	bra.uni 	BB12_34;
+	@%p9 bra 	BB24_39;
+	bra.uni 	BB24_34;
 
-BB12_38:
-	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
-	or.pred  	%p37, %p35, %p36;
-	mov.f64 	%fd98, 0d7FF8000000000000;
-	@%p37 bra 	BB12_65;
+BB24_39:
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd99, 0d7FF8000000000000;
+	@%p36 bra 	BB24_69;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd72, %fd98;
-	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
-	@%p38 bra 	BB12_65;
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	abs.f64 	%fd72, %fd99;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
+	@%p37 bra 	BB24_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r15, %temp}, %fd98;
+	mov.b64 	{%temp, %r15}, %fd99;
 	}
+	and.b32  	%r16, %r15, 2147483647;
+	setp.ne.s32	%p38, %r16, 2146435072;
+	@%p38 bra 	BB24_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r16}, %fd98;
+	mov.b64 	{%r17, %temp}, %fd99;
 	}
-	and.b32  	%r17, %r16, 2147483647;
-	setp.ne.s32	%p39, %r17, 2146435072;
-	setp.ne.s32	%p40, %r15, 0;
-	or.pred  	%p41, %p39, %p40;
-	@!%p41 bra 	BB12_65;
-	bra.uni 	BB12_41;
-
-BB12_41:
-	cvt.rmi.f64.f64	%fd73, %fd98;
+	setp.eq.s32	%p39, %r17, 0;
+	@%p39 bra 	BB24_69;
+
+BB24_43:
+	cvt.rmi.f64.f64	%fd73, %fd99;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd98, %fd68, %fd74;
-	bra.uni 	BB12_65;
-
-BB12_72:
-	setp.eq.s32	%p95, %r6, 2;
-	@%p95 bra 	BB12_127;
-	bra.uni 	BB12_73;
-
-BB12_127:
-	mul.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_88:
-	setp.eq.s32	%p82, %r6, 11;
-	@%p82 bra 	BB12_109;
-
-	setp.eq.s32	%p83, %r6, 12;
-	@%p83 bra 	BB12_108;
-	bra.uni 	BB12_90;
-
-BB12_108:
-	max.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_79:
-	setp.eq.s32	%p89, %r6, 6;
-	@%p89 bra 	BB12_112;
-
-	setp.eq.s32	%p90, %r6, 7;
-	@%p90 bra 	BB12_111;
-	bra.uni 	BB12_81;
-
-BB12_111:
-	setp.gt.f64	%p116, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-	bra.uni 	BB12_129;
-
-BB12_96:
-	setp.eq.s32	%p76, %r6, 16;
-	@%p76 bra 	BB12_106;
-
-	setp.eq.s32	%p77, %r6, 17;
-	@%p77 bra 	BB12_102;
-	bra.uni 	BB12_98;
-
-BB12_102:
-	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
-	or.pred  	%p105, %p103, %p104;
-	mov.f64 	%fd106, 0d7FF8000000000000;
-	@%p105 bra 	BB12_129;
-
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd83, %fd106;
-	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
-	@%p106 bra 	BB12_129;
+	sub.f64 	%fd99, %fd68, %fd74;
+	bra.uni 	BB24_69;
+
+BB24_76:
+	setp.eq.s32	%p91, %r6, 2;
+	@%p91 bra 	BB24_135;
+	bra.uni 	BB24_77;
+
+BB24_135:
+	mul.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_92:
+	setp.eq.s32	%p78, %r6, 11;
+	@%p78 bra 	BB24_115;
+
+	setp.eq.s32	%p79, %r6, 12;
+	@%p79 bra 	BB24_114;
+	bra.uni 	BB24_94;
+
+BB24_114:
+	max.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_83:
+	setp.eq.s32	%p85, %r6, 6;
+	@%p85 bra 	BB24_118;
+
+	setp.eq.s32	%p86, %r6, 7;
+	@%p86 bra 	BB24_117;
+	bra.uni 	BB24_85;
+
+BB24_117:
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB24_137;
+
+BB24_100:
+	setp.eq.s32	%p72, %r6, 16;
+	@%p72 bra 	BB24_112;
+
+	setp.eq.s32	%p73, %r6, 17;
+	@%p73 bra 	BB24_107;
+	bra.uni 	BB24_102;
+
+BB24_107:
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd108, 0d7FF8000000000000;
+	@%p100 bra 	BB24_137;
+
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	abs.f64 	%fd83, %fd108;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
+	@%p101 bra 	BB24_137;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd106;
+	mov.b64 	{%temp, %r53}, %fd108;
 	}
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p102, %r54, 2146435072;
+	@%p102 bra 	BB24_111;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd106;
+	mov.b64 	{%r55, %temp}, %fd108;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p107, %r54, 2146435072;
-	setp.ne.s32	%p108, %r52, 0;
-	or.pred  	%p109, %p107, %p108;
-	@!%p109 bra 	BB12_129;
-	bra.uni 	BB12_105;
-
-BB12_105:
-	cvt.rmi.f64.f64	%fd84, %fd106;
+	setp.eq.s32	%p103, %r55, 0;
+	@%p103 bra 	BB24_137;
+
+BB24_111:
+	cvt.rmi.f64.f64	%fd84, %fd108;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd106, %fd1, %fd85;
-	bra.uni 	BB12_129;
+	sub.f64 	%fd108, %fd1, %fd85;
+	bra.uni 	BB24_137;
 
-BB12_6:
+BB24_6:
 	setp.eq.s32	%p30, %r6, 1;
-	@%p30 bra 	BB12_7;
-	bra.uni 	BB12_65;
+	@%p30 bra 	BB24_7;
+	bra.uni 	BB24_69;
 
-BB12_7:
-	sub.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_7:
+	sub.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_22:
+BB24_22:
 	setp.eq.s32	%p18, %r6, 10;
-	@%p18 bra 	BB12_23;
-	bra.uni 	BB12_65;
+	@%p18 bra 	BB24_23;
+	bra.uni 	BB24_69;
 
-BB12_23:
-	setp.neu.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB12_65;
+BB24_23:
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB24_69;
 
-BB12_13:
+BB24_13:
 	setp.eq.s32	%p25, %r6, 5;
-	@%p25 bra 	BB12_14;
-	bra.uni 	BB12_65;
+	@%p25 bra 	BB24_14;
+	bra.uni 	BB24_69;
 
-BB12_14:
-	setp.gt.f64	%p50, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB12_65;
+BB24_14:
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB24_69;
 
-BB12_30:
+BB24_30:
 	setp.eq.s32	%p12, %r6, 15;
-	@%p12 bra 	BB12_31;
-	bra.uni 	BB12_65;
+	@%p12 bra 	BB24_31;
+	bra.uni 	BB24_69;
 
-BB12_31:
+BB24_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd98, %fd77, %fd76;
-	bra.uni 	BB12_65;
+	sub.f64 	%fd99, %fd77, %fd76;
+	bra.uni 	BB24_69;
 
-BB12_9:
+BB24_9:
 	setp.eq.s32	%p28, %r6, 3;
-	@%p28 bra 	BB12_10;
-	bra.uni 	BB12_65;
+	@%p28 bra 	BB24_10;
+	bra.uni 	BB24_69;
 
-BB12_10:
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_10:
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_45:
-	min.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_47:
+	min.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_26:
+BB24_26:
 	setp.eq.s32	%p16, %r6, 13;
-	@%p16 bra 	BB12_27;
-	bra.uni 	BB12_65;
+	@%p16 bra 	BB24_27;
+	bra.uni 	BB24_69;
 
-BB12_27:
+BB24_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p44, %r23, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p42, %r23, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB24_69;
 
-BB12_48:

<TRUNCATED>

[4/4] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by ni...@apache.org.
[SYSTEMML-1969] Support single-precision operations on GPU backend

- Since single-precision operations are faster on most GPUs, we should allow our users to perform GPU instructions in single precision.
- The GPU backend has been refactored to support arbitrary precision (an illustrative dispatch sketch follows the file list below).
- This feature can be enabled via the configuration property sysml.floating.point.precision (see the configuration sketch after this list).
- The valid values for this property are double and float. We can support half/mixed precision in a separate PR.
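
A minimal configuration sketch (assuming the property is read from SystemML-config.xml, as the template change further below shows) that requests single precision:

    <!-- hypothetical user config; the template added in this commit lists
         double and single as the supported values -->
    <sysml.floating.point.precision>single</sysml.floating.point.precision>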

Closes #688.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/abbffc55
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/abbffc55
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/abbffc55

Branch: refs/heads/master
Commit: abbffc55ef8f47f10b6e59b0ae5e1f311f4a8f3e
Parents: 881caa9
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 19:25:20 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 19:26:50 2017 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |    3 +
 src/main/cpp/kernels/SystemML.cu                | 1874 ++--
 src/main/cpp/kernels/SystemML.ptx               | 9579 ++++++++++++++----
 .../java/org/apache/sysml/api/DMLScript.java    |    1 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |    4 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |    4 +-
 .../controlprogram/caching/CacheableData.java   |    4 +-
 .../instructions/gpu/context/CSRPointer.java    |   52 +-
 .../instructions/gpu/context/GPUContext.java    |   31 +-
 .../instructions/gpu/context/GPUObject.java     |   91 +-
 .../instructions/gpu/context/JCudaKernels.java  |    9 +-
 .../matrix/data/CudaSupportFunctions.java       |   87 +
 .../DoublePrecisionCudaSupportFunctions.java    |  175 +
 .../runtime/matrix/data/LibMatrixCUDA.java      |  144 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   38 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |    5 +-
 .../data/LibMatrixCuDNNInputRowFetcher.java     |    8 +-
 .../data/LibMatrixCuDNNPoolingDescriptors.java  |    3 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java |   34 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |    5 +-
 .../SinglePrecisionCudaSupportFunctions.java    |  208 +
 .../org/apache/sysml/test/gpu/GPUTests.java     |   20 +-
 .../test/gpu/MatrixMultiplicationOpTest.java    |   22 +-
 23 files changed, 9423 insertions(+), 2978 deletions(-)
----------------------------------------------------------------------
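
The file list above shows the shape of the refactoring: precision-specific support classes (DoublePrecisionCudaSupportFunctions, SinglePrecisionCudaSupportFunctions) plus "_d"/"_f" variants of each CUDA kernel. The Java sketch below only illustrates that dispatch idea; every name in it is hypothetical and is not the actual SystemML API.

    // Illustrative only: hypothetical names, not SystemML classes.
    public class PrecisionDispatchSketch {

      // Minimal stand-in for a precision-specific support layer.
      interface CudaSupport {
        int sizeOfDataType();   // bytes per matrix element (8 for double, 4 for float)
        String kernelSuffix();  // selects the "_d" or "_f" kernel variant
      }

      static final CudaSupport DOUBLE_SUPPORT = new CudaSupport() {
        public int sizeOfDataType() { return 8; }
        public String kernelSuffix() { return "_d"; }
      };

      static final CudaSupport SINGLE_SUPPORT = new CudaSupport() {
        public int sizeOfDataType() { return 4; }
        public String kernelSuffix() { return "_f"; }
      };

      // Pick the support layer from the configured precision value.
      static CudaSupport fromConfig(String precision) {
        return "single".equalsIgnoreCase(precision) ? SINGLE_SUPPORT : DOUBLE_SUPPORT;
      }

      public static void main(String[] args) {
        CudaSupport s = fromConfig("single");
        System.out.println("relu" + s.kernelSuffix() + ", element size = " + s.sizeOfDataType());
      }
    }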


http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 511e215..8452e75 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -93,6 +93,9 @@
     <!-- whether to perform eager CUDA free on rmvar instruction -->
     <sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
    
+    <!-- the floating point precision. supported values are double, single -->
+    <sysml.floating.point.precision>double</sysml.floating.point.precision>
+    
    <!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
 </root>

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index c243564..d176f8f 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -26,11 +26,28 @@ nvcc -ptx -arch=sm_30 SystemML.cu
 #include <cfloat>
 #include <cmath>
 
+extern "C" __global__ void double2float_f(double *A, float *ret, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < N) {
+  	// TODO: Use __double2float_rd or __double2float_rn  or __double2float_ru or __double2float_rz after 
+    ret[tid] = (float)A[tid];
+  }
+}
+
+extern "C" __global__ void float2double_f(float *A, double *ret, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < N) {
+    ret[tid] = (double)A[tid];
+  }
+}
+
 /**
- * Performs a slice operation where the input matrix is sparse and the output matrix is dense.
- * This function avoids unnecessary sparse to dense conversion of the input matrix.
+ * Performs a slice operation where the input matrix is sparse and the output
+ * matrix is dense.
+ * This function avoids unnecessary sparse to dense conversion of the input
+ * matrix.
  * Parallelization: rows of output matrix.
- * 
+ *
  * @params inVal input val pointer
  * @params inRowPtr input row pointer
  * @params colInd input col index pointer
@@ -41,49 +58,73 @@ nvcc -ptx -arch=sm_30 SystemML.cu
  * @param cu column upper
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_sparse_dense_row(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-    int rl, int ru, int cl, int cu, int retClen) {
-  	int index = blockIdx.x * blockDim.x + threadIdx.x;
-	int rowIndex = index + rl;
-  	if (rowIndex <= ru){
-  		/*
-		 * TODO: Alternative approach: use dynamic parallelism. We are skipping this for now to avoid
-		 * the complexity of two-step separate compilation and linking process.
-		 *  
-		 * extern "C"
-		 * __global__ void slice_sparse_dense_row_helper(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-		 *     int rl, int ru, int cl, int cu, int retClen, int start, int end, int index) {
-		 *  int i = blockIdx.x * blockDim.x + threadIdx.x + start;   
-		 * 	// Only slice if the index falls into the given range
-		 * 	if(i < end && cl <= colInd[i] && colInd[i] <= cu) {
-		 * 		ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
-		 * 	}
-		 * }
-		 *
-		 * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex];
-		 * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL);
-		 * slice_sparse_dense_row_helper<<< ceil(numThreads/ MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr, colInd, ret, 
-    	 *			rl, ru, cl, cu, retClen, inRowPtr[rowIndex], inRowPtr[rowIndex+1], index);
-    	 *
-    	 * Two-step compilation and linking process in JCudaKernels's constructor:
-    	 * cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
-		 */
-    	// Iterate over elements of the row 'rowIndex'.
-    	for(int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex+1]; i++) {
-    		// Only slice if the index falls into the given range
-    		if(cl <= colInd[i] && colInd[i] <= cu) {
-    			ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
-    		}
-    	}
+template <typename T>
+__device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd,
+                                       T *ret, int rl, int ru, int cl, int cu,
+                                       int retClen) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int rowIndex = index + rl;
+  if (rowIndex <= ru) {
+    /*
+     * TODO: Alternative approach: use dynamic parallelism. We are skipping this
+*for now to avoid
+     * the complexity of two-step separate compilation and linking process.
+     *
+     * extern "C"
+     * __global__ void slice_sparse_dense_row_helper(double* inVal, int*
+*inRowPtr, int* colInd, double* ret,
+     *     int rl, int ru, int cl, int cu, int retClen, int start, int end, int
+*index) {
+     *  int i = blockIdx.x * blockDim.x + threadIdx.x + start;
+     * 	// Only slice if the index falls into the given range
+     * 	if(i < end && cl <= colInd[i] && colInd[i] <= cu) {
+     * 		ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
+     * 	}
+     * }
+     *
+     * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex];
+     * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL);
+     * slice_sparse_dense_row_helper<<< ceil(numThreads/
+*MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr,
+*colInd, ret,
+*			rl, ru, cl, cu, retClen, inRowPtr[rowIndex],
+*inRowPtr[rowIndex+1], index);
+*
+* Two-step compilation and linking process in JCudaKernels's constructor:
+* cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY,
+*"/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
+     */
+    // Iterate over elements of the row 'rowIndex'.
+    for (int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex + 1]; i++) {
+      // Only slice if the index falls into the given range
+      if (cl <= colInd[i] && colInd[i] <= cu) {
+        ret[index * retClen + (colInd[i] - cl)] = inVal[i];
+      }
     }
+  }
+}
+
+extern "C" __global__ void slice_sparse_dense_row_d(double *inVal, int *inRowPtr,
+                                                   int *colInd, double *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
+}
+
+extern "C" __global__ void slice_sparse_dense_row_f(float *inVal, int *inRowPtr,
+                                                   int *colInd, float *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 /**
- * Performs a slice operation where the input matrix is sparse and the output matrix is dense.
- * This function avoids unnecessary sparse to dense conversion of the input matrix.
+ * Performs a slice operation where the input matrix is sparse and the output
+ * matrix is dense.
+ * This function avoids unnecessary sparse to dense conversion of the input
+ * matrix.
  * Parallelization: subset of number of non-zeroes of input matrix.
- * 
+ *
  * @params inVal input val pointer
  * @params inRowPtr input row pointer
  * @params colInd input col index pointer
@@ -94,26 +135,42 @@ __global__ void slice_sparse_dense_row(double* inVal, int* inRowPtr, int* colInd
  * @param cu column upper
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_sparse_dense_nnz(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-    int rl, int ru, int cl, int cu, int retClen) {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = tid + inRowPtr[rl];
-    
-    // Only slice if the index falls into the given range
-    if(i < inRowPtr[ru+1] && cl <= colInd[i] && colInd[i] <= cu) {
-    	// Find the row index for corresponding non-zero value 'i'.
-    	int rowIndex = rl;
-    	while(inRowPtr[rowIndex+1] <= i) {
-    		rowIndex++;
-    	}
-	    ret[ (rowIndex-rl)*retClen + (colInd[i] - cl) ] = inVal[i];
+template <typename T>
+__device__ void slice_sparse_dense_nnz(T *inVal, int *inRowPtr, int *colInd,
+                                       T *ret, int rl, int ru, int cl, int cu,
+                                       int retClen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = tid + inRowPtr[rl];
+
+  // Only slice if the index falls into the given range
+  if (i < inRowPtr[ru + 1] && cl <= colInd[i] && colInd[i] <= cu) {
+    // Find the row index for corresponding non-zero value 'i'.
+    int rowIndex = rl;
+    while (inRowPtr[rowIndex + 1] <= i) {
+      rowIndex++;
     }
+    ret[(rowIndex - rl) * retClen + (colInd[i] - cl)] = inVal[i];
+  }
+}
+
+extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal, int *inRowPtr,
+                                                   int *colInd, double *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
+}
+
+extern "C" __global__ void slice_sparse_dense_nnz_f(float *inVal, int *inRowPtr,
+                                                   int *colInd, float *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 /**
- * Performs a slice operation where the input matrix is dense and the output matrix is dense.
- * 
+ * Performs a slice operation where the input matrix is dense and the output
+ * matrix is dense.
+ *
  * @params in dense input pointer
  * @params ret dense output pointer
  * @param rl row lower
@@ -124,17 +181,31 @@ __global__ void slice_sparse_dense_nnz(double* inVal, int* inRowPtr, int* colInd
  * @param retRlen number of rows of output matrix
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int cl, int cu, int inClen, int retRlen, int retClen) {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / retClen;
-	int iy = tid % retClen;
-	if(ix < retRlen && iy < retClen) {
-	    int inIndex = (ix + rl)*inClen + cl + iy;
-		ret[tid] = in[inIndex];
-	}
+template <typename T>
+__device__ void slice_dense_dense(T *in, T *ret, int rl, int ru, int cl, int cu,
+                                  int inClen, int retRlen, int retClen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / retClen;
+  int iy = tid % retClen;
+  if (ix < retRlen && iy < retClen) {
+    int inIndex = (ix + rl) * inClen + cl + iy;
+    ret[tid] = in[inIndex];
+  }
+}
+
+extern "C" __global__ void slice_dense_dense_d(double *in, double *ret, int rl,
+                                              int ru, int cl, int cu,
+                                              int inClen, int retRlen,
+                                              int retClen) {
+  slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
 }
 
+extern "C" __global__ void slice_dense_dense_f(float *in, float *ret, int rl,
+                                              int ru, int cl, int cu,
+                                              int inClen, int retRlen,
+                                              int retClen) {
+  slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
+}
 
 /**
  * Does a copy of upper to lower triangle of the given matrix
@@ -142,95 +213,161 @@ __global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int c
  * @param dim the number of rows of the square matrix ret
  * @param N total number of elements of the matrix
  */
-extern "C"
-__global__ void copy_u2l_dense(double* ret, int dim, int N) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / dim;
-	int iy = tid % dim;
-	int id_dest = iy * dim + ix;
-	if(iy > ix && id_dest < N) {
-		// TODO: Potential to reduce the number of threads by half
-		int id_src = tid;
-		ret[id_dest] = ret[id_src];
-	}
-}
-
-extern "C"
-__forceinline__ __device__ double getBoolean(int val) {
-	if(val == 0)
-		return 0.0;
-	else
-		return 1.0;
+template <typename T>
+__device__ void copy_u2l_dense(T *ret, int dim, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / dim;
+  int iy = tid % dim;
+  int id_dest = iy * dim + ix;
+  if (iy > ix && id_dest < N) {
+    // TODO: Potential to reduce the number of threads by half
+    int id_src = tid;
+    ret[id_dest] = ret[id_src];
+  }
+}
+
+extern "C" __global__ void copy_u2l_dense_d(double *ret, int dim, int N) {
+  copy_u2l_dense(ret, dim, N);
+}
+
+extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) {
+  copy_u2l_dense(ret, dim, N);
+}
+
+// Use this method in templates to fetch the maximum value for a given datatype
+template <typename T>
+__forceinline__ __device__ T T_MAX(T x) {
+  return (T)DBL_MAX;
+}
+template <>
+__forceinline__ __device__ float T_MAX(float x) {
+  return FLT_MAX;
+}
+template <>
+__forceinline__ __device__ double T_MAX(double x) {
+  return DBL_MAX;
 }
 
 // op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
 // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz,
 // 17=modulus, 18=integer division}
-extern "C"
-__forceinline__ __device__ double binaryOp(double x, double y, int op) {
-	switch(op) {
-        case 0 : return x + y;
-        case 1 : return x - y;
-        case 2 : return x * y;
-        case 3 : return x / y;
-        case 4 : return pow(x, y);
-        case 5 : return getBoolean(x < y);
-        case 6 : return getBoolean(x <= y);
-        case 7 : return getBoolean(x > y);
-        case 8 : return getBoolean(x >= y);
-        case 9 : return getBoolean(x == y);
-        case 10 : return getBoolean(x != y);
-        case 11 : return min(x, y);
-        case 12 : return max(x, y);
-        case 13 : return getBoolean((int)llrint(x) & (int)llrint(y));
-        case 14 : return getBoolean((int)llrint(x) | (int)llrint(y));
-        case 15 : return 1 - x * y;
-        case 16 : return (x != 0.0 ? x - y : 0.0);
-        case 17 : {
-            if (y == 0.0 || y == -0.0){
-                return nan("");
-            }
-            double v = x / y;
-            // Check for v being NaN (v != v) or if it is infinity
-            if (isnan(v) || isinf(v)){
-                return v;
-            } else {
-                v = floor(v);
-            }
-            return x - v * y;
-        }
-        case 18:{
-            double v = x / y;
-            if (isnan(v) || isinf(v)){
-                return v;
-            } else {
-                return floor(v);
-            }
-        }
-        default : return DBL_MAX;
+template <typename T>
+__forceinline__ __device__ T binaryOp(T x, T y, int op) {
+  switch (op) {
+    case 0:
+      return x + y;
+    case 1:
+      return x - y;
+    case 2:
+      return x * y;
+    case 3:
+      return x / y;
+    case 4:
+      return pow(x, y);
+    case 5:
+      return (x < y) == 0 ? 0.0 : 1.0;
+    case 6:
+      return (x <= y) == 0 ? 0.0 : 1.0;
+    case 7:
+      return (x > y) == 0 ? 0.0 : 1.0;
+    case 8:
+      return (x >= y) == 0 ? 0.0 : 1.0;
+    case 9:
+      return (x == y) == 0 ? 0.0 : 1.0;
+    case 10:
+      return (x != y) == 0 ? 0.0 : 1.0;
+    case 11:
+      return min(x, y);
+    case 12:
+      return max(x, y);
+    case 13:
+      return ((int)llrint(x) & (int)llrint(y)) == 0 ? 0.0 : 1.0;
+    case 14:
+      return ((int)llrint(x) | (int)llrint(y)) == 0 ? 0.0 : 1.0;
+    case 15:
+      return 1 - x * y;
+    case 16:
+      return (x != 0.0 ? x - y : 0.0);
+    case 17: {
+      if (y == 0.0 || y == -0.0) {
+        return nan("");
+      }
+      T v = x / y;
+      // Check for v being NaN (v != v) or if it is infinity
+      if (isnan(v) || isinf(v)) {
+        return v;
+      } else {
+        v = floor(v);
+      }
+      return x - v * y;
     }
+    case 18: {
+      T v = x / y;
+      if (isnan(v) || isinf(v)) {
+        return v;
+      } else {
+        return floor(v);
+      }
+    }
+    default:
+      return T_MAX(x);
+  }
 }
 
-extern "C"
-__global__ void relu(double* A,  double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] = max(0.0, A[tid]);
-	}
+/**
+ * Performs forward pass for relu: ret = max(A, 0)
+ *
+ * @param A input array allocated on the GPU
+ * @param ret output array allocated on the GPU
+ * @param rlen the number of rows
+ * @param clen the number of columns
+ */
+template <typename T>
+__device__ void relu(T *A, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] = max(0.0, A[tid]);
+  }
+}
+
+extern "C" __global__ void relu_d(double *A, double *ret, int rlen, int clen) {
+  relu(A, ret, rlen, clen);
 }
 
-// This method computes the backpropagation errors for previous layer of relu operation
-extern "C"
-__global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] = X[tid] > 0 ?  dout[tid] : 0;
-	}
+extern "C" __global__ void relu_f(float *A, float *ret, int rlen, int clen) {
+  relu(A, ret, rlen, clen);
+}
+
+/**
+ * This method computes the backpropagation errors for previous layer of relu operation
+ *
+ * @param X input activation array allocated on the GPU
+ * @param dout errors from previous layer
+ * @param ret output array allocated on the GPU
+ * @param rlen the number of rows
+ * @param clen the number of columns
+ */
+template <typename T>
+__device__ void relu_backward(T *X, T *dout, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] = X[tid] > 0 ? dout[tid] : 0;
+  }
+}
+
+extern "C" __global__ void relu_backward_d(double *X, double *dout, double *ret,
+                                          int rlen, int clen) {
+  relu_backward(X, dout, ret, rlen, clen);
+}
+
+extern "C" __global__ void relu_backward_f(float *X, float *dout, float *ret,
+                                          int rlen, int clen) {
+  relu_backward(X, dout, ret, rlen, clen);
 }
 
 /**
@@ -241,81 +378,113 @@ __global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, i
  * @param rlen the number of rows
  * @param clen the number of columns
  */
-extern "C"
-__global__ void inplace_add(double* input,  double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] += input[tid];
-	}
+template <typename T>
+__device__ void inplace_add(T *input, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] += input[tid];
+  }
+}
+
+extern "C" __global__ void inplace_add_d(double *input, double *ret, int rlen,
+                                        int clen) {
+  inplace_add(input, ret, rlen, clen);
+}
+
+extern "C" __global__ void inplace_add_f(float *input, float *ret, int rlen,
+                                        int clen) {
+  inplace_add(input, ret, rlen, clen);
 }
 
 // Performs the operation corresponding to the DML script:
 // ones = matrix(1, rows=1, cols=Hout*Wout)
 // output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
-// This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
-extern "C"
-__global__ void bias_add(double* input,  double* bias, double* ret, int rlen, int clen, int PQ) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		int biasIndex = iy / PQ;
-		ret[tid] = input[tid] + bias[biasIndex];
-	}
+// This operation is often followed by conv2d and hence we have introduced
+// bias_add(input, bias) built-in function
+template <typename T>
+__device__ void bias_add(T *input, T *bias, T *ret, int rlen, int clen,
+                         int PQ) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    int biasIndex = iy / PQ;
+    ret[tid] = input[tid] + bias[biasIndex];
+  }
+}
+
+extern "C" __global__ void bias_add_d(double *input, double *bias, double *ret,
+                                     int rlen, int clen, int PQ) {
+  bias_add(input, bias, ret, rlen, clen, PQ);
+}
+
+extern "C" __global__ void bias_add_f(float *input, float *bias, float *ret,
+                                     int rlen, int clen, int PQ) {
+  bias_add(input, bias, ret, rlen, clen, PQ);
 }
 
 // Performs the operation "ret <- A + alpha*B", where B is a vector
-extern "C"
-__global__ void daxpy_matrix_vector(double* A,  double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clenA;
-	int iy = tid % clenA;
-	if(ix < rlenA && iy < clenA) {
-		int index = ix * clenA + iy;
-		if(rlenB == 1) {
-			ret[index] = A[index] + alpha*B[iy];
-		}
-		else {
-			ret[index] = A[index] + alpha*B[ix];
-		}
-	}
-}
-
-// Performs similar operation as bias_add except elementwise multiplication instead of add
-extern "C"
-__global__ void bias_multiply(double* input,  double* bias, double* ret, int rlen, int clen, int PQ) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		int biasIndex = iy / PQ;
-		ret[tid] = input[tid] * bias[biasIndex];
-	}
-}
-
-// Compares the value and set
-extern "C"
-__global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	int index = ix * clen + iy;
-	if(ix < rlen && iy < clen) {
-		if(abs(A[index]-compareVal) < tol)
-			ret[index] = ifEqualsVal;
-		else if(A[index] < compareVal)
-			ret[index] = ifLessThanVal;
-		else
-			ret[index] = ifGreaterThanVal;
-	}
+template <typename T>
+__device__ void daxpy_matrix_vector(T *A, T *B, double alpha, T *ret, int rlenA,
+                                    int clenA, int rlenB, int clenB) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clenA;
+  int iy = tid % clenA;
+  if (ix < rlenA && iy < clenA) {
+    int index = ix * clenA + iy;
+    if (rlenB == 1) {
+      ret[index] = A[index] + alpha * B[iy];
+    } else {
+      ret[index] = A[index] + alpha * B[ix];
+    }
+  }
 }
 
+extern "C" __global__ void daxpy_matrix_vector_d(double *A, double *B,
+                                                double alpha, double *ret,
+                                                int rlenA, int clenA, int rlenB,
+                                                int clenB) {
+  daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
+}
+
+extern "C" __global__ void daxpy_matrix_vector_f(float *A, float *B,
+                                                double alpha, float *ret,
+                                                int rlenA, int clenA, int rlenB,
+                                                int clenB) {
+  daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
+}
+
+// Performs similar operation as bias_add except elementwise multiplication
+// instead of add
+template <typename T>
+__device__ void bias_multiply(T *input, T *bias, T *ret, int rlen, int clen,
+                              int PQ) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    int biasIndex = iy / PQ;
+    ret[tid] = input[tid] * bias[biasIndex];
+  }
+}
+
+extern "C" __global__ void bias_multiply_d(double *input, double *bias,
+                                          double *ret, int rlen, int clen,
+                                          int PQ) {
+  bias_multiply(input, bias, ret, rlen, clen, PQ);
+}
+
+extern "C" __global__ void bias_multiply_f(float *input, float *bias, float *ret,
+                                          int rlen, int clen, int PQ) {
+  bias_multiply(input, bias, ret, rlen, clen, PQ);
+}
 
 /**
  * Performs a binary cellwise arithmetic operation on 2 matrices.
- * Either both matrices are of equal size or one of them is a vector or both are.
+ * Either both matrices are of equal size, or one (or both) of them is a
+ * vector.
  * @param A                 first input matrix allocated on GPU
  * @param B                 second input matrix allocated on GPU
  * @param C                 output allocated on GPU
@@ -323,37 +492,55 @@ __global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, dou
  * @param maxClen           maximum of the column lengths of A and B
  * @param vectorAStatus     if A is a row vector, column vector or neither
  * @param vectorBStatus     if B is a row vector, column vector or neither
- * @param op                the numeric code of the arithmetic operation to perform
+ * @param op                the numeric code of the arithmetic operation to
+ * perform
  *
  */
-extern "C"
-__global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
-	int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	if(ix < maxRlen && iy < maxClen) {
-		int outIndex = ix * maxClen + iy;
-		int aIndex = outIndex;
-		int bIndex = outIndex;
-		if(vectorAStatus == 1)
-			aIndex = ix; // clen == 1
-		else if(vectorAStatus == 2)
-			aIndex = iy; // rlen == 1
-		if(vectorBStatus == 1)
-			bIndex = ix; // clen == 1
-		else if(vectorBStatus == 2)
-			bIndex = iy; // rlen == 1
-		C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
-		//printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
-	__syncthreads();
-	}
+template <typename T>
+__device__ void matrix_matrix_cellwise_op(T *A, T *B, T *C, int maxRlen,
+                                          int maxClen, int vectorAStatus,
+                                          int vectorBStatus, int op) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  if (ix < maxRlen && iy < maxClen) {
+    int outIndex = ix * maxClen + iy;
+    int aIndex = outIndex;
+    int bIndex = outIndex;
+    if (vectorAStatus == 1)
+      aIndex = ix;  // clen == 1
+    else if (vectorAStatus == 2)
+      aIndex = iy;  // rlen == 1
+    if (vectorBStatus == 1)
+      bIndex = ix;  // clen == 1
+    else if (vectorBStatus == 2)
+      bIndex = iy;  // rlen == 1
+    C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
+    // printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex,
+    // A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+    __syncthreads();
+  }
+}
+
+extern "C" __global__ void matrix_matrix_cellwise_op_d(
+    double *A, double *B, double *C, int maxRlen, int maxClen,
+    int vectorAStatus, int vectorBStatus, int op) {
+  matrix_matrix_cellwise_op(A, B, C, maxRlen, maxClen, vectorAStatus,
+                            vectorBStatus, op);
+}
+
+extern "C" __global__ void matrix_matrix_cellwise_op_f(
+    float *A, float *B, float *C, int maxRlen, int maxClen, int vectorAStatus,
+    int vectorBStatus, int op) {
+  matrix_matrix_cellwise_op(A, B, C, maxRlen, maxClen, vectorAStatus,
+                            vectorBStatus, op);
 }
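
The vectorAStatus/vectorBStatus codes follow the comments in the kernel body: 1 means the operand is a column vector (clen == 1), 2 means it is a row vector (rlen == 1), and any other value (0 here) means a full matrix. As an illustration only (the helper name and the opPlus parameter are not part of this file; the numeric code for '+' is whatever binaryOp, defined earlier in this file, maps it to), broadcasting a 1 x n row vector B over an m x n matrix A could be launched like this:

void addRowVector(double *d_A, double *d_B, double *d_C, int m, int n,
                  int opPlus /* numeric code binaryOp uses for '+' */) {
  const int threads = 256;
  const int blocks = (m * n + threads - 1) / threads;  // one thread per output cell
  matrix_matrix_cellwise_op_d<<<blocks, threads>>>(
      d_A, d_B, d_C, m, n, /*vectorAStatus=*/0, /*vectorBStatus=*/2, opPlus);
}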
 
 /**
  * Performs an arithmetic operation between a matrix and a scalar.
- * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is the operation)
+ * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is
+ * the operation)
  * @param A             input matrix allocated on GPU
  * @param scalar        scalar input
  * @param C             output matrix allocated on GPU
@@ -361,32 +548,53 @@ __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
  * @param op            number code of the arithmetic operation to perform
  * @param isLeftScalar  whether the scalar is on the left side
  */
-extern "C"
-__global__ void matrix_scalar_op(double* A, double scalar, double* C, int size, int op, int isLeftScalar) {
-	int index = blockIdx.x *blockDim.x + threadIdx.x;
-	if(index < size) {
-		if(isLeftScalar) {
-			C[index] = binaryOp(scalar, A[index], op);
-		} else {
-			C[index] = binaryOp(A[index], scalar, op);
-		}
-	}
-	__syncthreads();
+template <typename T>
+__device__ void matrix_scalar_op(T *A, T scalar, T *C, int size, int op,
+                                 int isLeftScalar) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    if (isLeftScalar) {
+      C[index] = binaryOp(scalar, A[index], op);
+    } else {
+      C[index] = binaryOp(A[index], scalar, op);
+    }
+  }
+  __syncthreads();
 }
 
+extern "C" __global__ void matrix_scalar_op_d(double *A, double scalar,
+                                             double *C, int size, int op,
+                                             int isLeftScalar) {
+  matrix_scalar_op(A, scalar, C, size, op, isLeftScalar);
+}
+
+extern "C" __global__ void matrix_scalar_op_f(float *A, double scalar, float *C,
+                                             int size, int op,
+                                             int isLeftScalar) {
+  matrix_scalar_op(A, (float)scalar, C, size, op, isLeftScalar);
+}
 
 /**
- * Sets all elements (fills) of a double array of given length with a given scalar value
+ * Sets all elements (fills) of an array of a given length with a given
+ * scalar value
  * @param A         array to be filled
  * @param scalar    value to fill array with
  * @param lenA      length of array A
  */
-extern "C"
-__global__ void fill(double* A, double scalar, int lenA) {
+template <typename T>
+__device__ void fill(T *A, T scalar, int lenA) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
-	if (index < lenA){
-	    A[index] = scalar;
-	}
+  if (index < lenA) {
+    A[index] = scalar;
+  }
+}
+
+extern "C" __global__ void fill_d(double *A, double scalar, int lenA) {
+  fill(A, scalar, lenA);
+}
+
+extern "C" __global__ void fill_f(float *A, double scalar, int lenA) {
+  fill(A, (float)scalar, lenA);
 }
 
 /**
@@ -402,29 +610,39 @@ __global__ void fill(double* A, double scalar, int lenA) {
  * @param rowsB  rows in B
  * @param colsB  columns in B
  */
-extern "C"
-__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
-	int maxClen = max(colsA, colsB);
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	int colsC = colsA + colsB;
-	int rowsC = rowsA;
-
-	// Copy an element of A into C into the appropriate location
-	if (ix < rowsA && iy < colsA) {
-		double elemA = A[ix * colsA + iy];
-		C[ix * colsC + iy] = elemA;
-	}
+template <typename T>
+__device__ void cbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
+                      int colsB) {
+  int maxClen = max(colsA, colsB);
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  int colsC = colsA + colsB;
+  int rowsC = rowsA;
+
+  // Copy an element of A into the appropriate location in C
+  if (ix < rowsA && iy < colsA) {
+    T elemA = A[ix * colsA + iy];
+    C[ix * colsC + iy] = elemA;
+  }
+
+  // Copy an element of B into the appropriate location in C
+  if (ix < rowsB && iy < colsB) {
+    T elemB = B[ix * colsB + iy];
+    C[ix * colsC + (iy + colsA)] = elemB;
+  }
+}
 
-	// Copy an element of B into C into the appropriate location
-	if (ix < rowsB && iy < colsB) {
-		double elemB = B[ix * colsB + iy];
-		C[ix * colsC + (iy + colsA)] = elemB;
-	}
+extern "C" __global__ void cbind_d(double *A, double *B, double *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  cbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
+extern "C" __global__ void cbind_f(float *A, float *B, float *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  cbind(A, B, C, rowsA, colsA, rowsB, colsB);
+}
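
One non-obvious detail when launching cbind_d/cbind_f is that the thread index is decomposed over max(colsA, colsB) columns per row, not over the colsA + colsB columns of the output. A hedged sketch (helper name illustrative; assumes rowsA == rowsB, as a valid column bind requires):

void launchCbind(double *d_A, double *d_B, double *d_C,
                 int rows, int colsA, int colsB) {
  const int threads = 256;
  const int maxClen = colsA > colsB ? colsA : colsB;
  // the grid must cover rows * max(colsA, colsB) threads for the ix/iy decomposition
  const int blocks = (rows * maxClen + threads - 1) / threads;
  cbind_d<<<blocks, threads>>>(d_A, d_B, d_C, rows, colsA, rows, colsB);
}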
 
 /**
  * Appends Matrix B to the bottom of Matrix A into a new matrix C
@@ -441,176 +659,263 @@ __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int
  * @param rowsB  rows in B
  * @param colsB  columns in B
  */
-extern "C"
-__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
-	int maxClen = max(colsA, colsB);
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	int rowsC = rowsA + rowsB;
-	int colsC = colsA;
-
-	// Copy an element of A into C into the appropriate location
-	if (ix < rowsA && iy < colsA) {
-		double elemA = A[ix * colsA + iy];
-		C[ix * colsC + iy] = elemA;
-	}
+template <typename T>
+__device__ void rbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
+                      int colsB) {
+  int maxClen = max(colsA, colsB);
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  int rowsC = rowsA + rowsB;
+  int colsC = colsA;
+
+  // Copy an element of A into the appropriate location in C
+  if (ix < rowsA && iy < colsA) {
+    T elemA = A[ix * colsA + iy];
+    C[ix * colsC + iy] = elemA;
+  }
+
+  // Copy an element of B into the appropriate location in C
+  if (ix < rowsB && iy < colsB) {
+    T elemB = B[ix * colsB + iy];
+    C[(ix + rowsA) * colsC + iy] = elemB;
+  }
+}
 
-	// Copy an element of B into C into the appropriate location
-	if (ix < rowsB && iy < colsB) {
-		double elemB = B[ix * colsB + iy];
-		C[(ix + rowsA) * colsC + iy] = elemB;
-	}
+extern "C" __global__ void rbind_d(double *A, double *B, double *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  rbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
+extern "C" __global__ void rbind_f(float *A, float *B, float *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  rbind(A, B, C, rowsA, colsA, rowsB, colsB);
+}
 
 /**
  * Does a reduce operation over all elements of the array.
- * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA
+ * Samples (v8.0)
  * and the Reduction example available through jcuda.org
- * When invoked initially, all blocks partly compute the reduction operation over the entire array
- * and writes it to the output/temporary array. A second invokation needs to happen to get the
+ * When invoked initially, all blocks partly compute the reduction operation
+ * over the entire array
+ * and write it to the output/temporary array. A second invocation needs to
+ * happen to get the
  * reduced value.
- * The number of threads, blocks and amount of shared memory is calculated in a specific way.
- * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this method to see
+ * The number of threads, blocks and amount of shared memory is calculated in a
+ * specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this
+ * method to see
+ * how it's done.
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
  *
- * @param ReductionOp       Type of the functor object that implements the reduction operation
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
  */
-template <typename ReductionOp>
+template <typename ReductionOp, typename T>
 __device__ void reduce(
-    double *g_idata,            ///< input data stored in device memory (of size n)
-    double *g_odata,            ///< output/temporary array stored in device memory (of size n)
-    unsigned int n,             ///< size of the input and temporary/output arrays
-    ReductionOp reduction_op,	///< Reduction operation to perform (functor object)
-	double initialValue)  		///< initial value for the reduction variable
+    T *g_idata,  ///< input data stored in device memory (of size n)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size n)
+    unsigned int n,  ///< size of the input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    T initialValue)    ///< initial value for the reduction variable
 {
-    extern __shared__ double sdata[];
-
-    // perform first level of reduction,
-    // reading from global memory, writing to shared memory
-    unsigned int tid = threadIdx.x;
-    unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
-    unsigned int gridSize = blockDim.x*2*gridDim.x;
-
-    double v = initialValue;
-
-    // we reduce multiple elements per thread.  The number is determined by the
-    // number of active thread blocks (via gridDim).  More blocks will result
-    // in a larger gridSize and therefore fewer elements per thread
-    while (i < n)
-    {
-        v = reduction_op(v, g_idata[i]);
-        // ensure we don't read out of bounds
-        if (i + blockDim.x < n)
-            v = reduction_op(v, g_idata[i+blockDim.x]);
-        i += gridSize;
+  // extern __shared__ T sdata[];
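+  // Note: an extern __shared__ array declared with the template type T would
+  // collide between the double and float instantiations of this function, so a
+  // raw byte buffer with the right alignment is declared and reinterpreted.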
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // perform first level of reduction,
+  // reading from global memory, writing to shared memory
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+  unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+  T v = initialValue;
+
+  // we reduce multiple elements per thread.  The number is determined by the
+  // number of active thread blocks (via gridDim).  More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread
+  while (i < n) {
+    v = reduction_op(v, g_idata[i]);
+    // ensure we don't read out of bounds
+    if (i + blockDim.x < n) v = reduction_op(v, g_idata[i + blockDim.x]);
+    i += gridSize;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
     }
-
-    // each thread puts its local sum into shared memory
-    sdata[tid] = v;
     __syncthreads();
-
-
-    // do reduction in shared mem
-		if (blockDim.x >= 1024){ if (tid < 512) { sdata[tid] = v = reduction_op(v, sdata[tid + 512]); } __syncthreads(); }
-    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = v = reduction_op(v, sdata[tid + 256]); } __syncthreads(); }
-    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = v = reduction_op(v, sdata[tid + 128]); } __syncthreads(); }
-    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = v = reduction_op(v, sdata[tid +  64]); } __syncthreads(); }
-
-    if (tid < 32)
-    {
-        // now that we are using warp-synchronous programming (below)
-        // we need to declare our shared memory volatile so that the compiler
-        // doesn't reorder stores to it and induce incorrect behavior.
-        volatile double* smem = sdata;
-        if (blockDim.x >=  64) { smem[tid] = v = reduction_op(v, smem[tid + 32]); }
-        if (blockDim.x >=  32) { smem[tid] = v = reduction_op(v, smem[tid + 16]); }
-        if (blockDim.x >=  16) { smem[tid] = v = reduction_op(v, smem[tid +  8]); }
-        if (blockDim.x >=   8) { smem[tid] = v = reduction_op(v, smem[tid +  4]); }
-        if (blockDim.x >=   4) { smem[tid] = v = reduction_op(v, smem[tid +  2]); }
-        if (blockDim.x >=   2) { smem[tid] = v = reduction_op(v, smem[tid +  1]); }
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
     }
+    __syncthreads();
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = reduction_op(v, smem[tid + 32]);
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = reduction_op(v, smem[tid + 16]);
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = reduction_op(v, smem[tid + 8]);
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = reduction_op(v, smem[tid + 4]);
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = reduction_op(v, smem[tid + 2]);
+    }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = reduction_op(v, smem[tid + 1]);
+    }
+  }
 
-    // write result for this block to global mem
-    if (tid == 0)
-        g_odata[blockIdx.x] = sdata[0];
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
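
The comment above defers the launch configuration to the NVIDIA sample and the SystemML host code. A minimal two-pass sketch using the reduce_sum_d wrapper defined further below (sizes illustrative; assumes this file is compiled directly with nvcc, whereas SystemML drives the PTX through JCuda):

#include <cstdio>

int main() {
  const unsigned int n = 1 << 20;
  const int threads = 256;
  // each block consumes 2 * threads elements per grid-stride step (see above)
  const int blocks = (n + threads * 2 - 1) / (threads * 2);
  const size_t shmem = threads * sizeof(double);

  double *d_in, *d_part, *d_out;
  cudaMalloc(&d_in, n * sizeof(double));
  cudaMalloc(&d_part, blocks * sizeof(double));
  cudaMalloc(&d_out, sizeof(double));
  fill_d<<<(n + threads - 1) / threads, threads>>>(d_in, 1.0, n);  // all ones

  // pass 1: one partial sum per block
  reduce_sum_d<<<blocks, threads, shmem>>>(d_in, d_part, n);
  // pass 2: a single block folds the per-block partials into the final value
  reduce_sum_d<<<1, threads, shmem>>>(d_part, d_out, blocks);

  double sum;
  cudaMemcpy(&sum, d_out, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %.1f (expected %u)\n", sum, n);
  cudaFree(d_in); cudaFree(d_part); cudaFree(d_out);
  return 0;
}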
 
-
-
 /**
  * Does a reduce (sum) over each row of the array.
  * This kernel must be launched with as many blocks as there are rows.
- * The intuition for this kernel is that each block does a reduction over a single row.
- * The maximum number of blocks that can launched (as of compute capability 3.0) is 2^31 - 1
- * This works out fine for SystemML, since the maximum elements in a Java array can be 2^31 - c (some small constant)
- * If the matrix is "fat" and "short", i.e. there are small number of rows and a large number of columns,
+ * The intuition for this kernel is that each block does a reduction over a
+ * single row.
+ * The maximum number of blocks that can be launched (as of compute capability
+ * 3.0) is 2^31 - 1.
+ * This works out fine for SystemML, since the maximum number of elements in a
+ * Java array is 2^31 - c (for some small constant c).
+ * If the matrix is "fat" and "short", i.e. there is a small number of rows and
+ * a large number of columns,
  * there could be under-utilization of the hardware.
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
- * @param ReductionOp       Type of the functor object that implements the reduction operation
- * @param AssignmentOp      Type of the functor object that is used to modify the value before writing it to its final location in global memory for each row
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * row
  */
-template <typename ReductionOp,
-          typename AssignmentOp>
+template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_row(
-    double *g_idata,            ///< input data stored in device memory (of size rows*cols)
-    double *g_odata,            ///< output/temporary array store in device memory (of size rows*cols)
-    unsigned int rows,          ///< rows in input and temporary/output arrays
-    unsigned int cols,          ///< columns in input and temporary/output arrays
-    ReductionOp reduction_op,		///< Reduction operation to perform (functor object)
-    AssignmentOp assignment_op, ///< Operation to perform before assigning this to its final location in global memory for each row
-    double initialValue){  			///< initial value for the reduction variable
-    extern __shared__ double sdata[];
-
-    // one block per row
-    if (blockIdx.x >= rows) {
-        return;
+    T *g_idata,  ///< input data stored in device memory (of size rows*cols)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size
+                 ///rows*cols)
+    unsigned int rows,  ///< rows in input and temporary/output arrays
+    unsigned int cols,  ///< columns in input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    AssignmentOp assignment_op,  ///< Operation to perform before assigning this
+                                 ///to its final location in global memory for
+                                 ///each row
+    T initialValue) {            ///< initial value for the reduction variable
+  // extern __shared__ T sdata[];
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // one block per row
+  if (blockIdx.x >= rows) {
+    return;
+  }
+
+  unsigned int block = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  unsigned int i = tid;
+  unsigned int block_offset = block * cols;
+
+  T v = initialValue;
+  while (i < cols) {
+    v = reduction_op(v, g_idata[block_offset + i]);
+    i += blockDim.x;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
     }
-
-    unsigned int block = blockIdx.x;
-    unsigned int tid = threadIdx.x;
-    unsigned int i = tid;
-    unsigned int block_offset = block * cols;
-
-    double v = initialValue;
-    while (i < cols){
-        v = reduction_op(v, g_idata[block_offset + i]);
-        i += blockDim.x;
+    __syncthreads();
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
     }
-
-    // each thread puts its local sum into shared memory
-    sdata[tid] = v;
     __syncthreads();
-
- 		// do reduction in shared mem
-  	if (blockDim.x >= 1024){ if (tid < 512) { sdata[tid] = v = reduction_op(v, sdata[tid + 512]); } __syncthreads(); }
-    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = v = reduction_op(v, sdata[tid + 256]); } __syncthreads(); }
-    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = v = reduction_op(v, sdata[tid + 128]); } __syncthreads(); }
-    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = v = reduction_op(v, sdata[tid +  64]); } __syncthreads(); }
-
-    if (tid < 32)
-    {
-        // now that we are using warp-synchronous programming (below)
-        // we need to declare our shared memory volatile so that the compiler
-        // doesn't reorder stores to it and induce incorrect behavior.
-        volatile double* smem = sdata;
-        if (blockDim.x >=  64) { smem[tid] = v = reduction_op(v, smem[tid + 32]); }
-        if (blockDim.x >=  32) { smem[tid] = v = reduction_op(v, smem[tid + 16]); }
-        if (blockDim.x >=  16) { smem[tid] = v = reduction_op(v, smem[tid +  8]); }
-        if (blockDim.x >=   8) { smem[tid] = v = reduction_op(v, smem[tid +  4]); }
-        if (blockDim.x >=   4) { smem[tid] = v = reduction_op(v, smem[tid +  2]); }
-        if (blockDim.x >=   2) { smem[tid] = v = reduction_op(v, smem[tid +  1]); }
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = reduction_op(v, smem[tid + 32]);
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = reduction_op(v, smem[tid + 16]);
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = reduction_op(v, smem[tid + 8]);
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = reduction_op(v, smem[tid + 4]);
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = reduction_op(v, smem[tid + 2]);
     }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = reduction_op(v, smem[tid + 1]);
+    }
+  }
 
-    // write result for this block to global mem, modify it with assignment op
-    if (tid == 0)
-        g_odata[block] = assignment_op(sdata[0]);
+  // write result for this block to global mem, modify it with assignment op
+  if (tid == 0) g_odata[block] = assignment_op(sdata[0]);
 }
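
Following the comment above ("as many blocks as there are rows"), a hedged launch sketch using the reduce_row_sum_d wrapper defined later in this file (the helper name and block size are illustrative, not SystemML's actual host code):

void launchRowSums(double *d_in, double *d_rowSums,
                   unsigned int rows, unsigned int cols) {
  const int threads = 256;  // the shared-memory reduction assumes a power-of-two block size
  const size_t shmem = threads * sizeof(double);
  // one block per row; each block's threads stride across that row's columns,
  // so d_rowSums needs `rows` entries and no second pass is required
  reduce_row_sum_d<<<rows, threads, shmem>>>(d_in, d_rowSums, rows, cols);
}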
 
-
 /**
  * Does a column wise reduction.
  * The intuition is that there are as many global threads as there are columns
@@ -618,57 +923,59 @@ __device__ void reduce_row(
 * This of course leads to an under-utilization of the GPU resources.
 * For cases where the number of columns is small, there can be unused SMs
  *
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
- * @param ReductionOp       Type of the functor object that implements the reduction operation
- * @param AssignmentOp      Type of the functor object that is used to modify the value before writing it to its final location in global memory for each column
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * column
  */
-template <typename ReductionOp,
-          typename AssignmentOp>
+template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_col(
-    double *g_idata,            ///< input data stored in device memory (of size rows*cols)
-    double *g_odata,            ///< output/temporary array store in device memory (of size rows*cols)
-    unsigned int rows,          ///< rows in input and temporary/output arrays
-    unsigned int cols,          ///< columns in input and temporary/output arrays
-    ReductionOp reduction_op,	///< Reduction operation to perform (functor object)
-    AssignmentOp assignment_op, ///< Operation to perform before assigning this to its final location in global memory for each column
-    double initialValue)  		///< initial value for the reduction variable
+    T *g_idata,  ///< input data stored in device memory (of size rows*cols)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size
+                 ///rows*cols)
+    unsigned int rows,  ///< rows in input and temporary/output arrays
+    unsigned int cols,  ///< columns in input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    AssignmentOp assignment_op,  ///< Operation to perform before assigning this
+                                 ///to its final location in global memory for
+                                 ///each column
+    T initialValue)              ///< initial value for the reduction variable
 {
-    unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (global_tid >= cols) {
-        return;
-    }
-
-    unsigned int i = global_tid;
-    unsigned int grid_size = cols;
-    double val = initialValue;
-
-    while (i < rows * cols) {
-      val = reduction_op(val, g_idata[i]);
-      i += grid_size;
-    }
-    g_odata[global_tid] = assignment_op(val);
+  unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (global_tid >= cols) {
+    return;
+  }
+
+  unsigned int i = global_tid;
+  unsigned int grid_size = cols;
+  T val = initialValue;
+
+  while (i < rows * cols) {
+    val = reduction_op(val, g_idata[i]);
+    i += grid_size;
+  }
+  g_odata[global_tid] = assignment_op(val);
 }
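
For completeness, the column-wise counterpart: one global thread per column, so only the grid size depends on cols and no dynamic shared memory is needed. Again a sketch with an illustrative helper name, using the reduce_col_sum_d wrapper defined below:

void launchColSums(double *d_in, double *d_colSums,
                   unsigned int rows, unsigned int cols) {
  const int threads = 256;
  const int blocks = (cols + threads - 1) / threads;  // threads beyond `cols` return immediately
  reduce_col_sum_d<<<blocks, threads>>>(d_in, d_colSums, rows, cols);
}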
 
 /**
  * Functor op for assignment op. This is a dummy/identity op.
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a) const {
-        return a;
-    }
-} IdentityOp;
+template <typename T>
+struct IdentityOp {
+  __device__ __forceinline__ T operator()(T a) const { return a; }
+};
 
 /**
  * Functor op for summation operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return a + b;
-    }
-} SumOp;
-
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
+};
 
 /**
  * Do a summation over all elements of an array/matrix
@@ -676,10 +983,20 @@ typedef struct {
  * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_sum(double *g_idata, double *g_odata, unsigned int n){
-	SumOp op;
-  reduce<SumOp>(g_idata, g_odata, n, op, 0.0);
+template <typename T>
+__device__ void reduce_sum(T *g_idata, T *g_odata, unsigned int n) {
+  SumOp<T> op;
+  reduce<SumOp<T>, T>(g_idata, g_odata, n, op, (T)0.0);
+}
+
+extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_sum(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_sum(g_idata, g_odata, n);
 }
 
 /**
@@ -689,11 +1006,25 @@ __global__ void reduce_sum(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_sum(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    IdentityOp aop;
-    reduce_row<SumOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_row_sum(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  SumOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<SumOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         0.0);
+}
+
+extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_sum(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -703,23 +1034,39 @@ __global__ void reduce_row_sum(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_sum(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    IdentityOp aop;
-    reduce_col<SumOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_col_sum(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  SumOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<SumOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         (T)0.0);
+}
+
+extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_sum(g_idata, g_odata, rows, cols);
 }
 
+extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_sum(g_idata, g_odata, rows, cols);
+}
 
 /**
  * Functor op for max operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return fmax(a, b);
-    }
-} MaxOp;
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return fmax(a, b); }
+};
+
+template <>
+struct MaxOp<float> {
+  __device__ __forceinline__ float operator()(float a, float b) const {
+    return fmaxf(a, b);
+  }
+};
 
 
 /**
@@ -728,10 +1075,20 @@ typedef struct {
 * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
-    MaxOp op;
-    reduce<MaxOp>(g_idata, g_odata, n, op, -DBL_MAX);
+template <typename T>
+__device__ void reduce_max(T *g_idata, T *g_odata, unsigned int n) {
+  MaxOp<T> op;
+  reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_max(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_max(g_idata, g_odata, n);
 }
 
 /**
@@ -741,11 +1098,25 @@ __global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MaxOp op;
-    IdentityOp aop;
-    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
+template <typename T>
+__device__ void reduce_row_max(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MaxOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         -T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_max(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -755,22 +1126,34 @@ __global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MaxOp op;
-    IdentityOp aop;
-    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
+template <typename T>
+__device__ void reduce_col_max(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MaxOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         (T)-T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_max(g_idata, g_odata, rows, cols);
 }
 
 /**
  * Functor op for min operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return fmin(a, b);
-    }
-} MinOp;
+template <typename T>
+struct MinOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return fmin(a, b); }
+};
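
MaxOp above carries an explicit float specialization that calls fmaxf. A symmetric specialization for MinOp is not part of this commit; if one were wanted, it would follow the same pattern:

// Illustrative only, not in this commit: single-precision MinOp in the style
// of the MaxOp<float> specialization above.
template <>
struct MinOp<float> {
  __device__ __forceinline__ float operator()(float a, float b) const {
    return fminf(a, b);
  }
};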
 
 /**
  * Do a min over all elements of an array/matrix
@@ -778,10 +1161,20 @@ typedef struct {
 * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_min(double *g_idata, double *g_odata, unsigned int n){
-	MinOp op;
-    reduce<MinOp>(g_idata, g_odata, n, op, DBL_MAX);
+template <typename T>
+__device__ void reduce_min(T *g_idata, T *g_odata, unsigned int n) {
+  MinOp<T> op;
+  reduce<MinOp<T>, T>(g_idata, g_odata, n, op, T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_min(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_min(g_idata, g_odata, n);
 }
 
 /**
@@ -791,11 +1184,25 @@ __global__ void reduce_min(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_min(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MinOp op;
-    IdentityOp aop;
-    reduce_row<MinOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MAX);
+template <typename T>
+__device__ void reduce_row_min(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MinOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_min(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -805,22 +1212,34 @@ __global__ void reduce_row_min(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_min(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MinOp op;
-    IdentityOp aop;
-    reduce_col<MinOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MAX);
+template <typename T>
+__device__ void reduce_col_min(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MinOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_min(g_idata, g_odata, rows, cols);
 }
 
 /**
  * Functor op for product operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return a * b;
-    }
-} ProductOp;
+template <typename T>
+struct ProductOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return a * b; }
+};
 
 /**
  * Do a product over all elements of an array/matrix
@@ -828,26 +1247,35 @@ typedef struct {
 * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_prod(double *g_idata, double *g_odata, unsigned int n){
-	ProductOp op;
-    reduce<ProductOp>(g_idata, g_odata, n, op, 1.0);
+template <typename T>
+__device__ void reduce_prod(T *g_idata, T *g_odata, unsigned int n) {
+  ProductOp<T> op;
+  reduce<ProductOp<T>, T>(g_idata, g_odata, n, op, (T)1.0);
+}
+
+extern "C" __global__ void reduce_prod_d(double *g_idata, double *g_odata,
+                                        unsigned int n) {
+  reduce_prod(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_prod_f(float *g_idata, float *g_odata,
+                                        unsigned int n) {
+  reduce_prod(g_idata, g_odata, n);
 }
 
 /**
  * Functor op for mean operation
  */
+template <typename T>
 struct MeanOp {
-    const long _size;   ///< Number of elements by which to divide to calculate mean
-		__device__ __forceinline__
-    MeanOp(long size): _size(size) {}
-    __device__ __forceinline__
-    double operator()(double total) const {
-        return total / _size;
-    }
+  const long
+      _size;  ///< Number of elements by which to divide to calculate mean
+  __device__ __forceinline__ MeanOp(long size) : _size(size) {}
+  __device__ __forceinline__ T operator()(T total) const {
+    return total / _size;
+  }
 };
 
-
 /**
  * Do a mean over all rows of a matrix
  * @param g_idata   input matrix stored in device memory (of size rows * cols)
@@ -855,11 +1283,25 @@ struct MeanOp {
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    MeanOp aop(cols);
-    reduce_row<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_row_mean(T *g_idata, T *g_odata, unsigned int rows,
+                                unsigned int cols) {
+  SumOp<T> op;
+  MeanOp<T> aop(cols);
+  reduce_row<SumOp<T>, MeanOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                     (T)0.0);
+}
+
+extern "C" __global__ void reduce_row_mean_d(double *g_idata, double *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_row_mean(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_mean_f(float *g_idata, float *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_row_mean(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -869,13 +1311,26 @@ __global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int r
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    MeanOp aop(rows);
-    reduce_col<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_col_mean(T *g_idata, T *g_odata, unsigned int rows,
+                                unsigned int cols) {
+  SumOp<T> op;
+  MeanOp<T> aop(rows);
+  reduce_col<SumOp<T>, MeanOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                     0.0);
 }
 
+extern "C" __global__ void reduce_col_mean_d(double *g_idata, double *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_col_mean(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_mean_f(float *g_idata, float *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_col_mean(g_idata, g_odata, rows, cols);
+}
 
 /**
  * Do an exp over all the elements of a matrix
@@ -883,12 +1338,21 @@ __global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int r
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_exp(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = exp(A[index]);
-    }
+template <typename T>
+__device__ void matrix_exp(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = exp(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_exp_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_exp(A, C, size);
+}
+
+extern "C" __global__ void matrix_exp_f(float *A, float *C, unsigned int size) {
+  matrix_exp(A, C, size);
 }
 
 /**
@@ -897,12 +1361,21 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sqrt(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sqrt(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sqrt(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sqrt_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sqrt(A, C, size);
+}
+
+extern "C" __global__ void matrix_sqrt_f(float *A, float *C, unsigned int size) {
+  matrix_sqrt(A, C, size);
 }
 
 /**
@@ -911,12 +1384,22 @@ __global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_round(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = (double)llround(A[index]);
-    }
+template <typename T>
+__device__ void matrix_round(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = (T)llround(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_round_d(double *A, double *C,
+                                         unsigned int size) {
+  matrix_round(A, C, size);
+}
+
+extern "C" __global__ void matrix_round_f(float *A, float *C,
+                                         unsigned int size) {
+  matrix_round(A, C, size);
 }
 
 /**
@@ -925,12 +1408,21 @@ __global__ void matrix_round(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_abs(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = (double)fabs(A[index]);
-    }
+template <typename T>
+__device__ void matrix_abs(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = (T)fabs(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_abs_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_abs(A, C, size);
+}
+
+extern "C" __global__ void matrix_abs_f(float *A, float *C, unsigned int size) {
+  matrix_abs(A, C, size);
 }
 
 /**
@@ -939,12 +1431,21 @@ __global__ void matrix_abs(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_log(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = log(A[index]);
-    }
+template <typename T>
+__device__ void matrix_log(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = log(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_log_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_log(A, C, size);
+}
+
+extern "C" __global__ void matrix_log_f(float *A, float *C, unsigned int size) {
+  matrix_log(A, C, size);
 }
 
 /**
@@ -953,12 +1454,22 @@ __global__ void matrix_log(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_floor(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = floor(A[index]);
-    }
+template <typename T>
+__device__ void matrix_floor(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = floor(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_floor_d(double *A, double *C,
+                                         unsigned int size) {
+  matrix_floor(A, C, size);
+}
+
+extern "C" __global__ void matrix_floor_f(float *A, float *C,
+                                         unsigned int size) {
+  matrix_floor(A, C, size);
 }
 
 /**
@@ -967,12 +1478,21 @@ __global__ void matrix_floor(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param siz the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_ceil(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = ceil(A[index]);
-    }
+template <typename T>
+__device__ void matrix_ceil(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = ceil(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_ceil_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_ceil(A, C, size);
+}
+
+extern "C" __global__ void matrix_ceil_f(float *A, float *C, unsigned int size) {
+  matrix_ceil(A, C, size);
 }
 
 /**
@@ -981,12 +1501,21 @@ __global__ void matrix_ceil(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sin(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sin(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sin(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sin(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sin_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_sin(A, C, size);
+}
+
+extern "C" __global__ void matrix_sin_f(float *A, float *C, unsigned int size) {
+  matrix_sin(A, C, size);
 }
 
 /**
@@ -995,12 +1524,21 @@ __global__ void matrix_sin(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sinh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sinh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sinh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sinh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sinh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sinh(A, C, size);
+}
+
+extern "C" __global__ void matrix_sinh_f(float *A, float *C, unsigned int size) {
+  matrix_sinh(A, C, size);
 }
 
 /**
@@ -1009,12 +1547,21 @@ __global__ void matrix_sinh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_cos(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = cos(A[index]);
-    }
+template <typename T>
+__device__ void matrix_cos(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = cos(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_cos_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_cos(A, C, size);
+}
+
+extern "C" __global__ void matrix_cos_f(float *A, float *C, unsigned int size) {
+  matrix_cos(A, C, size);
 }
 
 /**
@@ -1023,12 +1570,21 @@ __global__ void matrix_cos(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_cosh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = cosh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_cosh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = cosh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_cosh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_cosh(A, C, size);
+}
+
+extern "C" __global__ void matrix_cosh_f(float *A, float *C, unsigned int size) {
+  matrix_cosh(A, C, size);
 }
 
 /**
@@ -1037,12 +1593,21 @@ __global__ void matrix_cosh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_tan(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = tan(A[index]);
-    }
+template <typename T>
+__device__ void matrix_tan(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = tan(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_tan_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_tan(A, C, size);
+}
+
+extern "C" __global__ void matrix_tan_f(float *A, float *C, unsigned int size) {
+  matrix_tan(A, C, size);
 }
 
 /**
@@ -1051,12 +1616,21 @@ __global__ void matrix_tan(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
 * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_tanh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = tanh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_tanh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = tanh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_tanh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_tanh(A, C, size);
+}
+
+extern "C" __global__ void matrix_tanh_f(float *A, float *C, unsigned int size) {
+  matrix_tanh(A, C, size);
 }
 
 /**
@@ -1065,12 +1639,21 @@ __global__ void matrix_tanh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_asin(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = asin(A[index]);
-    }
+template <typename T>
+__device__ void matrix_asin(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = asin(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_asin_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_asin(A, C, size);
+}
+
+extern "C" __global__ void matrix_asin_f(float *A, float *C, unsigned int size) {
+  matrix_asin(A, C, size);
 }
 
 /**
@@ -1079,12 +1662,21 @@ __global__ void matrix_asin(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_acos(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = acos(A[index]);
-    }
+template <typename T>
+__device__ void matrix_acos(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = acos(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_acos_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_acos(A, C, size);
+}
+
+extern "C" __global__ void matrix_acos_f(float *A, float *C, unsigned int size) {
+  matrix_acos(A, C, size);
 }
 
 /**
@@ -1093,12 +1685,21 @@ __global__ void matrix_acos(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_atan(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = atan(A[index]);
-    }
+template <typename T>
+__device__ void matrix_atan(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = atan(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_atan_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_atan(A, C, size);
+}
+
+extern "C" __global__ void matrix_atan_f(float *A, float *C, unsigned int size) {
+  matrix_atan(A, C, size);
 }
 
 /**
@@ -1108,14 +1709,23 @@ __global__ void matrix_atan(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sign(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        if (A[index] == 0.0) {
-            C[index] = 0.0;
-        } else {
-            C[index] = copysign(1.0, A[index]);
-        }
+template <typename T>
+__device__ void matrix_sign(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    if (A[index] == 0.0) {
+      C[index] = 0.0;
+    } else {
+      C[index] = copysign(1.0, A[index]);
     }
+  }
+}
+
+extern "C" __global__ void matrix_sign_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sign(A, C, size);
 }
+
+extern "C" __global__ void matrix_sign_f(float *A, float *C, unsigned int size) {
+  matrix_sign(A, C, size);
+}
\ No newline at end of file
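
For context, below is a minimal host-side sketch (editorial, not part of this patch) showing how one of the new single-precision entry points could be exercised directly with the CUDA runtime API. In SystemML itself these kernels are compiled to PTX and launched by name through JCuda, so the standalone harness, the array size N, and threadsPerBlock are illustrative assumptions only; the sketch must be compiled and linked together with the kernels file that defines matrix_cos_f.

// Editorial sketch (not part of the commit): launch the single-precision
// matrix_cos_f entry point with the plain CUDA runtime API.
// Compile/link together with the SystemML kernels .cu file that defines it.
#include <cstdio>
#include <cuda_runtime.h>

extern "C" __global__ void matrix_cos_f(float *A, float *C, unsigned int size);

int main() {
  const unsigned int N = 1024;
  float hA[N], hC[N];
  for (unsigned int i = 0; i < N; i++) {
    hA[i] = 0.001f * i;
  }

  float *dA, *dC;
  cudaMalloc(&dA, N * sizeof(float));
  cudaMalloc(&dC, N * sizeof(float));
  cudaMemcpy(dA, hA, N * sizeof(float), cudaMemcpyHostToDevice);

  // One thread per element; round the grid size up so every index is covered.
  // The "if (index < size)" guard inside the kernel handles the overhang.
  const unsigned int threadsPerBlock = 256;
  const unsigned int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
  matrix_cos_f<<<blocks, threadsPerBlock>>>(dA, dC, N);
  cudaDeviceSynchronize();

  cudaMemcpy(hC, dC, N * sizeof(float), cudaMemcpyDeviceToHost);
  printf("cos(%f) = %f\n", hA[100], hC[100]);

  cudaFree(dA);
  cudaFree(dC);
  return 0;
}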