You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/05/17 17:56:37 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs, log, floor, ceil, trig funcs & sign for GPU

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 0d553e384 -> 1fc764b9b


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index f4c523b..48b7da6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,41 +61,55 @@ public abstract class GPUInstruction extends Instruction
 	public final static String MISC_TIMER_SYRK_LIB = 												"Msyrk"; 	// time spent in symmetric rank-k update
 
 	// Other BLAS instructions
-	public final static String MISC_TIMER_DAXPY_LIB = "daxpy";	// time spent in daxpy
-	public final static String MISC_TIMER_QR_BUFFER = "qr_buffer"; 	// time spent in calculating buffer needed to perform QR
-	public final static String MISC_TIMER_QR = "qr"; 	// time spent in doing QR
-	public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr
-	public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm
+	public final static String MISC_TIMER_DAXPY_LIB =   "daxpy";    // time spent in daxpy
+	public final static String MISC_TIMER_QR_BUFFER =   "qr_buffer";// time spent in calculating buffer needed to perform QR
+	public final static String MISC_TIMER_QR =          "qr";       // time spent in doing QR
+	public final static String MISC_TIMER_ORMQR =       "ormqr";    // time spent in ormqr
+	public final static String MISC_TIMER_TRSM =        "trsm";     // time spent in cublas Dtrsm
 
 	// Transpose
-	public final static String MISC_TIMER_SPARSE_DGEAM_LIB = 	"sdgeaml"; 	// time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
-	public final static String MISC_TIMER_DENSE_DGEAM_LIB = 	"ddgeaml"; 	// time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
-	public final static String MISC_TIMER_TRANSPOSE_LIB = 		"dtl";			// time spent on dense transpose, this includes allocation of output
+	public final static String MISC_TIMER_SPARSE_DGEAM_LIB =    "sdgeaml";  // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
+	public final static String MISC_TIMER_DENSE_DGEAM_LIB =     "ddgeaml";  // time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
+	public final static String MISC_TIMER_TRANSPOSE_LIB =       "dtl";      // time spent on dense transpose, this includes allocation of output
 
 	// Custom kernels
-	public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = 	"mmck";	// time spent in matrix-matrix cellwise operations
-	public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = 						"cask";	// time spent in compareAndSet kernel
-	public final static String MISC_TIMER_EXP_KERNEL = 												"expk";	// time spent in the exp kernel
-	public final static String MISC_TIMER_DAXPY_MV_KERNEL = 									"daxpymv";	// time spent in the daxpy_matrix_vector kernel
-	public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = 		"u2lk"; // time spent in the copy_u2l_dense kernel
-	public final static String MISC_TIMER_FILL_KERNEL	=												"fillk"; // time spent in the "fill" kernel
-	public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = 					"msk";	// time spent in the matrix scalar kernel
-	public final static String MISC_TIMER_REDUCE_ALL_KERNEL = 								"rallk"; // time spent in reduce all kernel
-	public final static String MISC_TIMER_REDUCE_ROW_KERNEL = 								"rrowk"; // time spent in reduce row kernel
-	public final static String MISC_TIMER_REDUCE_COL_KERNEL = 								"rcolk";	// time spent in reduce column kernel
+	public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck";   // time spent in matrix-matrix cellwise operations
+	public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL =           "cask";   // time spent in compareAndSet kernel
+	public final static String MISC_TIMER_EXP_KERNEL =                       "expk";   // time spent in the exp kernel
+	public final static String MISC_TIMER_SQRT_KERNEL =                      "sqrtk";   // time spent in the sqrt kernel
+	public final static String MISC_TIMER_ROUND_KERNEL =                     "roundk";   // time spent in the round kernel
+	public final static String MISC_TIMER_ABS_KERNEL =                       "absk";   // time spent in the abs kernel
+	public final static String MISC_TIMER_LOG_KERNEL =                       "logk";   // time spent in the log kernel
+	public final static String MISC_TIMER_FLOOR_KERNEL =                     "floork";  // time spent in the floor kernel
+	public final static String MISC_TIMER_CEIL_KERNEL =                      "ceilk";   // time spent in the ceil kernel
+	public final static String MISC_TIMER_SIN_KERNEL =                       "sink";   // time spent in the sin kernel
+	public final static String MISC_TIMER_COS_KERNEL =                       "cosk";   // time spent in the cos kernel
+	public final static String MISC_TIMER_TAN_KERNEL =                       "tank";   // time spent in the tan kernel
+	public final static String MISC_TIMER_ASIN_KERNEL =                      "asink";   // time spent in the asin kernel
+	public final static String MISC_TIMER_ACOS_KERNEL =                      "acosk";   // time spent in the acos kernel
+	public final static String MISC_TIMER_ATAN_KERNEL =                      "atank";   // time spent in the atan kernel
+	public final static String MISC_TIMER_SIGN_KERNEL =                      "signk";   // time spent in the sign kernel
+
+	public final static String MISC_TIMER_DAXPY_MV_KERNEL =                  "daxpymv";// time spent in the daxpy_matrix_vector kernel
+	public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =   "u2lk";   // time spent in the copy_u2l_dense kernel
+	public final static String MISC_TIMER_FILL_KERNEL =                      "fillk";  // time spent in the "fill" kernel
+	public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL =          "msk";    // time spent in the matrix scalar kernel
+	public final static String MISC_TIMER_REDUCE_ALL_KERNEL =                "rallk";  // time spent in reduce all kernel
+	public final static String MISC_TIMER_REDUCE_ROW_KERNEL =                "rrowk";  // time spent in reduce row kernel
+	public final static String MISC_TIMER_REDUCE_COL_KERNEL =                "rcolk";  // time spent in reduce column kernel
 
 	// Deep learning operators
-	public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = 					"nnaf"; // time spent in cudnnActivationForward
-	public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =					"nncf"; // time spent in cudnnConvolutionForward
-	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = "nncbf"; // time spent in cudnnConvolutionBackwardFilter
-	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = 	"nncbd"; // time spent in cudnnConvolutionBackwardData
-	public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =					"nnmf"; // time spent in cudnnPoolingForward
-	public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = 				"nnmb"; // time spent in cudnnPoolingBackward
-	public final static String MISC_TIMER_BIAS_ADD_LIB = 										"nnba"; // time spent in bias_add cuda kernel
-	public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= 						"nnrbk"; // time spent in relu_backward cuda kernel
-	public final static String MISC_TIMER_RELU_KERNEL = 										"nnrk";	// time spent in the relu kernel
-	public final static String MISC_TIMER_CUDNN_INIT = 											"nni";	// time spent in initializations for cudnn call
-	public final static String MISC_TIMER_CUDNN_CLEANUP = 									"nnc";	// time spent in cleanup for cudnn call
+	public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =         "nnaf";  // time spent in cudnnActivationForward
+	public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =        "nncf";  // time spent in cudnnConvolutionForward
+	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter
+	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =  "nncbd"; // time spent in cudnnConvolutionBackwardData
+	public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =         "nnmf";  // time spent in cudnnPoolingForward
+	public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =        "nnmb";  // time spent in cudnnPoolingBackward
+	public final static String MISC_TIMER_BIAS_ADD_LIB =                   "nnba";  // time spent in bias_add cuda kernel
+	public final static String MISC_TIMER_RELU_BACKWARD_KERNEL=            "nnrbk"; // time spent in relu_backward cuda kernel
+	public final static String MISC_TIMER_RELU_KERNEL =                    "nnrk";  // time spent in the relu kernel
+	public final static String MISC_TIMER_CUDNN_INIT =                     "nni";   // time spent in initializations for cudnn call
+	public final static String MISC_TIMER_CUDNN_CLEANUP =                  "nnc";   // time spent in cleanup for cudnn call
 
 
 	protected GPUINSTRUCTION_TYPE _gputype;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index ce25dec..7b50285 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -42,14 +42,39 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 		MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName());
 		ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns());
 
-		if(opcode.equals("sel+")) {
-			LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
-
-		} else if (opcode.equals("exp")) {
-			LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
-		}
-		else {
-			throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
+		switch(opcode) {
+			case "sel+":
+				LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "exp":
+				LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "sqrt":
+				LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "log":
+				LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "round":
+				LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "floor":
+				LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "ceil":
+				LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "abs":
+				LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "sin":
+				LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "cos":
+				LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "tan":
+				LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "asin":
+				LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "acos":
+				LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "atan":
+				LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			case "sign":
+				LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+			default:
+				throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
 		}
 		ec.releaseMatrixInputForGPUInstruction(_input.getName());
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 05257e5..0ff9d14 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -467,7 +467,7 @@ public class CSRPointer {
       cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
       //cudaDeviceSynchronize;
     } else {
-      LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null");
+      LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
     }
     return A;
   }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index d735e38..be3cc09 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -510,7 +510,9 @@ public class GPUObject {
 		setDenseMatrixCudaPointer(allocate(size));
 		addReadLock();
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
-		getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
+		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
+		if (v != 0)
+			getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a99571a..074119b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2885,29 +2885,239 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
 	 */
 	public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
+		// e^0 = 1, create a dense block full of 1s
+		unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL);
+	}
+
+	/**
+	 * Performs a "sqrt" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
+		// sqrt(0) = 0, create a dense block full of 0s
+		unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL);
+	}
+
+	/**
+	 * Performs a "round" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
+		// round(0) = 0, create a dense block full of 0s
+		unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL);
+	}
+
+	/**
+	 * Performs an "abs" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
+		// abs(0) = 0, create a dense block full of 0s
+		unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL);
+	}
+
+	/**
+	 * Performs a "log" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
+		// log(0) = -Inf
+		unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL);
+	}
+
+	/**
+	 * Performs a "floor" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
+		// floor(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL);
+	}
+
+	/**
+	 * Performs a "ceil" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
+		// ceil(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL);
+	}
+
+	/**
+	 * Performs a "sin" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
+		// sin(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL);
+	}
+
+	/**
+	 * Performs a "cos" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
+		// cos(0) = 1
+		unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL);
+	}
+
+	/**
+	 * Performs a "tan" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
+		// tan(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL);
+	}
+
+	/**
+	 * Performs an "asin" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
+		// asin(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL);
+	}
+
+	/**
+	 * Performs an "acos" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
+		// acos(0) = PI/2
+		unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL);
+	}
+
+	/**
+	 * Performs an "atan" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
+		// atan(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL);
+	}
+
+	/**
+	 * Performs a "sign" operation on a matrix on the GPU
+	 * @param ec	execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param in1	input matrix
+	 * @param outputName	output matrix name
+	 * @throws DMLRuntimeException	if DMLRuntimeException occurs
+	 */
+	public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+		LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
+		// sign(0) = 0
+		unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL);
+	}
+
+
+	/**
+	 * A helper function for all unary ops (sqrt, abs, sin, etc.)
+	 * @param ec valid execution context
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param in1 input matrix
+	 * @param kernel name of CUDA kernel for the unary op to execute
+	 * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
+	 * @param outputName output matrix name
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param kernelTimer the name of the timer to measure the kernel invocation
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
 		if (ec.getGPUContext() != gCtx)
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
-		LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
 		GPUObject in = in1.getGPUObject(gCtx);
 		boolean isSparseAndEmpty = in.isSparseAndEmpty();
 		long t1=0;
 		if (isSparseAndEmpty) {
-			// e^0 = 1, create a dense block full of 1s
 			MatrixObject out = ec.getMatrixObject(outputName);
 			ec.allocateGPUMatrixObject(outputName);
-			out.getGPUObject(gCtx).allocateAndFillDense(1);
+			out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
 		} else {
 			// Dense
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
 			Pointer output = getDensePointer(gCtx, out, instName);
-			// If the input is in sparse format, convert it to dense.
-			// The output will always be dense, because for all x, exp(x) > 0
 			Pointer input = getDensePointer(gCtx, in1, instName);
 			int size = (int)(in1.getNumColumns() * in1.getNumRows());
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			getCudaKernels(gCtx).launchKernel("matrix_exp", ExecutionConfig.getConfigForSimpleVectorOperations(size),
-							input, output, size);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_EXP_KERNEL, System.nanoTime() - t1);
+			getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size),
+					input, output, size);
+			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
 		}
 	}
 


[2/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs, log, floor, ceil, trig funcs & sign for GPU

Posted by na...@apache.org.
[SYSTEMML-1344] sqrt,round,abs,log,floor,ceil,trig funcs & sign for GPU

Closes #503


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1fc764b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1fc764b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1fc764b9

Branch: refs/heads/master
Commit: 1fc764b9b099271822056a82e248acdbb785dc63
Parents: 0d553e3
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 17 10:55:51 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 17 10:55:51 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/Makefile                   |   28 +
 src/main/cpp/kernels/SystemML.cu                |  187 ++
 src/main/cpp/kernels/SystemML.ptx               | 2506 ++++++++++++++----
 .../java/org/apache/sysml/hops/UnaryOp.java     |   10 +-
 .../instructions/GPUInstructionParser.java      |   19 +-
 .../instructions/gpu/GPUInstruction.java        |   72 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |   41 +-
 .../instructions/gpu/context/CSRPointer.java    |    2 +-
 .../instructions/gpu/context/GPUObject.java     |    4 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  226 +-
 10 files changed, 2577 insertions(+), 518 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/Makefile
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
new file mode 100644
index 0000000..0b003f3
--- /dev/null
+++ b/src/main/cpp/kernels/Makefile
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+NVCC=nvcc
+CUDAFLAGS= -ptx -c -arch=sm_30
+
+SystemML.o: SystemML.cu
+	$(NVCC) $(CUDAFLAGS)  SystemML.cu
+
+all: SystemML.o
+	;
+
+clean:
+	rm -rf SystemML.ptx

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 2651e4a..5b4574e 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -656,3 +656,190 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) {
         C[index] = exp(A[index]);
     }
 }
+
+/**
+ * Compute the square root of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = sqrt(A[index]);
+    }
+}
+
+/**
+ * Round each element of a matrix to the nearest integer
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_round(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = (double)llround(A[index]);
+    }
+}
+
+/**
+ * Compute the absolute value of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_abs(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = (double)fabs(A[index]);
+    }
+}
+
+/**
+ * Compute the natural logarithm of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_log(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = log(A[index]);
+    }
+}
+
+/**
+ * Compute the floor of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_floor(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = floor(A[index]);
+    }
+}
+
+/**
+ * Compute the ceiling of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_ceil(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = ceil(A[index]);
+    }
+}
+
+/**
+ * Compute the sine of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sin(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = sin(A[index]);
+    }
+}
+
+/**
+ * Compute the cosine of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_cos(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = cos(A[index]);
+    }
+}
+
+/**
+ * Compute the tangent of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_tan(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = tan(A[index]);
+    }
+}
+
+/**
+ * Compute the arcsine of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_asin(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = asin(A[index]);
+    }
+}
+
+/**
+ * Compute the arccosine of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_acos(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = acos(A[index]);
+    }
+}
+
+/**
+ * Compute the arctangent of each element of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_atan(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = atan(A[index]);
+    }
+}
+
+/**
+ * Compute the sign of each element of a matrix:
+ * -1, 0 or 1 for a negative, zero or positive element respectively
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sign(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        if (A[index] == 0.0) {
+            C[index] = 0.0;
+        } else {
+            C[index] = copysign(1.0, A[index]);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 50002f5..3229581 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21112126
-// Cuda compilation tools, release 8.0, V8.0.43
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -11,6 +11,12 @@
 .address_size 64
 
 	// .globl	copy_u2l_dense
+.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
+(
+	.param .b64 __internal_trig_reduction_slowpathd_param_0,
+	.param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+;
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
 	.param .b64 __internal_accurate_pow_param_0,
@@ -18,6 +24,8 @@
 )
 ;
 .extern .shared .align 8 .b8 sdata[];
+.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
+.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
 .visible .entry copy_u2l_dense(
 	.param .u64 copy_u2l_dense_param_0,
@@ -442,9 +450,9 @@ BB6_6:
 	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-	.reg .pred 	%p<54>;
-	.reg .b32 	%r<55>;
-	.reg .f64 	%fd<39>;
+	.reg .pred 	%p<52>;
+	.reg .b32 	%r<56>;
+	.reg .f64 	%fd<40>;
 	.reg .b64 	%rd<15>;
 
 
@@ -467,40 +475,40 @@ BB6_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.lt.s32	%p3, %r2, %r10;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB7_53;
+	@!%p4 bra 	BB7_55;
 	bra.uni 	BB7_1;
 
 BB7_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r53, %r1;
+	mov.u32 	%r54, %r1;
 	@%p5 bra 	BB7_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r54, %r3;
+	mov.u32 	%r55, %r3;
 	@%p6 bra 	BB7_4;
 
-	mov.u32 	%r54, %r2;
+	mov.u32 	%r55, %r2;
 
 BB7_4:
-	mov.u32 	%r48, %r54;
-	mov.u32 	%r4, %r48;
-	mov.u32 	%r53, %r4;
+	mov.u32 	%r49, %r55;
+	mov.u32 	%r4, %r49;
+	mov.u32 	%r54, %r4;
 
 BB7_5:
-	mov.u32 	%r5, %r53;
+	mov.u32 	%r5, %r54;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r51, %r1;
+	mov.u32 	%r52, %r1;
 	@%p7 bra 	BB7_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r52, %r3;
+	mov.u32 	%r53, %r3;
 	@%p8 bra 	BB7_8;
 
-	mov.u32 	%r52, %r2;
+	mov.u32 	%r53, %r2;
 
 BB7_8:
-	mov.u32 	%r51, %r52;
+	mov.u32 	%r52, %r53;
 
 BB7_9:
 	cvta.to.global.u64 	%rd5, %rd3;
@@ -508,10 +516,10 @@ BB7_9:
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r51, 8;
+	mul.wide.s32 	%rd9, %r52, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd38, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd39, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 5;
 	@%p9 bra 	BB7_19;
 
@@ -519,15 +527,15 @@ BB7_9:
 	@%p19 bra 	BB7_15;
 
 	setp.eq.s32	%p23, %r13, 0;
-	@%p23 bra 	BB7_51;
+	@%p23 bra 	BB7_53;
 
 	setp.eq.s32	%p24, %r13, 1;
-	@%p24 bra 	BB7_50;
+	@%p24 bra 	BB7_52;
 	bra.uni 	BB7_13;
 
-BB7_50:
-	sub.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+BB7_52:
+	sub.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
 BB7_19:
 	setp.gt.s32	%p10, %r13, 8;
@@ -542,12 +550,12 @@ BB7_19:
 
 BB7_33:
 	setp.gt.f64	%p29, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
+	bra.uni 	BB7_54;
 
 BB7_15:
 	setp.eq.s32	%p20, %r13, 3;
-	@%p20 bra 	BB7_49;
+	@%p20 bra 	BB7_51;
 
 	setp.eq.s32	%p21, %r13, 4;
 	@%p21 bra 	BB7_35;
@@ -583,7 +591,7 @@ BB7_35:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd37, [retval0+0];
+	ld.param.f64	%fd38, [retval0+0];
 	
 	//{
 	}// Callseq End 0
@@ -595,17 +603,17 @@ BB7_35:
 BB7_36:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd37;
+	mov.b64 	{%temp, %r23}, %fd38;
 	}
 	xor.b32  	%r24, %r23, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r25, %temp}, %fd37;
+	mov.b64 	{%r25, %temp}, %fd38;
 	}
-	mov.b64 	%fd37, {%r25, %r24};
+	mov.b64 	%fd38, {%r25, %r24};
 
 BB7_37:
-	mov.f64 	%fd36, %fd37;
+	mov.f64 	%fd37, %fd38;
 	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
 	@%p34 bra 	BB7_40;
 	bra.uni 	BB7_38;
@@ -616,7 +624,7 @@ BB7_40:
 	setp.lt.s32	%p38, %r9, 0;
 	selp.b32	%r28, %r27, %r26, %p38;
 	mov.u32 	%r29, 0;
-	mov.b64 	%fd36, {%r29, %r28};
+	mov.b64 	%fd37, {%r29, %r28};
 	bra.uni 	BB7_41;
 
 BB7_24:
@@ -629,8 +637,8 @@ BB7_24:
 
 BB7_32:
 	setp.eq.f64	%p27, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
+	bra.uni 	BB7_54;
 
 BB7_28:
 	setp.eq.s32	%p12, %r13, 11;
@@ -638,67 +646,67 @@ BB7_28:
 	bra.uni 	BB7_29;
 
 BB7_31:
-	min.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+	min.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
-BB7_51:
-	add.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+BB7_53:
+	add.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
 BB7_13:
 	setp.eq.s32	%p25, %r13, 2;
 	@%p25 bra 	BB7_14;
-	bra.uni 	BB7_52;
+	bra.uni 	BB7_54;
 
 BB7_14:
-	mul.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+	mul.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
 BB7_34:
 	setp.le.f64	%p30, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
+	bra.uni 	BB7_54;
 
 BB7_22:
 	setp.eq.s32	%p18, %r13, 8;
 	@%p18 bra 	BB7_23;
-	bra.uni 	BB7_52;
+	bra.uni 	BB7_54;
 
 BB7_23:
 	setp.ge.f64	%p28, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
+	bra.uni 	BB7_54;
 
-BB7_49:
-	div.rn.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+BB7_51:
+	div.rn.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
 BB7_17:
 	setp.eq.s32	%p22, %r13, 5;
 	@%p22 bra 	BB7_18;
-	bra.uni 	BB7_52;
+	bra.uni 	BB7_54;
 
 BB7_18:
 	setp.lt.f64	%p31, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
+	bra.uni 	BB7_54;
 
 BB7_26:
 	setp.eq.s32	%p15, %r13, 10;
 	@%p15 bra 	BB7_27;
-	bra.uni 	BB7_52;
+	bra.uni 	BB7_54;
 
 BB7_27:
 	setp.neu.f64	%p26, %fd1, %fd2;
-	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB7_52;
+	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB7_54;
 
 BB7_29:
 	setp.ne.s32	%p13, %r13, 12;
-	@%p13 bra 	BB7_52;
+	@%p13 bra 	BB7_54;
 
-	max.f64 	%fd38, %fd1, %fd2;
-	bra.uni 	BB7_52;
+	max.f64 	%fd39, %fd1, %fd2;
+	bra.uni 	BB7_54;
 
 BB7_38:
 	setp.gt.s32	%p35, %r8, -1;
@@ -706,10 +714,10 @@ BB7_38:
 
 	cvt.rzi.f64.f64	%fd29, %fd2;
 	setp.neu.f64	%p36, %fd29, %fd2;
-	selp.f64	%fd36, 0dFFF8000000000000, %fd36, %p36;
+	selp.f64	%fd37, 0dFFF8000000000000, %fd37, %p36;
 
 BB7_41:
-	mov.f64 	%fd17, %fd36;
+	mov.f64 	%fd17, %fd37;
 	add.f64 	%fd18, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
@@ -717,77 +725,79 @@ BB7_41:
 	}
 	and.b32  	%r31, %r30, 2146435072;
 	setp.ne.s32	%p39, %r31, 2146435072;
-	mov.f64 	%fd35, %fd17;
-	@%p39 bra 	BB7_48;
+	mov.f64 	%fd36, %fd17;
+	@%p39 bra 	BB7_50;
 
 	setp.gtu.f64	%p40, %fd11, 0d7FF0000000000000;
-	mov.f64 	%fd35, %fd18;
-	@%p40 bra 	BB7_48;
+	mov.f64 	%fd36, %fd18;
+	@%p40 bra 	BB7_50;
 
 	abs.f64 	%fd30, %fd2;
 	setp.gtu.f64	%p41, %fd30, 0d7FF0000000000000;
-	mov.f64 	%fd34, %fd18;
-	mov.f64 	%fd35, %fd34;
-	@%p41 bra 	BB7_48;
+	mov.f64 	%fd35, %fd18;
+	mov.f64 	%fd36, %fd35;
+	@%p41 bra 	BB7_50;
+
+	and.b32  	%r32, %r9, 2147483647;
+	setp.ne.s32	%p42, %r32, 2146435072;
+	@%p42 bra 	BB7_46;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r32, %temp}, %fd2;
+	mov.b64 	{%r33, %temp}, %fd2;
 	}
-	and.b32  	%r33, %r9, 2147483647;
-	setp.eq.s32	%p42, %r33, 2146435072;
-	setp.eq.s32	%p43, %r32, 0;
-	and.pred  	%p44, %p42, %p43;
-	@%p44 bra 	BB7_47;
-	bra.uni 	BB7_45;
+	setp.eq.s32	%p43, %r33, 0;
+	@%p43 bra 	BB7_49;
 
-BB7_47:
-	setp.gt.f64	%p48, %fd11, 0d3FF0000000000000;
-	selp.b32	%r41, 2146435072, 0, %p48;
-	xor.b32  	%r42, %r41, 2146435072;
-	setp.lt.s32	%p49, %r9, 0;
-	selp.b32	%r43, %r42, %r41, %p49;
-	setp.eq.f64	%p50, %fd1, 0dBFF0000000000000;
-	selp.b32	%r44, 1072693248, %r43, %p50;
-	mov.u32 	%r45, 0;
-	mov.b64 	%fd35, {%r45, %r44};
-	bra.uni 	BB7_48;
+BB7_46:
+	and.b32  	%r34, %r8, 2147483647;
+	setp.ne.s32	%p44, %r34, 2146435072;
+	mov.f64 	%fd33, %fd17;
+	mov.f64 	%fd36, %fd33;
+	@%p44 bra 	BB7_50;
 
-BB7_45:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r34, %temp}, %fd1;
+	mov.b64 	{%r35, %temp}, %fd1;
 	}
-	and.b32  	%r35, %r8, 2147483647;
-	setp.eq.s32	%p45, %r35, 2146435072;
-	setp.eq.s32	%p46, %r34, 0;
-	and.pred  	%p47, %p45, %p46;
-	mov.f64 	%fd35, %fd17;
-	@!%p47 bra 	BB7_48;
-	bra.uni 	BB7_46;
+	setp.ne.s32	%p45, %r35, 0;
+	mov.f64 	%fd36, %fd17;
+	@%p45 bra 	BB7_50;
 
-BB7_46:
 	shr.s32 	%r36, %r9, 31;
 	and.b32  	%r37, %r36, -2146435072;
-	selp.b32	%r38, -1048576, 2146435072, %p1;
-	add.s32 	%r39, %r38, %r37;
-	mov.u32 	%r40, 0;
-	mov.b64 	%fd35, {%r40, %r39};
+	add.s32 	%r38, %r37, 2146435072;
+	or.b32  	%r39, %r38, -2147483648;
+	selp.b32	%r40, %r39, %r38, %p1;
+	mov.u32 	%r41, 0;
+	mov.b64 	%fd36, {%r41, %r40};
+	bra.uni 	BB7_50;
 
-BB7_48:
-	setp.eq.f64	%p51, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p52, %fd1, 0d3FF0000000000000;
-	or.pred  	%p53, %p52, %p51;
-	selp.f64	%fd38, 0d3FF0000000000000, %fd35, %p53;
+BB7_49:
+	setp.gt.f64	%p46, %fd11, 0d3FF0000000000000;
+	selp.b32	%r42, 2146435072, 0, %p46;
+	xor.b32  	%r43, %r42, 2146435072;
+	setp.lt.s32	%p47, %r9, 0;
+	selp.b32	%r44, %r43, %r42, %p47;
+	setp.eq.f64	%p48, %fd1, 0dBFF0000000000000;
+	selp.b32	%r45, 1072693248, %r44, %p48;
+	mov.u32 	%r46, 0;
+	mov.b64 	%fd36, {%r46, %r45};
 
-BB7_52:
+BB7_50:
+	setp.eq.f64	%p49, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p50, %fd1, 0d3FF0000000000000;
+	or.pred  	%p51, %p50, %p49;
+	selp.f64	%fd39, 0d3FF0000000000000, %fd36, %p51;
+
+BB7_54:
 	cvta.to.global.u64 	%rd12, %rd4;
 	mul.wide.s32 	%rd13, %r3, 8;
 	add.s64 	%rd14, %rd12, %rd13;
-	st.global.f64 	[%rd14], %fd38;
+	st.global.f64 	[%rd14], %fd39;
 	bar.sync 	0;
 
-BB7_53:
+BB7_55:
 	ret;
 }
 
@@ -801,9 +811,9 @@ BB7_53:
 	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<95>;
-	.reg .b32 	%r<62>;
-	.reg .f64 	%fd<75>;
+	.reg .pred 	%p<91>;
+	.reg .b32 	%r<64>;
+	.reg .f64 	%fd<77>;
 	.reg .b64 	%rd<12>;
 
 
@@ -818,7 +828,7 @@ BB7_53:
 	mov.u32 	%r11, %tid.x;
 	mad.lo.s32 	%r1, %r10, %r9, %r11;
 	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB8_90;
+	@%p3 bra 	BB8_94;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -827,9 +837,9 @@ BB7_53:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB8_46;
+	@%p4 bra 	BB8_48;
 
-	mov.f64 	%fd66, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd67, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p5, %r6, 5;
 	@%p5 bra 	BB8_12;
 
@@ -837,34 +847,34 @@ BB7_53:
 	@%p15 bra 	BB8_8;
 
 	setp.eq.s32	%p19, %r6, 0;
-	@%p19 bra 	BB8_44;
+	@%p19 bra 	BB8_46;
 
 	setp.eq.s32	%p20, %r6, 1;
-	@%p20 bra 	BB8_43;
+	@%p20 bra 	BB8_45;
 	bra.uni 	BB8_6;
 
-BB8_43:
-	sub.f64 	%fd66, %fd52, %fd1;
-	bra.uni 	BB8_45;
+BB8_45:
+	sub.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB8_47;
 
-BB8_46:
-	mov.f64 	%fd74, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p50, %r6, 5;
-	@%p50 bra 	BB8_56;
+BB8_48:
+	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p48, %r6, 5;
+	@%p48 bra 	BB8_58;
 
-	setp.gt.s32	%p60, %r6, 2;
-	@%p60 bra 	BB8_52;
+	setp.gt.s32	%p58, %r6, 2;
+	@%p58 bra 	BB8_54;
 
-	setp.eq.s32	%p64, %r6, 0;
-	@%p64 bra 	BB8_88;
+	setp.eq.s32	%p62, %r6, 0;
+	@%p62 bra 	BB8_92;
 
-	setp.eq.s32	%p65, %r6, 1;
-	@%p65 bra 	BB8_87;
-	bra.uni 	BB8_50;
+	setp.eq.s32	%p63, %r6, 1;
+	@%p63 bra 	BB8_91;
+	bra.uni 	BB8_52;
 
-BB8_87:
-	sub.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+BB8_91:
+	sub.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
 BB8_12:
 	setp.gt.s32	%p6, %r6, 8;
@@ -879,28 +889,28 @@ BB8_12:
 
 BB8_26:
 	setp.lt.f64	%p25, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+	bra.uni 	BB8_47;
 
-BB8_56:
-	setp.gt.s32	%p51, %r6, 8;
-	@%p51 bra 	BB8_61;
+BB8_58:
+	setp.gt.s32	%p49, %r6, 8;
+	@%p49 bra 	BB8_63;
 
-	setp.eq.s32	%p57, %r6, 6;
-	@%p57 bra 	BB8_71;
+	setp.eq.s32	%p55, %r6, 6;
+	@%p55 bra 	BB8_73;
 
-	setp.eq.s32	%p58, %r6, 7;
-	@%p58 bra 	BB8_70;
-	bra.uni 	BB8_59;
+	setp.eq.s32	%p56, %r6, 7;
+	@%p56 bra 	BB8_72;
+	bra.uni 	BB8_61;
 
-BB8_70:
-	setp.gt.f64	%p70, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
-	bra.uni 	BB8_89;
+BB8_72:
+	setp.gt.f64	%p68, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
+	bra.uni 	BB8_93;
 
 BB8_8:
 	setp.eq.s32	%p16, %r6, 3;
-	@%p16 bra 	BB8_42;
+	@%p16 bra 	BB8_44;
 
 	setp.eq.s32	%p17, %r6, 4;
 	@%p17 bra 	BB8_28;
@@ -936,7 +946,7 @@ BB8_28:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd65, [retval0+0];
+	ld.param.f64	%fd66, [retval0+0];
 	
 	//{
 	}// Callseq End 1
@@ -948,17 +958,17 @@ BB8_28:
 BB8_29:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r14}, %fd65;
+	mov.b64 	{%temp, %r14}, %fd66;
 	}
 	xor.b32  	%r15, %r14, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r16, %temp}, %fd65;
+	mov.b64 	{%r16, %temp}, %fd66;
 	}
-	mov.b64 	%fd65, {%r16, %r15};
+	mov.b64 	%fd66, {%r16, %r15};
 
 BB8_30:
-	mov.f64 	%fd64, %fd65;
+	mov.f64 	%fd65, %fd66;
 	setp.eq.f64	%p30, %fd52, 0d0000000000000000;
 	@%p30 bra 	BB8_33;
 	bra.uni 	BB8_31;
@@ -969,7 +979,7 @@ BB8_33:
 	setp.lt.s32	%p34, %r3, 0;
 	selp.b32	%r19, %r18, %r17, %p34;
 	mov.u32 	%r20, 0;
-	mov.b64 	%fd64, {%r20, %r19};
+	mov.b64 	%fd65, {%r20, %r19};
 	bra.uni 	BB8_34;
 
 BB8_17:
@@ -982,18 +992,18 @@ BB8_17:
 
 BB8_25:
 	setp.eq.f64	%p23, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+	bra.uni 	BB8_47;
 
-BB8_52:
-	setp.eq.s32	%p61, %r6, 3;
-	@%p61 bra 	BB8_86;
+BB8_54:
+	setp.eq.s32	%p59, %r6, 3;
+	@%p59 bra 	BB8_90;
 
-	setp.eq.s32	%p62, %r6, 4;
-	@%p62 bra 	BB8_72;
-	bra.uni 	BB8_54;
+	setp.eq.s32	%p60, %r6, 4;
+	@%p60 bra 	BB8_74;
+	bra.uni 	BB8_56;
 
-BB8_72:
+BB8_74:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1002,11 +1012,11 @@ BB8_72:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd52;
 	}
-	bfe.u32 	%r37, %r5, 20, 11;
-	add.s32 	%r38, %r37, -1012;
+	bfe.u32 	%r38, %r5, 20, 11;
+	add.s32 	%r39, %r38, -1012;
 	mov.b64 	 %rd11, %fd52;
-	shl.b64 	%rd3, %rd11, %r38;
-	setp.eq.s64	%p73, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd11, %r39;
+	setp.eq.s64	%p71, %rd3, -9223372036854775808;
 	abs.f64 	%fd35, %fd1;
 	// Callseq Start 2
 	{
@@ -1023,54 +1033,54 @@ BB8_72:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd73, [retval0+0];
+	ld.param.f64	%fd75, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p74, %r4, 0;
-	and.pred  	%p2, %p74, %p73;
-	@!%p2 bra 	BB8_74;
-	bra.uni 	BB8_73;
+	setp.lt.s32	%p72, %r4, 0;
+	and.pred  	%p2, %p72, %p71;
+	@!%p2 bra 	BB8_76;
+	bra.uni 	BB8_75;
 
-BB8_73:
+BB8_75:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r39}, %fd73;
+	mov.b64 	{%temp, %r40}, %fd75;
 	}
-	xor.b32  	%r40, %r39, -2147483648;
+	xor.b32  	%r41, %r40, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r41, %temp}, %fd73;
+	mov.b64 	{%r42, %temp}, %fd75;
 	}
-	mov.b64 	%fd73, {%r41, %r40};
-
-BB8_74:
-	mov.f64 	%fd72, %fd73;
-	setp.eq.f64	%p75, %fd1, 0d0000000000000000;
-	@%p75 bra 	BB8_77;
-	bra.uni 	BB8_75;
+	mov.b64 	%fd75, {%r42, %r41};
 
-BB8_77:
-	selp.b32	%r42, %r4, 0, %p73;
-	or.b32  	%r43, %r42, 2146435072;
-	setp.lt.s32	%p79, %r5, 0;
-	selp.b32	%r44, %r43, %r42, %p79;
-	mov.u32 	%r45, 0;
-	mov.b64 	%fd72, {%r45, %r44};
-	bra.uni 	BB8_78;
+BB8_76:
+	mov.f64 	%fd74, %fd75;
+	setp.eq.f64	%p73, %fd1, 0d0000000000000000;
+	@%p73 bra 	BB8_79;
+	bra.uni 	BB8_77;
+
+BB8_79:
+	selp.b32	%r43, %r4, 0, %p71;
+	or.b32  	%r44, %r43, 2146435072;
+	setp.lt.s32	%p77, %r5, 0;
+	selp.b32	%r45, %r44, %r43, %p77;
+	mov.u32 	%r46, 0;
+	mov.b64 	%fd74, {%r46, %r45};
+	bra.uni 	BB8_80;
 
-BB8_61:
-	setp.gt.s32	%p52, %r6, 10;
-	@%p52 bra 	BB8_65;
+BB8_63:
+	setp.gt.s32	%p50, %r6, 10;
+	@%p50 bra 	BB8_67;
 
-	setp.eq.s32	%p55, %r6, 9;
-	@%p55 bra 	BB8_69;
-	bra.uni 	BB8_63;
+	setp.eq.s32	%p53, %r6, 9;
+	@%p53 bra 	BB8_71;
+	bra.uni 	BB8_65;
 
-BB8_69:
-	setp.eq.f64	%p68, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
-	bra.uni 	BB8_89;
+BB8_71:
+	setp.eq.f64	%p66, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+	bra.uni 	BB8_93;
 
 BB8_21:
 	setp.eq.s32	%p8, %r6, 11;
@@ -1078,135 +1088,135 @@ BB8_21:
 	bra.uni 	BB8_22;
 
 BB8_24:
-	min.f64 	%fd66, %fd52, %fd1;
-	bra.uni 	BB8_45;
+	min.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB8_47;
 
-BB8_44:
-	add.f64 	%fd66, %fd1, %fd52;
-	bra.uni 	BB8_45;
+BB8_46:
+	add.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB8_47;
 
 BB8_6:
 	setp.eq.s32	%p21, %r6, 2;
 	@%p21 bra 	BB8_7;
-	bra.uni 	BB8_45;
+	bra.uni 	BB8_47;
 
 BB8_7:
-	mul.f64 	%fd66, %fd1, %fd52;
-	bra.uni 	BB8_45;
+	mul.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB8_47;
 
 BB8_27:
 	setp.ge.f64	%p26, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB8_47;
 
 BB8_15:
 	setp.eq.s32	%p14, %r6, 8;
 	@%p14 bra 	BB8_16;
-	bra.uni 	BB8_45;
+	bra.uni 	BB8_47;
 
 BB8_16:
 	setp.le.f64	%p24, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+	bra.uni 	BB8_47;
 
-BB8_42:
-	div.rn.f64 	%fd66, %fd52, %fd1;
-	bra.uni 	BB8_45;
+BB8_44:
+	div.rn.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB8_47;
 
 BB8_10:
 	setp.eq.s32	%p18, %r6, 5;
 	@%p18 bra 	BB8_11;
-	bra.uni 	BB8_45;
+	bra.uni 	BB8_47;
 
 BB8_11:
 	setp.gt.f64	%p27, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+	bra.uni 	BB8_47;
 
-BB8_65:
-	setp.eq.s32	%p53, %r6, 11;
-	@%p53 bra 	BB8_68;
-	bra.uni 	BB8_66;
+BB8_67:
+	setp.eq.s32	%p51, %r6, 11;
+	@%p51 bra 	BB8_70;
+	bra.uni 	BB8_68;
 
-BB8_68:
-	min.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+BB8_70:
+	min.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
 BB8_19:
 	setp.eq.s32	%p11, %r6, 10;
 	@%p11 bra 	BB8_20;
-	bra.uni 	BB8_45;
+	bra.uni 	BB8_47;
 
 BB8_20:
 	setp.neu.f64	%p22, %fd1, %fd52;
-	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
-	bra.uni 	BB8_45;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+	bra.uni 	BB8_47;
 
 BB8_22:
 	setp.ne.s32	%p9, %r6, 12;
-	@%p9 bra 	BB8_45;
-
-	max.f64 	%fd66, %fd52, %fd1;
-	bra.uni 	BB8_45;
+	@%p9 bra 	BB8_47;
 
-BB8_88:
-	add.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+	max.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB8_47;
 
-BB8_50:
-	setp.eq.s32	%p66, %r6, 2;
-	@%p66 bra 	BB8_51;
-	bra.uni 	BB8_89;
+BB8_92:
+	add.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
-BB8_51:
-	mul.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+BB8_52:
+	setp.eq.s32	%p64, %r6, 2;
+	@%p64 bra 	BB8_53;
+	bra.uni 	BB8_93;
 
-BB8_71:
-	setp.le.f64	%p71, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
-	bra.uni 	BB8_89;
+BB8_53:
+	mul.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
-BB8_59:
-	setp.eq.s32	%p59, %r6, 8;
-	@%p59 bra 	BB8_60;
-	bra.uni 	BB8_89;
+BB8_73:
+	setp.le.f64	%p69, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+	bra.uni 	BB8_93;
 
-BB8_60:
-	setp.ge.f64	%p69, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
-	bra.uni 	BB8_89;
+BB8_61:
+	setp.eq.s32	%p57, %r6, 8;
+	@%p57 bra 	BB8_62;
+	bra.uni 	BB8_93;
 
-BB8_86:
-	div.rn.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+BB8_62:
+	setp.ge.f64	%p67, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
+	bra.uni 	BB8_93;
 
-BB8_54:
-	setp.eq.s32	%p63, %r6, 5;
-	@%p63 bra 	BB8_55;
-	bra.uni 	BB8_89;
+BB8_90:
+	div.rn.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
-BB8_55:
-	setp.lt.f64	%p72, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
-	bra.uni 	BB8_89;
+BB8_56:
+	setp.eq.s32	%p61, %r6, 5;
+	@%p61 bra 	BB8_57;
+	bra.uni 	BB8_93;
 
-BB8_63:
-	setp.eq.s32	%p56, %r6, 10;
-	@%p56 bra 	BB8_64;
-	bra.uni 	BB8_89;
+BB8_57:
+	setp.lt.f64	%p70, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+	bra.uni 	BB8_93;
 
-BB8_64:
-	setp.neu.f64	%p67, %fd1, %fd52;
-	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
-	bra.uni 	BB8_89;
+BB8_65:
+	setp.eq.s32	%p54, %r6, 10;
+	@%p54 bra 	BB8_66;
+	bra.uni 	BB8_93;
 
 BB8_66:
-	setp.ne.s32	%p54, %r6, 12;
-	@%p54 bra 	BB8_89;
+	setp.neu.f64	%p65, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+	bra.uni 	BB8_93;
 
-	max.f64 	%fd74, %fd1, %fd52;
-	bra.uni 	BB8_89;
+BB8_68:
+	setp.ne.s32	%p52, %r6, 12;
+	@%p52 bra 	BB8_93;
+
+	max.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB8_93;
 
 BB8_31:
 	setp.gt.s32	%p31, %r2, -1;
@@ -1214,10 +1224,10 @@ BB8_31:
 
 	cvt.rzi.f64.f64	%fd54, %fd1;
 	setp.neu.f64	%p32, %fd54, %fd1;
-	selp.f64	%fd64, 0dFFF8000000000000, %fd64, %p32;
+	selp.f64	%fd65, 0dFFF8000000000000, %fd65, %p32;
 
 BB8_34:
-	mov.f64 	%fd16, %fd64;
+	mov.f64 	%fd16, %fd65;
 	add.f64 	%fd17, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
@@ -1225,157 +1235,161 @@ BB8_34:
 	}
 	and.b32  	%r22, %r21, 2146435072;
 	setp.ne.s32	%p35, %r22, 2146435072;
-	mov.f64 	%fd63, %fd16;
-	@%p35 bra 	BB8_41;
+	mov.f64 	%fd64, %fd16;
+	@%p35 bra 	BB8_43;
 
 	setp.gtu.f64	%p36, %fd10, 0d7FF0000000000000;
-	mov.f64 	%fd63, %fd17;
-	@%p36 bra 	BB8_41;
+	mov.f64 	%fd64, %fd17;
+	@%p36 bra 	BB8_43;
 
 	abs.f64 	%fd55, %fd1;
 	setp.gtu.f64	%p37, %fd55, 0d7FF0000000000000;
-	mov.f64 	%fd62, %fd17;
-	mov.f64 	%fd63, %fd62;
-	@%p37 bra 	BB8_41;
+	mov.f64 	%fd63, %fd17;
+	mov.f64 	%fd64, %fd63;
+	@%p37 bra 	BB8_43;
+
+	and.b32  	%r23, %r3, 2147483647;
+	setp.ne.s32	%p38, %r23, 2146435072;
+	@%p38 bra 	BB8_39;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r23, %temp}, %fd1;
+	mov.b64 	{%r24, %temp}, %fd1;
 	}
-	and.b32  	%r24, %r3, 2147483647;
-	setp.eq.s32	%p38, %r24, 2146435072;
-	setp.eq.s32	%p39, %r23, 0;
-	and.pred  	%p40, %p38, %p39;
-	@%p40 bra 	BB8_40;
-	bra.uni 	BB8_38;
+	setp.eq.s32	%p39, %r24, 0;
+	@%p39 bra 	BB8_42;
 
-BB8_40:
-	setp.gt.f64	%p44, %fd10, 0d3FF0000000000000;
-	selp.b32	%r32, 2146435072, 0, %p44;
-	xor.b32  	%r33, %r32, 2146435072;
-	setp.lt.s32	%p45, %r3, 0;
-	selp.b32	%r34, %r33, %r32, %p45;
-	setp.eq.f64	%p46, %fd52, 0dBFF0000000000000;
-	selp.b32	%r35, 1072693248, %r34, %p46;
-	mov.u32 	%r36, 0;
-	mov.b64 	%fd63, {%r36, %r35};
-	bra.uni 	BB8_41;
+BB8_39:
+	and.b32  	%r25, %r2, 2147483647;
+	setp.ne.s32	%p40, %r25, 2146435072;
+	mov.f64 	%fd61, %fd16;
+	mov.f64 	%fd64, %fd61;
+	@%p40 bra 	BB8_43;
 
-BB8_75:
-	setp.gt.s32	%p76, %r4, -1;
-	@%p76 bra 	BB8_78;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r26, %temp}, %fd52;
+	}
+	setp.ne.s32	%p41, %r26, 0;
+	mov.f64 	%fd64, %fd16;
+	@%p41 bra 	BB8_43;
+
+	shr.s32 	%r27, %r3, 31;
+	and.b32  	%r28, %r27, -2146435072;
+	add.s32 	%r29, %r28, 2146435072;
+	or.b32  	%r30, %r29, -2147483648;
+	selp.b32	%r31, %r30, %r29, %p1;
+	mov.u32 	%r32, 0;
+	mov.b64 	%fd64, {%r32, %r31};
+	bra.uni 	BB8_43;
+
+BB8_77:
+	setp.gt.s32	%p74, %r4, -1;
+	@%p74 bra 	BB8_80;
 
 	cvt.rzi.f64.f64	%fd57, %fd52;
-	setp.neu.f64	%p77, %fd57, %fd52;
-	selp.f64	%fd72, 0dFFF8000000000000, %fd72, %p77;
+	setp.neu.f64	%p75, %fd57, %fd52;
+	selp.f64	%fd74, 0dFFF8000000000000, %fd74, %p75;
 
-BB8_78:
-	mov.f64 	%fd41, %fd72;
+BB8_80:
+	mov.f64 	%fd41, %fd74;
 	add.f64 	%fd42, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r46}, %fd42;
+	mov.b64 	{%temp, %r47}, %fd42;
 	}
-	and.b32  	%r47, %r46, 2146435072;
-	setp.ne.s32	%p80, %r47, 2146435072;
-	mov.f64 	%fd71, %fd41;
-	@%p80 bra 	BB8_85;
+	and.b32  	%r48, %r47, 2146435072;
+	setp.ne.s32	%p78, %r48, 2146435072;
+	mov.f64 	%fd73, %fd41;
+	@%p78 bra 	BB8_89;
 
-	setp.gtu.f64	%p81, %fd35, 0d7FF0000000000000;
-	mov.f64 	%fd71, %fd42;
-	@%p81 bra 	BB8_85;
+	setp.gtu.f64	%p79, %fd35, 0d7FF0000000000000;
+	mov.f64 	%fd73, %fd42;
+	@%p79 bra 	BB8_89;
 
 	abs.f64 	%fd58, %fd52;
-	setp.gtu.f64	%p82, %fd58, 0d7FF0000000000000;
-	mov.f64 	%fd70, %fd42;
-	mov.f64 	%fd71, %fd70;
-	@%p82 bra 	BB8_85;
+	setp.gtu.f64	%p80, %fd58, 0d7FF0000000000000;
+	mov.f64 	%fd72, %fd42;
+	mov.f64 	%fd73, %fd72;
+	@%p80 bra 	BB8_89;
+
+	and.b32  	%r49, %r5, 2147483647;
+	setp.ne.s32	%p81, %r49, 2146435072;
+	@%p81 bra 	BB8_85;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r48, %temp}, %fd52;
+	mov.b64 	{%r50, %temp}, %fd52;
 	}
-	and.b32  	%r49, %r5, 2147483647;
-	setp.eq.s32	%p83, %r49, 2146435072;
-	setp.eq.s32	%p84, %r48, 0;
-	and.pred  	%p85, %p83, %p84;
-	@%p85 bra 	BB8_84;
-	bra.uni 	BB8_82;
-
-BB8_84:
-	setp.gt.f64	%p89, %fd35, 0d3FF0000000000000;
-	selp.b32	%r57, 2146435072, 0, %p89;
-	xor.b32  	%r58, %r57, 2146435072;
-	setp.lt.s32	%p90, %r5, 0;
-	selp.b32	%r59, %r58, %r57, %p90;
-	setp.eq.f64	%p91, %fd1, 0dBFF0000000000000;
-	selp.b32	%r60, 1072693248, %r59, %p91;
-	mov.u32 	%r61, 0;
-	mov.b64 	%fd71, {%r61, %r60};
-	bra.uni 	BB8_85;
-
-BB8_38:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r25, %temp}, %fd52;
-	}
-	and.b32  	%r26, %r2, 2147483647;
-	setp.eq.s32	%p41, %r26, 2146435072;
-	setp.eq.s32	%p42, %r25, 0;
-	and.pred  	%p43, %p41, %p42;
-	mov.f64 	%fd63, %fd16;
-	@!%p43 bra 	BB8_41;
-	bra.uni 	BB8_39;
-
-BB8_39:
-	shr.s32 	%r27, %r3, 31;
-	and.b32  	%r28, %r27, -2146435072;
-	selp.b32	%r29, -1048576, 2146435072, %p1;
-	add.s32 	%r30, %r29, %r28;
-	mov.u32 	%r31, 0;
-	mov.b64 	%fd63, {%r31, %r30};
+	setp.eq.s32	%p82, %r50, 0;
+	@%p82 bra 	BB8_88;
 
-BB8_41:
-	setp.eq.f64	%p47, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p48, %fd52, 0d3FF0000000000000;
-	or.pred  	%p49, %p48, %p47;
-	selp.f64	%fd66, 0d3FF0000000000000, %fd63, %p49;
-
-BB8_45:
-	st.global.f64 	[%rd1], %fd66;
-	bra.uni 	BB8_90;
+BB8_85:
+	and.b32  	%r51, %r4, 2147483647;
+	setp.ne.s32	%p83, %r51, 2146435072;
+	mov.f64 	%fd70, %fd41;
+	mov.f64 	%fd73, %fd70;
+	@%p83 bra 	BB8_89;
 
-BB8_82:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r50, %temp}, %fd1;
+	mov.b64 	{%r52, %temp}, %fd1;
 	}
-	and.b32  	%r51, %r4, 2147483647;
-	setp.eq.s32	%p86, %r51, 2146435072;
-	setp.eq.s32	%p87, %r50, 0;
-	and.pred  	%p88, %p86, %p87;
-	mov.f64 	%fd71, %fd41;
-	@!%p88 bra 	BB8_85;
-	bra.uni 	BB8_83;
-
-BB8_83:
-	shr.s32 	%r52, %r5, 31;
-	and.b32  	%r53, %r52, -2146435072;
-	selp.b32	%r54, -1048576, 2146435072, %p2;
-	add.s32 	%r55, %r54, %r53;
-	mov.u32 	%r56, 0;
-	mov.b64 	%fd71, {%r56, %r55};
+	setp.ne.s32	%p84, %r52, 0;
+	mov.f64 	%fd73, %fd41;
+	@%p84 bra 	BB8_89;
+
+	shr.s32 	%r53, %r5, 31;
+	and.b32  	%r54, %r53, -2146435072;
+	add.s32 	%r55, %r54, 2146435072;
+	or.b32  	%r56, %r55, -2147483648;
+	selp.b32	%r57, %r56, %r55, %p2;
+	mov.u32 	%r58, 0;
+	mov.b64 	%fd73, {%r58, %r57};
+	bra.uni 	BB8_89;
 
-BB8_85:
-	setp.eq.f64	%p92, %fd52, 0d0000000000000000;
-	setp.eq.f64	%p93, %fd1, 0d3FF0000000000000;
-	or.pred  	%p94, %p93, %p92;
-	selp.f64	%fd74, 0d3FF0000000000000, %fd71, %p94;
+BB8_42:
+	setp.gt.f64	%p42, %fd10, 0d3FF0000000000000;
+	selp.b32	%r33, 2146435072, 0, %p42;
+	xor.b32  	%r34, %r33, 2146435072;
+	setp.lt.s32	%p43, %r3, 0;
+	selp.b32	%r35, %r34, %r33, %p43;
+	setp.eq.f64	%p44, %fd52, 0dBFF0000000000000;
+	selp.b32	%r36, 1072693248, %r35, %p44;
+	mov.u32 	%r37, 0;
+	mov.b64 	%fd64, {%r37, %r36};
+
+BB8_43:
+	setp.eq.f64	%p45, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p46, %fd52, 0d3FF0000000000000;
+	or.pred  	%p47, %p46, %p45;
+	selp.f64	%fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB8_47:
+	st.global.f64 	[%rd1], %fd67;
+	bra.uni 	BB8_94;
+
+BB8_88:
+	setp.gt.f64	%p85, %fd35, 0d3FF0000000000000;
+	selp.b32	%r59, 2146435072, 0, %p85;
+	xor.b32  	%r60, %r59, 2146435072;
+	setp.lt.s32	%p86, %r5, 0;
+	selp.b32	%r61, %r60, %r59, %p86;
+	setp.eq.f64	%p87, %fd1, 0dBFF0000000000000;
+	selp.b32	%r62, 1072693248, %r61, %p87;
+	mov.u32 	%r63, 0;
+	mov.b64 	%fd73, {%r63, %r62};
 
 BB8_89:
-	st.global.f64 	[%rd1], %fd74;
+	setp.eq.f64	%p88, %fd52, 0d0000000000000000;
+	setp.eq.f64	%p89, %fd1, 0d3FF0000000000000;
+	or.pred  	%p90, %p89, %p88;
+	selp.f64	%fd76, 0d3FF0000000000000, %fd73, %p90;
 
-BB8_90:
+BB8_93:
+	st.global.f64 	[%rd1], %fd76;
+
+BB8_94:
 	bar.sync 	0;
 	ret;
 }
@@ -2928,7 +2942,7 @@ BB19_35:
 	.reg .pred 	%p<20>;
 	.reg .b32 	%r<39>;
 	.reg .f64 	%fd<76>;
-	.reg .b64 	%rd<42>;
+	.reg .b64 	%rd<43>;
 
 
 	ld.param.u64 	%rd1, [reduce_row_mean_param_0];
@@ -3095,12 +3109,13 @@ BB20_33:
 	@%p19 bra 	BB20_35;
 
 	ld.shared.f64 	%fd40, [sdata];
-	cvt.rn.f64.s32	%fd41, %r4;
+	cvt.u64.u32	%rd39, %r4;
+	cvt.rn.f64.s64	%fd41, %rd39;
 	div.rn.f64 	%fd42, %fd40, %fd41;
-	cvta.to.global.u64 	%rd39, %rd2;
-	mul.wide.u32 	%rd40, %r6, 8;
-	add.s64 	%rd41, %rd39, %rd40;
-	st.global.f64 	[%rd41], %fd42;
+	cvta.to.global.u64 	%rd40, %rd2;
+	mul.wide.u32 	%rd41, %r6, 8;
+	add.s64 	%rd42, %rd40, %rd41;
+	st.global.f64 	[%rd42], %fd42;
 
 BB20_35:
 	ret;
@@ -3117,7 +3132,7 @@ BB20_35:
 	.reg .pred 	%p<4>;
 	.reg .b32 	%r<11>;
 	.reg .f64 	%fd<12>;
-	.reg .b64 	%rd<9>;
+	.reg .b64 	%rd<10>;
 
 
 	ld.param.u64 	%rd2, [reduce_col_mean_param_0];
@@ -3154,11 +3169,12 @@ BB21_3:
 
 BB21_4:
 	cvta.to.global.u64 	%rd6, %rd3;
-	cvt.rn.f64.s32	%fd7, %r5;
+	cvt.u64.u32	%rd7, %r5;
+	cvt.rn.f64.s64	%fd7, %rd7;
 	div.rn.f64 	%fd8, %fd10, %fd7;
-	mul.wide.u32 	%rd7, %r1, 8;
-	add.s64 	%rd8, %rd6, %rd7;
-	st.global.f64 	[%rd8], %fd8;
+	mul.wide.u32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd6, %rd8;
+	st.global.f64 	[%rd9], %fd8;
 
 BB21_5:
 	ret;
@@ -3277,82 +3293,1638 @@ BB22_5:
 	ret;
 }
 
-.func  (.param .b64 func_retval0) __internal_accurate_pow(
-	.param .b64 __internal_accurate_pow_param_0,
-	.param .b64 __internal_accurate_pow_param_1
+	// .globl	matrix_sqrt
+.visible .entry matrix_sqrt(
+	.param .u64 matrix_sqrt_param_0,
+	.param .u64 matrix_sqrt_param_1,
+	.param .u32 matrix_sqrt_param_2
 )
 {
-	.reg .pred 	%p<10>;
-	.reg .f32 	%f<3>;
-	.reg .b32 	%r<52>;
-	.reg .f64 	%fd<134>;
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<8>;
 
 
-	ld.param.f64 	%fd12, [__internal_accurate_pow_param_0];
-	ld.param.f64 	%fd13, [__internal_accurate_pow_param_1];
+	ld.param.u64 	%rd1, [matrix_sqrt_param_0];
+	ld.param.u64 	%rd2, [matrix_sqrt_param_1];
+	ld.param.u32 	%r2, [matrix_sqrt_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB23_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	sqrt.rn.f64 	%fd2, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f64 	[%rd7], %fd2;
+
+BB23_2:
+	ret;
+}
+
+	// .globl	matrix_round
+.visible .entry matrix_round(
+	.param .u64 matrix_round_param_0,
+	.param .u64 matrix_round_param_1,
+	.param .u32 matrix_round_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<10>;
+	.reg .b64 	%rd<11>;
+
+
+	ld.param.u64 	%rd2, [matrix_round_param_0];
+	ld.param.u64 	%rd3, [matrix_round_param_1];
+	ld.param.u32 	%r2, [matrix_round_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB24_4;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd9, [%rd6];
+	abs.f64 	%fd2, %fd9;
+	setp.ge.f64	%p2, %fd2, 0d4330000000000000;
+	@%p2 bra 	BB24_3;
+
+	add.f64 	%fd5, %fd2, 0d3FE0000000000000;
+	cvt.rzi.f64.f64	%fd6, %fd5;
+	setp.lt.f64	%p3, %fd2, 0d3FE0000000000000;
+	selp.f64	%fd7, 0d0000000000000000, %fd6, %p3;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r49}, %fd12;
+	mov.b64 	{%r6, %temp}, %fd7;
 	}
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r48, %temp}, %fd12;
+	mov.b64 	{%temp, %r7}, %fd7;
 	}
-	shr.u32 	%r50, %r49, 20;
-	setp.ne.s32	%p1, %r50, 0;
-	@%p1 bra 	BB23_2;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd9;
+	}
+	and.b32  	%r9, %r8, -2147483648;
+	or.b32  	%r10, %r7, %r9;
+	mov.b64 	%fd9, {%r6, %r10};
 
-	mul.f64 	%fd14, %fd12, 0d4350000000000000;
+BB24_3:
+	cvta.to.global.u64 	%rd7, %rd3;
+	cvt.rzi.s64.f64	%rd8, %fd9;
+	cvt.rn.f64.s64	%fd8, %rd8;
+	shl.b64 	%rd9, %rd1, 3;
+	add.s64 	%rd10, %rd7, %rd9;
+	st.global.f64 	[%rd10], %fd8;
+
+BB24_4:
+	ret;
+}
+
+	// .globl	matrix_abs
+.visible .entry matrix_abs(
+	.param .u64 matrix_abs_param_0,
+	.param .u64 matrix_abs_param_1,
+	.param .u32 matrix_abs_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [matrix_abs_param_0];
+	ld.param.u64 	%rd2, [matrix_abs_param_1];
+	ld.param.u32 	%r2, [matrix_abs_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB25_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	abs.f64 	%fd2, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f64 	[%rd7], %fd2;
+
+BB25_2:
+	ret;
+}
+
+	// .globl	matrix_log
+.visible .entry matrix_log(
+	.param .u64 matrix_log_param_0,
+	.param .u64 matrix_log_param_1,
+	.param .u32 matrix_log_param_2
+)
+{
+	.reg .pred 	%p<6>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<33>;
+	.reg .f64 	%fd<59>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_log_param_0];
+	ld.param.u64 	%rd3, [matrix_log_param_1];
+	ld.param.u32 	%r12, [matrix_log_param_2];
+	mov.u32 	%r13, %ctaid.x;
+	mov.u32 	%r14, %ntid.x;
+	mov.u32 	%r15, %tid.x;
+	mad.lo.s32 	%r1, %r14, %r13, %r15;
+	setp.ge.u32	%p1, %r1, %r12;
+	@%p1 bra 	BB26_9;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd56, [%rd6];
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r49}, %fd14;
+	mov.b64 	{%temp, %r29}, %fd56;
 	}
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r48, %temp}, %fd14;
+	mov.b64 	{%r30, %temp}, %fd56;
 	}
-	shr.u32 	%r16, %r49, 20;
-	add.s32 	%r50, %r16, -54;
+	mov.u32 	%r31, -1023;
+	setp.gt.s32	%p2, %r29, 1048575;
+	@%p2 bra 	BB26_3;
 
-BB23_2:
-	add.s32 	%r51, %r50, -1023;
-	and.b32  	%r17, %r49, -2146435073;
-	or.b32  	%r18, %r17, 1072693248;
-	mov.b64 	%fd132, {%r48, %r18};
-	setp.lt.u32	%p2, %r18, 1073127583;
-	@%p2 bra 	BB23_4;
+	mul.f64 	%fd56, %fd56, 0d4350000000000000;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r29}, %fd56;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r30, %temp}, %fd56;
+	}
+	mov.u32 	%r31, -1077;
+
+BB26_3:
+	add.s32 	%r18, %r29, -1;
+	setp.lt.u32	%p3, %r18, 2146435071;
+	@%p3 bra 	BB26_5;
+	bra.uni 	BB26_4;
+
+BB26_5:
+	shr.u32 	%r20, %r29, 20;
+	add.s32 	%r32, %r31, %r20;
+	and.b32  	%r21, %r29, -2146435073;
+	or.b32  	%r22, %r21, 1072693248;
+	mov.b64 	%fd57, {%r30, %r22};
+	setp.lt.s32	%p5, %r22, 1073127583;
+	@%p5 bra 	BB26_7;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd132;
+	mov.b64 	{%r23, %temp}, %fd57;
 	}
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd132;
+	mov.b64 	{%temp, %r24}, %fd57;
 	}
-	add.s32 	%r21, %r20, -1048576;
-	mov.b64 	%fd132, {%r19, %r21};
-	add.s32 	%r51, %r50, -1022;
+	add.s32 	%r25, %r24, -1048576;
+	mov.b64 	%fd57, {%r23, %r25};
+	add.s32 	%r32, %r32, 1;
 
-BB23_4:
-	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
+BB26_7:
+	add.f64 	%fd13, %fd57, 0d3FF0000000000000;
 	// inline asm
-	rcp.approx.ftz.f64 %fd15,%fd16;
+	rcp.approx.ftz.f64 %fd12,%fd13;
 	// inline asm
-	neg.f64 	%fd17, %fd16;
-	mov.f64 	%fd18, 0d3FF0000000000000;
-	fma.rn.f64 	%fd19, %fd17, %fd15, %fd18;
-	fma.rn.f64 	%fd20, %fd19, %fd19, %fd19;
-	fma.rn.f64 	%fd21, %fd20, %fd15, %fd15;
-	add.f64 	%fd22, %fd132, 0dBFF0000000000000;
-	mul.f64 	%fd23, %fd22, %fd21;
-	fma.rn.f64 	%fd24, %fd22, %fd21, %fd23;
-	mul.f64 	%fd25, %fd24, %fd24;
-	mov.f64 	%fd26, 0d3ED0F5D241AD3B5A;
-	mov.f64 	%fd27, 0d3EB0F5FF7D2CAFE2;
-	fma.rn.f64 	%fd28, %fd27, %fd25, %fd26;
-	mov.f64 	%fd29, 0d3EF3B20A75488A3F;
-	fma.rn.f64 	%fd30, %fd28, %fd25, %fd29;
+	neg.f64 	%fd14, %fd13;
+	mov.f64 	%fd15, 0d3FF0000000000000;
+	fma.rn.f64 	%fd16, %fd14, %fd12, %fd15;
+	fma.rn.f64 	%fd17, %fd16, %fd16, %fd16;
+	fma.rn.f64 	%fd18, %fd17, %fd12, %fd12;
+	add.f64 	%fd19, %fd57, 0dBFF0000000000000;
+	mul.f64 	%fd20, %fd19, %fd18;
+	fma.rn.f64 	%fd21, %fd19, %fd18, %fd20;
+	mul.f64 	%fd22, %fd21, %fd21;
+	mov.f64 	%fd23, 0d3ED0EE258B7A8B04;
+	mov.f64 	%fd24, 0d3EB1380B3AE80F1E;
+	fma.rn.f64 	%fd25, %fd24, %fd22, %fd23;
+	mov.f64 	%fd26, 0d3EF3B2669F02676F;
+	fma.rn.f64 	%fd27, %fd25, %fd22, %fd26;
+	mov.f64 	%fd28, 0d3F1745CBA9AB0956;
+	fma.rn.f64 	%fd29, %fd27, %fd22, %fd28;
+	mov.f64 	%fd30, 0d3F3C71C72D1B5154;
+	fma.rn.f64 	%fd31, %fd29, %fd22, %fd30;
+	mov.f64 	%fd32, 0d3F624924923BE72D;
+	fma.rn.f64 	%fd33, %fd31, %fd22, %fd32;
+	mov.f64 	%fd34, 0d3F8999999999A3C4;
+	fma.rn.f64 	%fd35, %fd33, %fd22, %fd34;
+	mov.f64 	%fd36, 0d3FB5555555555554;
+	fma.rn.f64 	%fd37, %fd35, %fd22, %fd36;
+	sub.f64 	%fd38, %fd19, %fd21;
+	add.f64 	%fd39, %fd38, %fd38;
+	neg.f64 	%fd40, %fd21;
+	fma.rn.f64 	%fd41, %fd40, %fd19, %fd39;
+	mul.f64 	%fd42, %fd18, %fd41;
+	mul.f64 	%fd43, %fd22, %fd37;
+	fma.rn.f64 	%fd44, %fd43, %fd21, %fd42;
+	xor.b32  	%r26, %r32, -2147483648;
+	mov.u32 	%r27, 1127219200;
+	mov.b64 	%fd45, {%r26, %r27};
+	mov.u32 	%r28, -2147483648;
+	mov.b64 	%fd46, {%r28, %r27};
+	sub.f64 	%fd47, %fd45, %fd46;
+	mov.f64 	%fd48, 0d3FE62E42FEFA39EF;
+	fma.rn.f64 	%fd49, %fd47, %fd48, %fd21;
+	neg.f64 	%fd50, %fd47;
+	fma.rn.f64 	%fd51, %fd50, %fd48, %fd49;
+	sub.f64 	%fd52, %fd51, %fd21;
+	sub.f64 	%fd53, %fd44, %fd52;
+	mov.f64 	%fd54, 0d3C7ABC9E3B39803F;
+	fma.rn.f64 	%fd55, %fd47, %fd54, %fd53;
+	add.f64 	%fd58, %fd49, %fd55;
+	bra.uni 	BB26_8;
+
+BB26_4:
+	mov.f64 	%fd10, 0d7FF0000000000000;
+	fma.rn.f64 	%fd11, %fd56, %fd10, %fd10;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r19}, %fd56;
+	}
+	mov.b32 	 %f1, %r19;
+	setp.eq.f32	%p4, %f1, 0f00000000;
+	selp.f64	%fd58, 0dFFF0000000000000, %fd11, %p4;
+
+BB26_8:
+	cvta.to.global.u64 	%rd7, %rd3;
+	shl.b64 	%rd8, %rd1, 3;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd58;
+
+BB26_9:
+	ret;
+}
+
+	// .globl	matrix_floor
+.visible .entry matrix_floor(
+	.param .u64 matrix_floor_param_0,
+	.param .u64 matrix_floor_param_1,
+	.param .u32 matrix_floor_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [matrix_floor_param_0];
+	ld.param.u64 	%rd2, [matrix_floor_param_1];
+	ld.param.u32 	%r2, [matrix_floor_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB27_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rmi.f64.f64	%fd2, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f64 	[%rd7], %fd2;
+
+BB27_2:
+	ret;
+}
+
+	// .globl	matrix_ceil
+.visible .entry matrix_ceil(
+	.param .u64 matrix_ceil_param_0,
+	.param .u64 matrix_ceil_param_1,
+	.param .u32 matrix_ceil_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [matrix_ceil_param_0];
+	ld.param.u64 	%rd2, [matrix_ceil_param_1];
+	ld.param.u32 	%r2, [matrix_ceil_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB28_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rpi.f64.f64	%fd2, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f64 	[%rd7], %fd2;
+
+BB28_2:
+	ret;
+}
+
+	// .globl	matrix_sin
+.visible .entry matrix_sin(
+	.param .u64 matrix_sin_param_0,
+	.param .u64 matrix_sin_param_1,
+	.param .u32 matrix_sin_param_2
+)
+{
+	.local .align 4 .b8 	__local_depot29[4];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<18>;
+	.reg .f64 	%fd<41>;
+	.reg .b64 	%rd<17>;
+
+
+	mov.u64 	%rd16, __local_depot29;
+	cvta.local.u64 	%SP, %rd16;
+	ld.param.u64 	%rd3, [matrix_sin_param_0];
+	ld.param.u64 	%rd4, [matrix_sin_param_1];
+	ld.param.u32 	%r5, [matrix_sin_param_2];
+	add.u64 	%rd5, %SP, 0;
+	cvta.to.local.u64 	%rd1, %rd5;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r7, %r8;
+	setp.ge.u32	%p1, %r1, %r5;
+	@%p1 bra 	BB29_11;
+
+	cvta.to.global.u64 	%rd6, %rd3;
+	cvt.s64.s32	%rd2, %r1;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	ld.global.f64 	%fd38, [%rd8];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r9}, %fd38;
+	}
+	and.b32  	%r10, %r9, 2147483647;
+	setp.ne.s32	%p2, %r10, 2146435072;
+	@%p2 bra 	BB29_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r11, %temp}, %fd38;
+	}
+	setp.ne.s32	%p3, %r11, 0;
+	@%p3 bra 	BB29_4;
+
+	mov.f64 	%fd14, 0d0000000000000000;
+	mul.rn.f64 	%fd38, %fd38, %fd14;
+
+BB29_4:
+	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
+	cvt.rni.s32.f64	%r17, %fd15;
+	st.local.u32 	[%rd1], %r17;
+	cvt.rn.f64.s32	%fd16, %r17;
+	neg.f64 	%fd17, %fd16;
+	mov.f64 	%fd18, 0d3FF921FB54442D18;
+	fma.rn.f64 	%fd19, %fd17, %fd18, %fd38;
+	mov.f64 	%fd20, 0d3C91A62633145C00;
+	fma.rn.f64 	%fd21, %fd17, %fd20, %fd19;
+	mov.f64 	%fd22, 0d397B839A252049C0;
+	fma.rn.f64 	%fd39, %fd17, %fd22, %fd21;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r12}, %fd38;
+	}
+	and.b32  	%r13, %r12, 2145386496;
+	setp.lt.u32	%p4, %r13, 1105199104;
+	@%p4 bra 	BB29_6;
+
+	// Callseq Start 3
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd38;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd5;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_trig_reduction_slowpathd, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd39, [retval0+0];
+	
+	//{
+	}// Callseq End 3
+	ld.local.u32 	%r17, [%rd1];
+
+BB29_6:
+	and.b32  	%r14, %r17, 1;
+	shl.b32 	%r15, %r14, 3;
+	setp.eq.s32	%p5, %r14, 0;
+	selp.f64	%fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+	mul.wide.u32 	%rd10, %r15, 8;
+	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
+	add.s64 	%rd12, %rd10, %rd11;
+	ld.const.f64 	%fd24, [%rd12+8];
+	mul.rn.f64 	%fd7, %fd39, %fd39;
+	fma.rn.f64 	%fd25, %fd23, %fd7, %fd24;
+	ld.const.f64 	%fd26, [%rd12+16];
+	fma.rn.f64 	%fd27, %fd25, %fd7, %fd26;
+	ld.const.f64 	%fd28, [%rd12+24];
+	fma.rn.f64 	%fd29, %fd27, %fd7, %fd28;
+	ld.const.f64 	%fd30, [%rd12+32];
+	fma.rn.f64 	%fd31, %fd29, %fd7, %fd30;
+	ld.const.f64 	%fd32, [%rd12+40];
+	fma.rn.f64 	%fd33, %fd31, %fd7, %fd32;
+	ld.const.f64 	%fd34, [%rd12+48];
+	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
+	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
+	@%p5 bra 	BB29_8;
+
+	mov.f64 	%fd35, 0d3FF0000000000000;
+	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
+
+BB29_8:
+	and.b32  	%r16, %r17, 2;
+	setp.eq.s32	%p6, %r16, 0;
+	@%p6 bra 	BB29_10;
+
+	mov.f64 	%fd36, 0d0000000000000000;
+	mov.f64 	%fd37, 0dBFF0000000000000;
+	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
+
+BB29_10:
+	cvta.to.global.u64 	%rd13, %rd4;
+	shl.b64 	%rd14, %rd2, 3;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd40;
+
+BB29_11:
+	ret;
+}
+
+	// .globl	matrix_cos
+.visible .entry matrix_cos(
+	.param .u64 matrix_cos_param_0,
+	.param .u64 matrix_cos_param_1,
+	.param .u32 matrix_cos_param_2
+)
+{
+	.local .align 4 .b8 	__local_depot30[4];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<19>;
+	.reg .f64 	%fd<41>;
+	.reg .b64 	%rd<17>;
+
+
+	mov.u64 	%rd16, __local_depot30;
+	cvta.local.u64 	%SP, %rd16;
+	ld.param.u64 	%rd3, [matrix_cos_param_0];
+	ld.param.u64 	%rd4, [matrix_cos_param_1];
+	ld.param.u32 	%r6, [matrix_cos_param_2];
+	add.u64 	%rd5, %SP, 0;
+	cvta.to.local.u64 	%rd1, %rd5;
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB30_11;
+
+	cvta.to.global.u64 	%rd6, %rd3;
+	cvt.s64.s32	%rd2, %r1;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	ld.global.f64 	%fd38, [%rd8];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r10}, %fd38;
+	}
+	and.b32  	%r11, %r10, 2147483647;
+	setp.ne.s32	%p2, %r11, 2146435072;
+	@%p2 bra 	BB30_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r12, %temp}, %fd38;
+	}
+	setp.ne.s32	%p3, %r12, 0;
+	@%p3 bra 	BB30_4;
+
+	mov.f64 	%fd14, 0d0000000000000000;
+	mul.rn.f64 	%fd38, %fd38, %fd14;
+
+BB30_4:
+	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
+	cvt.rni.s32.f64	%r18, %fd15;
+	st.local.u32 	[%rd1], %r18;
+	cvt.rn.f64.s32	%fd16, %r18;
+	neg.f64 	%fd17, %fd16;
+	mov.f64 	%fd18, 0d3FF921FB54442D18;
+	fma.rn.f64 	%fd19, %fd17, %fd18, %fd38;
+	mov.f64 	%fd20, 0d3C91A62633145C00;
+	fma.rn.f64 	%fd21, %fd17, %fd20, %fd19;
+	mov.f64 	%fd22, 0d397B839A252049C0;
+	fma.rn.f64 	%fd39, %fd17, %fd22, %fd21;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r13}, %fd38;
+	}
+	and.b32  	%r14, %r13, 2145386496;
+	setp.lt.u32	%p4, %r14, 1105199104;
+	@%p4 bra 	BB30_6;
+
+	// Callseq Start 4
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd38;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd5;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_trig_reduction_slowpathd, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd39, [retval0+0];
+	
+	//{
+	}// Callseq End 4
+	ld.local.u32 	%r18, [%rd1];
+
+BB30_6:
+	add.s32 	%r5, %r18, 1;
+	and.b32  	%r15, %r5, 1;
+	shl.b32 	%r16, %r15, 3;
+	setp.eq.s32	%p5, %r15, 0;
+	selp.f64	%fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+	mul.wide.u32 	%rd10, %r16, 8;
+	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
+	add.s64 	%rd12, %rd10, %rd11;
+	ld.const.f64 	%fd24, [%rd12+8];
+	mul.rn.f64 	%fd7, %fd39, %fd39;
+	fma.rn.f64 	%fd25, %fd23, %fd7, %fd24;
+	ld.const.f64 	%fd26, [%rd12+16];
+	fma.rn.f64 	%fd27, %fd25, %fd7, %fd26;
+	ld.const.f64 	%fd28, [%rd12+24];
+	fma.rn.f64 	%fd29, %fd27, %fd7, %fd28;
+	ld.const.f64 	%fd30, [%rd12+32];
+	fma.rn.f64 	%fd31, %fd29, %fd7, %fd30;
+	ld.const.f64 	%fd32, [%rd12+40];
+	fma.rn.f64 	%fd33, %fd31, %fd7, %fd32;
+	ld.const.f64 	%fd34, [%rd12+48];
+	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
+	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
+	@%p5 bra 	BB30_8;
+
+	mov.f64 	%fd35, 0d3FF0000000000000;
+	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
+
+BB30_8:
+	and.b32  	%r17, %r5, 2;
+	setp.eq.s32	%p6, %r17, 0;
+	@%p6 bra 	BB30_10;
+
+	mov.f64 	%fd36, 0d0000000000000000;
+	mov.f64 	%fd37, 0dBFF0000000000000;
+	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
+
+BB30_10:
+	cvta.to.global.u64 	%rd13, %rd4;
+	shl.b64 	%rd14, %rd2, 3;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd40;
+
+BB30_11:
+	ret;
+}
+
+	// .globl	matrix_tan
+.visible .entry matrix_tan(
+	.param .u64 matrix_tan_param_0,
+	.param .u64 matrix_tan_param_1,
+	.param .u32 matrix_tan_param_2
+)
+{
+	.local .align 4 .b8 	__local_depot31[4];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<6>;
+	.reg .b32 	%r<16>;
+	.reg .f64 	%fd<66>;
+	.reg .b64 	%rd<14>;
+
+
+	mov.u64 	%rd13, __local_depot31;
+	cvta.local.u64 	%SP, %rd13;
+	ld.param.u64 	%rd3, [matrix_tan_param_0];
+	ld.param.u64 	%rd4, [matrix_tan_param_1];
+	ld.param.u32 	%r5, [matrix_tan_param_2];
+	add.u64 	%rd5, %SP, 0;
+	cvta.to.local.u64 	%rd1, %rd5;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r7, %r8;
+	setp.ge.u32	%p1, %r1, %r5;
+	@%p1 bra 	BB31_9;
+
+	cvta.to.global.u64 	%rd6, %rd3;
+	cvt.s64.s32	%rd2, %r1;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	ld.global.f64 	%fd63, [%rd8];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r9}, %fd63;
+	}
+	and.b32  	%r10, %r9, 2147483647;
+	setp.ne.s32	%p2, %r10, 2146435072;
+	@%p2 bra 	BB31_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r11, %temp}, %fd63;
+	}
+	setp.ne.s32	%p3, %r11, 0;
+	@%p3 bra 	BB31_4;
+
+	mov.f64 	%fd11, 0d0000000000000000;
+	mul.rn.f64 	%fd63, %fd63, %fd11;
+
+BB31_4:
+	mul.f64 	%fd12, %fd63, 0d3FE45F306DC9C883;
+	cvt.rni.s32.f64	%r15, %fd12;
+	st.local.u32 	[%rd1], %r15;
+	cvt.rn.f64.s32	%fd13, %r15;
+	neg.f64 	%fd14, %fd13;
+	mov.f64 	%fd15, 0d3FF921FB54442D18;
+	fma.rn.f64 	%fd16, %fd14, %fd15, %fd63;
+	mov.f64 	%fd17, 0d3C91A62633145C00;
+	fma.rn.f64 	%fd18, %fd14, %fd17, %fd16;
+	mov.f64 	%fd19, 0d397B839A252049C0;
+	fma.rn.f64 	%fd64, %fd14, %fd19, %fd18;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r12}, %fd63;
+	}
+	and.b32  	%r13, %r12, 2145386496;
+	setp.lt.u32	%p4, %r13, 1105199104;
+	@%p4 bra 	BB31_6;
+
+	// Callseq Start 5
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd63;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd5;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_trig_reduction_slowpathd, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd64, [retval0+0];
+	
+	//{
+	}// Callseq End 5
+	ld.local.u32 	%r15, [%rd1];
+
+BB31_6:
+	mul.f64 	%fd20, %fd64, %fd64;
+	mov.f64 	%fd21, 0dBEF9757C5B27EBB1;
+	mov.f64 	%fd22, 0d3EE48DAC2799BCB9;
+	fma.rn.f64 	%fd23, %fd22, %fd20, %fd21;
+	mov.f64 	%fd24, 0d3F0980E90FD91E04;
+	fma.rn.f64 	%fd25, %fd23, %fd20, %fd24;
+	mov.f64 	%fd26, 0dBEFAE2B0417D7E1D;
+	fma.rn.f64 	%fd27, %fd25, %fd20, %fd26;
+	mov.f64 	%fd28, 0d3F119F5341BFBA57;
+	fma.rn.f64 	%fd29, %fd27, %fd20, %fd28;
+	mov.f64 	%fd30, 0d3F15E791A00F6919;
+	fma.rn.f64 	%fd31, %fd29, %fd20, %fd30;
+	mov.f64 	%fd32, 0d3F2FF2E7FADEC73A;
+	fma.rn.f64 	%fd33, %fd31, %fd20, %fd32;
+	mov.f64 	%fd34, 0d3F434BC1B206DA62;
+	fma.rn.f64 	%fd35, %fd33, %fd20, %fd34;
+	mov.f64 	%fd36, 0d3F57DB18EF2F83F9;
+	fma.rn.f64 	%fd37, %fd35, %fd20, %fd36;
+	mov.f64 	%fd38, 0d3F6D6D2E7AE49FBC;
+	fma.rn.f64 	%fd39, %fd37, %fd20, %fd38;
+	mov.f64 	%fd40, 0d3F8226E3A816A776;
+	fma.rn.f64 	%fd41, %fd39, %fd20, %fd40;
+	mov.f64 	%fd42, 0d3F9664F485D25660;
+	fma.rn.f64 	%fd43, %fd41, %fd20, %fd42;
+	mov.f64 	%fd44, 0d3FABA1BA1BABF31D;
+	fma.rn.f64 	%fd45, %fd43, %fd20, %fd44;
+	mov.f64 	%fd46, 0d3FC11111111105D2;
+	fma.rn.f64 	%fd47, %fd45, %fd20, %fd46;
+	mov.f64 	%fd48, 0d3FD555555555555E;
+	fma.rn.f64 	%fd49, %fd47, %fd20, %fd48;
+	mul.f64 	%fd7, %fd20, %fd49;
+	fma.rn.f64 	%fd65, %fd7, %fd64, %fd64;
+	and.b32  	%r14, %r15, 1;
+	setp.eq.b32	%p5, %r14, 1;
+	@!%p5 bra 	BB31_8;
+	bra.uni 	BB31_7;
+
+BB31_7:
+	sub.f64 	%fd52, %fd65, %fd64;
+	neg.f64 	%fd53, %fd52;
+	fma.rn.f64 	%fd54, %fd7, %fd64, %fd53;
+	// inline asm
+	rcp.approx.ftz.f64 %fd50,%fd65;
+	// inline asm
+	neg.f64 	%fd55, %fd65;
+	mov.f64 	%fd56, 0d3FF0000000000000;
+	fma.rn.f64 	%fd57, %fd55, %fd50, %fd56;
+	fma.rn.f64 	%fd58, %fd57, %fd57, %fd57;
+	fma.rn.f64 	%fd59, %fd58, %fd50, %fd50;
+	neg.f64 	%fd60, %fd59;
+	fma.rn.f64 	%fd61, %fd65, %fd60, %fd56;
+	fma.rn.f64 	%fd62, %fd60, %fd54, %fd61;
+	fma.rn.f64 	%fd65, %fd62, %fd60, %fd60;
+
+BB31_8:
+	cvta.to.global.u64 	%rd10, %rd4;
+	shl.b64 	%rd11, %rd2, 3;
+	add.s64 	%rd12, %rd10, %rd11;
+	st.global.f64 	[%rd12], %fd65;
+
+BB31_9:
+	ret;
+}
+
+	// .globl	matrix_asin
+.visible .entry matrix_asin(
+	.param .u64 matrix_asin_param_0,
+	.param .u64 matrix_asin_param_1,
+	.param .u32 matrix_asin_param_2
+)
+{
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<15>;
+	.reg .f64 	%fd<83>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_asin_param_0];
+	ld.param.u64 	%rd3, [matrix_asin_param_1];
+	ld.param.u32 	%r3, [matrix_asin_param_2];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	setp.ge.u32	%p1, %r1, %r3;
+	@%p1 bra 	BB32_5;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd1;
+	}
+	mov.b32 	 %f1, %r2;
+	abs.f32 	%f2, %f1;
+	setp.lt.f32	%p2, %f2, 0f3FE26666;
+	@%p2 bra 	BB32_3;
+	bra.uni 	BB32_2;
+
+BB32_3:
+	mul.f64 	%fd55, %fd1, %fd1;
+	mov.f64 	%fd56, 0dBFB3823B180754AF;
+	mov.f64 	%fd57, 0d3FB0066BDC1895E9;
+	fma.rn.f64 	%fd58, %fd57, %fd55, %fd56;
+	mov.f64 	%fd59, 0d3FB11E52CC2F79AE;
+	fma.rn.f64 	%fd60, %fd58, %fd55, %fd59;
+	mov.f64 	%fd61, 0dBF924EAF3526861B;
+	fma.rn.f64 	%fd62, %fd60, %fd55, %fd61;
+	mov.f64 	%fd63, 0d3F91DF02A31E6CB7;
+	fma.rn.f64 	%fd64, %fd62, %fd55, %fd63;
+	mov.f64 	%fd65, 0d3F847D18B0EEC6CC;
+	fma.rn.f64 	%fd66, %fd64, %fd55, %fd65;
+	mov.f64 	%fd67, 0d3F8D0AF961BA53B0;
+	fma.rn.f64 	%fd68, %fd66, %fd55, %fd67;
+	mov.f64 	%fd69, 0d3F91BF7734CF1C48;
+	fma.rn.f64 	%fd70, %fd68, %fd55, %fd69;
+	mov.f64 	%fd71, 0d3F96E91483144EF7;
+	fma.rn.f64 	%fd72, %fd70, %fd55, %fd71;
+	mov.f64 	%fd73, 0d3F9F1C6E0A4F9F81;
+	fma.rn.f64 	%fd74, %fd72, %fd55, %fd73;
+	mov.f64 	%fd75, 0d3FA6DB6DC27FA92B;
+	fma.rn.f64 	%fd76, %fd74, %fd55, %fd75;
+	mov.f64 	%fd77, 0d3FB333333320F91B;
+	fma.rn.f64 	%fd78, %fd76, %fd55, %fd77;
+	mov.f64 	%fd79, 0d3FC5555555555F4D;
+	fma.rn.f64 	%fd80, %fd78, %fd55, %fd79;
+	mul.f64 	%fd81, %fd55, %fd80;
+	fma.rn.f64 	%fd82, %fd81, %fd1, %fd1;
+	bra.uni 	BB32_4;
+
+BB32_2:
+	abs.f64 	%fd7, %fd1;
+	mov.f64 	%fd8, 0d3FE0000000000000;
+	mov.f64 	%fd9, 0dBFE0000000000000;
+	fma.rn.f64 	%fd6, %fd9, %fd7, %fd8;
+	// inline asm
+	rsqrt.approx.ftz.f64 %fd5, %fd6;
+	// inline asm
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r7, %temp}, %fd5;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd5;
+	}
+	add.s32 	%r9, %r8, -1048576;
+	mov.b64 	%fd10, {%r7, %r9};
+	mul.f64 	%fd11, %fd6, %fd5;
+	neg.f64 	%fd12, %fd11;
+	fma.rn.f64 	%fd13, %fd11, %fd12, %fd6;
+	fma.rn.f64 	%fd14, %fd13, %fd10, %fd11;
+	neg.f64 	%fd15, %fd14;
+	mov.f64 	%fd16, 0d3FF0000000000000;
+	fma.rn.f64 	%fd17, %fd5, %fd15, %fd16;
+	fma.rn.f64 	%fd18, %fd17, %fd10, %fd10;
+	fma.rn.f64 	%fd19, %fd14, %fd15, %fd6;
+	fma.rn.f64 	%fd20, %fd19, %fd18, %fd14;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r10}, %fd6;
+	}
+	setp.lt.s32	%p3, %r10, 0;
+	selp.f64	%fd21, 0dFFF8000000000000, %fd20, %p3;
+	setp.equ.f64	%p4, %fd6, 0d0000000000000000;
+	selp.f64	%fd22, %fd6, %fd21, %p4;
+	mov.f64 	%fd23, 0dBFB3823B180754AF;
+	mov.f64 	%fd24, 0d3FB0066BDC1895E9;
+	fma.rn.f64 	%fd25, %fd24, %fd6, %fd23;
+	mov.f64 	%fd26, 0d3FB11E52CC2F79AE;
+	fma.rn.f64 	%fd27, %fd25, %fd6, %fd26;
+	mov.f64 	%fd28, 0dBF924EAF3526861B;
+	fma.rn.f64 	%fd29, %fd27, %fd6, %fd28;
+	mov.f64 	%fd30, 0d3F91DF02A31E6CB7;
+	fma.rn.f64 	%fd31, %fd29, %fd6, %fd30;
+	mov.f64 	%fd32, 0d3F847D18B0EEC6CC;
+	fma.rn.f64 	%fd33, %fd31, %fd6, %fd32;
+	mov.f64 	%fd34, 0d3F8D0AF961BA53B0;
+	fma.rn.f64 	%fd35, %fd33, %fd6, %fd34;
+	mov.f64 	%fd36, 0d3F91BF7734CF1C48;
+	fma.rn.f64 	%fd37, %fd35, %fd6, %fd36;
+	mov.f64 	%fd38, 0d3F96E91483144EF7;
+	fma.rn.f64 	%fd39, %fd37, %fd6, %fd38;
+	mov.f64 	%fd40, 0d3F9F1C6E0A4F9F81;
+	fma.rn.f64 	%fd41, %fd39, %fd6, %fd40;
+	mov.f64 	%fd42, 0d3FA6DB6DC27FA92B;
+	fma.rn.f64 	%fd43, %fd41, %fd6, %fd42;
+	mov.f64 	%fd44, 0d3FB333333320F91B;
+	fma.rn.f64 	%fd45, %fd43, %fd6, %fd44;
+	mov.f64 	%fd46, 0d3FC5555555555F4D;
+	fma.rn.f64 	%fd47, %fd45, %fd6, %fd46;
+	mul.f64 	%fd48, %fd6, %fd47;
+	mul.f64 	%fd49, %fd22, 0dC000000000000000;
+	mov.f64 	%fd50, 0d3C91A62633145C07;
+	fma.rn.f64 	%fd51, %fd49, %fd48, %fd50;
+	add.f64 	%fd52, %fd49, 0d3FE921FB54442D18;
+	add.f64 	%fd53, %fd52, %fd51;
+	add.f64 	%fd54, %fd53, 0d3FE921FB54442D18;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r11, %temp}, %fd54;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r12}, %fd54;
+	}
+	and.b32  	%r13, %r2, -2147483648;
+	or.b32  	%r14, %r12, %r13;
+	mov.b64 	%fd82, {%r11, %r14};
+
+BB32_4:
+	cvta.to.global.u64 	%rd7, %rd3;
+	shl.b64 	%rd8, %rd1, 3;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd82;
+
+BB32_5:
+	ret;
+}
+
+	// .globl	matrix_acos
+.visible .entry matrix_acos(
+	.param .u64 matrix_acos_param_0,
+	.param .u64 matrix_acos_param_1,
+	.param .u32 matrix_acos_param_2
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<17>;
+	.reg .f64 	%fd<95>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_acos_param_0];
+	ld.param.u64 	%rd3, [matrix_acos_param_1];
+	ld.param.u32 	%r4, [matrix_acos_param_2];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	setp.ge.u32	%p1, %r1, %r4;
+	@%p1 bra 	BB33_14;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd16, [%rd6];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd16;
+	}
+	abs.f64 	%fd1, %fd16;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd1;
+	}
+	setp.lt.s32	%p2, %r8, 1071801958;
+	@%p2 bra 	BB33_9;
+	bra.uni 	BB33_2;
+
+BB33_9:
+	mul.f64 	%fd62, %fd1, %fd1;
+	mov.f64 	%fd63, 0dBFB3823B180754AF;
+	mov.f64 	%fd64, 0d3FB0066BDC1895E9;
+	fma.rn.f64 	%fd65, %fd64, %fd62, %fd63;
+	mov.f64 	%fd66, 0d3FB11E52CC2F79AE;
+	fma.rn.f64 	%fd67, %fd65, %fd62, %fd66;
+	mov.f64 	%fd68, 0dBF924EAF3526861B;
+	fma.rn.f64 	%fd69, %fd67, %fd62, %fd68;
+	mov.f64 	%fd70, 0d3F91DF02A31E6CB7;
+	fma.rn.f64 	%fd71, %fd69, %fd62, %fd70;
+	mov.f64 	%fd72, 0d3F847D18B0EEC6CC;
+	fma.rn.f64 	%fd73, %fd71, %fd62, %fd72;
+	mov.f64 	%fd74, 0d3F8D0AF961BA53B0;
+	fma.rn.f64 	%fd75, %fd73, %fd62, %fd74;
+	mov.f64 	%fd76, 0d3F91BF7734CF1C48;
+	fma.rn.f64 	%fd77, %fd75, %fd62, %fd76;
+	mov.f64 	%fd78, 0d3F96E91483144EF7;
+	fma.rn.f64 	%fd79, %fd77, %fd62, %fd78;
+	mov.f64 	%fd80, 0d3F9F1C6E0A4F9F81;
+	fma.rn.f64 	%fd81, %fd79, %fd62, %fd80;
+	mov.f64 	%fd82, 0d3FA6DB6DC27FA92B;
+	fma.rn.f64 	%fd83, %fd81, %fd62, %fd82;
+	mov.f64 	%fd84, 0d3FB333333320F91B;
+	fma.rn.f64 	%fd85, %fd83, %fd62, %fd84;
+	mov.f64 	%fd86, 0d3FC5555555555F4D;
+	fma.rn.f64 	%fd87, %fd85, %fd62, %fd86;
+	mul.f64 	%fd88, %fd62, %fd87;
+	fma.rn.f64 	%fd10, %fd88, %fd1, %fd1;
+	setp.lt.s32	%p6, %r2, 0;
+	@%p6 bra 	BB33_11;
+
+	mov.f64 	%fd89, 0dBC91A62633145C07;
+	add.rn.f64 	%fd90, %fd10, %fd89;
+	neg.f64 	%fd93, %fd90;
+	bra.uni 	BB33_12;
+
+BB33_2:
+	mov.f64 	%fd19, 0d3FF0000000000000;
+	sub.f64 	%fd2, %fd19, %fd1;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r9, %temp}, %fd2;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd2;
+	}
+	add.s32 	%r10, %r3, -1048576;
+	mov.b64 	%fd18, {%r9, %r10};
+	// inline asm
+	rsqrt.approx.ftz.f64 %fd17, %fd18;
+	// inline asm
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r11, %temp}, %fd17;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r12}, %fd17;
+	}
+	add.s32 	%r13, %r12, -1048576;
+	mov.b64 	%fd20, {%r11, %r13};
+	mul.f64 	%fd21, %fd18, %fd17;
+	neg.f64 	%fd22, %fd21;
+	fma.rn.f64 	%fd23, %fd21, %fd22, %fd18;
+	fma.rn.f64 	%fd24, %fd23, %fd20, %fd21;
+	neg.f64 	%fd25, %fd24;
+	fma.rn.f64 	%fd26, %fd17, %fd25, %fd19;
+	fma.rn.f64 	%fd27, %fd26, %fd20, %fd20;
+	fma.rn.f64 	%fd28, %fd24, %fd25, %fd18;
+	fma.rn.f64 	%fd3, %fd28, %fd27, %fd24;
+	setp.lt.s32	%p3, %r3, 1;
+	@%p3 bra 	BB33_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r14, %temp}, %fd3;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r15}, %fd3;
+	}
+	add.s32 	%r16, %r15, 1048576;
+	mov.b64 	%fd29, {%r14, %r16};
+	mov.f64 	%fd30, 0dBEBAC2FE66FAAC4B;
+	mov.f64 	%fd31, 0d3EC715B371155F70;
+	fma.rn.f64 	%fd32, %fd31, %fd2, %fd30;
+	mov.f64 	%fd33, 0d3ED9A9B88EFCD9B8;
+	fma.rn.f64 	%fd34, %fd32, %fd2, %fd33;
+	mov.f64 	%fd35, 0d3EDD0F40A8A0C4C3;
+	fma.rn.f64 	%fd36, %fd34, %fd2, %fd35;
+	mov.f64 	%fd37, 0d3EF46D4CFA9E0E1F;
+	fma.rn.f64 	%fd38, %fd36, %fd2, %fd37;
+	mov.f64 	%fd39, 0d3F079C168D1E2422;
+	fma.rn.f64 	%fd40, %fd38, %fd2, %fd39;
+	mov.f64 	%fd41, 0d3F1C9A88C3BCA540;
+	fma.rn.f64 	%fd42, %fd40, %fd2, %fd41;
+	mov.f64 	%fd43, 0d3F31C4E64BD476DF;
+	fma.rn.f64 	%fd44, %fd42, %fd2, %fd43;
+	mov.f64 	%fd45, 0d3F46E8BA60009C8F;
+	fma.rn.f64 	%fd46, %fd44, %fd2, %fd45;
+	mov.f64 	%fd47, 0d3F5F1C71C62B05A2;
+	fma.rn.f64 	%fd48, %fd46, %fd2, %fd47;
+	mov.f64 	%fd49, 0d3F76DB6DB6DC9F2C;
+	fma.rn.f64 	%fd50, %fd48, %fd2, %fd49;
+	mov.f64 	%fd51, 0d3F9333333333329C;
+	fma.rn.f64 	%fd52, %fd50, %fd2, %fd51;
+	mov.f64 	%fd53, 0d3FB5555555555555;
+	fma.rn.f64 	%fd54, %fd52, %fd2, %fd53;
+	mul.f64 	%fd55, %fd2, %fd54;
+	fma.rn.f64 	%fd94, %fd55, %fd29, %fd29;
+	bra.uni 	BB33_5;
+
+BB33_11:
+	mov.f64 	%fd91, 0d3C91A62633145C07;
+	add.rn.f64 	%fd93, %fd10, %fd91;
+
+BB33_12:
+	mov.f64 	%fd92, 0d3FF921FB54442D18;
+	add.rn.f64 	%fd94, %fd92, %fd93;
+	bra.uni 	BB33_13;
+
+BB33_4:
+	mov.f64 	%fd56, 0d0000000000000000;
+	mul.rn.f64 	%fd94, %fd1, %fd56;
+
+BB33_5:
+	setp.gt.s32	%p4, %r3, -1;
+	@%p4 bra 	BB33_7;
+
+	mov.f64 	%fd57, 0d7FF0000000000000;
+	mul.rn.f64 	%fd94, %fd94, %fd57;
+
+BB33_7:
+	setp.gt.s32	%p5, %r2, -1;
+	@%p5 bra 	BB33_13;
+
+	mov.f64 	%fd58, 0dBCA1A62633145C07;
+	add.rn.f64 	%fd59, %fd94, %fd58;
+	neg.f64 	%fd60, %fd59;
+	mov.f64 	%fd61, 0d400921FB54442D18;
+	add.rn.f64 	%fd94, %fd61, %fd60;
+
+BB33_13:
+	cvta.to.global.u64 	%rd7, %rd3;
+	shl.b64 	%rd8, %rd1, 3;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd94;
+
+BB33_14:
+	ret;
+}
+
+	// .globl	matrix_atan
+.visible .entry matrix_atan(
+	.param .u64 matrix_atan_param_0,
+	.param .u64 matrix_atan_param_1,
+	.param .u32 matrix_atan_param_2
+)
+{
+	.reg .pred 	%p<5>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<57>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_atan_param_0];
+	ld.param.u64 	%rd3, [matrix_atan_param_1];
+	ld.param.u32 	%r2, [matrix_atan_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB34_4;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];
+	abs.f64 	%fd2, %fd1;
+	setp.leu.f64	%p2, %fd2, 0d3FF0000000000000;
+	mov.f64 	%fd56, %fd2;
+	@%p2 bra 	BB34_3;
+
+	// inline asm
+	rcp.approx.ftz.f64 %fd5,%fd2;
+	// inline asm
+	neg.f64 	%fd7, %fd2;
+	mov.f64 	%fd8, 0d3FF0000000000000;
+	fma.rn.f64 	%fd9, %fd7, %fd5, %fd8;
+	fma.rn.f64 	%fd10, %fd9, %fd9, %fd9;
+	fma.rn.f64 	%fd11, %fd10, %fd5, %fd5;
+	setp.eq.f64	%p3, %fd2, 0d7FF0000000000000;
+	selp.f64	%fd3, 0d0000000000000000, %fd11, %p3;
+	mov.f64 	%fd56, %fd3;
+
+BB34_3:
+	mov.f64 	%fd4, %fd56;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.f64 	%fd12, %fd4, %fd4;
+	mov.f64 	%fd13, 0d3F2D3B63DBB65B49;
+	mov.f64 	%fd14, 0dBEF53E1D2A25FF7E;
+	fma.rn.f64 	%fd15, %fd14, %fd12, %fd13;
+	mov.f64 	%fd16, 0dBF5312788DDE082E;
+	fma.rn.f64 	%fd17, %fd15, %fd12, %fd16;
+	mov.f64 	%fd18, 0d3F6F9690C8249315;
+	fma.rn.f64 	%fd19, %fd17, %fd12, %fd18;
+	mov.f64 	%fd20, 0dBF82CF5AABC7CF0D;
+	fma.rn.f64 	%fd21, %fd19, %fd12, %fd20;
+	mov.f64 	%fd22, 0d3F9162B0B2A3BFDE;
+	fma.rn.f64 	%fd23, %fd21, %fd12, %fd22;
+	mov.f64 	%fd24, 0dBF9A7256FEB6FC6B;
+	fma.rn.f64 	%fd25, %fd23, %fd12, %fd24;
+	mov.f64 	%fd26, 0d3FA171560CE4A489;
+	fma.rn.f64 	%fd27, %fd25, %fd12, %fd26;
+	mov.f64 	%fd28, 0dBFA4F44D841450E4;
+	fma.rn.f64 	%fd29, %fd27, %fd12, %fd28;
+	mov.f64 	%fd30, 0d3FA7EE3D3F36BB95;
+	fma.rn.f64 	%fd31, %fd29, %fd12, %fd30;
+	mov.f64 	%fd32, 0dBFAAD32AE04A9FD1;
+	fma.rn.f64 	%fd33, %fd31, %fd12, %fd32;
+	mov.f64 	%fd34, 0d3FAE17813D66954F;
+	fma.rn.f64 	%fd35, %fd33, %fd12, %fd34;
+	mov.f64 	%fd36, 0dBFB11089CA9A5BCD;
+	fma.rn.f64 	%fd37, %fd35, %fd12, %fd36;
+	mov.f64 	%fd38, 0d3FB3B12B2DB51738;
+	fma.rn.f64 	%fd39, %fd37, %fd12, %fd38;
+	mov.f64 	%fd40, 0dBFB745D022F8DC5C;
+	fma.rn.f64 	%fd41, %fd39, %fd12, %fd40;
+	mov.f64 	%fd42, 0d3FBC71C709DFE927;
+	fma.rn.f64 	%fd43, %fd41, %fd12, %fd42;
+	mov.f64 	%fd44, 0dBFC2492491FA1744;
+	fma.rn.f64 	%fd45, %fd43, %fd12, %fd44;
+	mov.f64 	%fd46, 0d3FC99999999840D2;
+	fma.rn.f64 	%fd47, %fd45, %fd12, %fd46;
+	mov.f64 	%fd48, 0dBFD555555555544C;
+	fma.rn.f64 	%fd49, %fd47, %fd12, %fd48;
+	mul.f64 	%fd50, %fd12, %fd49;
+	fma.rn.f64 	%fd51, %fd50, %fd4, %fd4;
+	mov.f64 	%fd52, 0d3FF921FB54442D18;
+	sub.f64 	%fd53, %fd52, %fd51;
+	setp.gt.f64	%p4, %fd2, 0d3FF0000000000000;
+	selp.f64	%fd54, %fd53, %fd51, %p4;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r6, %temp}, %fd54;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r7}, %fd54;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd1;
+	}
+	and.b32  	%r9, %r8, -2147483648;
+	or.b32  	%r10, %r7, %r9;
+	mov.b64 	%fd55, {%r6, %r10};
+	shl.b64 	%rd8, %rd1, 3;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd55;
+
+BB34_4:
+	ret;
+}
+
+	// .globl	matrix_sign
+.visible .entry matrix_sign(
+	.param .u64 matrix_sign_param_0,
+	.param .u64 matrix_sign_param_1,
+	.param .u32 matrix_sign_param_2
+)
+{
+	.reg .pred 	%p<3>;
+	.reg .b32 	%r<12>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [matrix_sign_param_0];
+	ld.param.u64 	%rd3, [matrix_sign_param_1];
+	ld.param.u32 	%r2, [matrix_sign_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB35_4;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];
+	setp.eq.f64	%p2, %fd1, 0d0000000000000000;
+	cvta.to.global.u64 	%rd7, %rd3;
+	add.s64 	%rd1, %rd7, %rd5;
+	@%p2 bra 	BB35_3;
+	bra.uni 	BB35_2;
+
+BB35_3:
+	mov.u64 	%rd8, 0;
+	st.global.u64 	[%rd1], %rd8;
+	bra.uni 	BB35_4;
+
+BB35_2:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r6}, %fd1;
+	}
+	and.b32  	%r7, %r6, -2147483648;
+	mov.f64 	%fd2, 0d3FF0000000000000;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd2;
+	}
+	and.b32  	%r9, %r8, 2147483647;
+	or.b32  	%r10, %r9, %r7;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r11, %temp}, %fd2;
+	}
+	mov.b64 	%fd3, {%r11, %r10};
+	st.global.f64 	[%rd1], %fd3;
+
+BB35_4:
+	ret;
+}
+
+.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
+	.param .b64 __internal_trig_reduction_slowpathd_param_0,
+	.param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+{
+	.local .align 8 .b8 	__local_depot36[40];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<9>;
+	.reg .b32 	%r<42>;
+	.reg .f64 	%fd<5>;
+	.reg .b64 	%rd<101>;
+
+
+	mov.u64 	%rd100, __local_depot36;
+	cvta.local.u64 	%SP, %rd100;
+	ld.param.f64 	%fd4, [__internal_trig_reduction_slowpathd_param_0];
+	ld.param.u64 	%rd37, [__internal_trig_reduction_slowpathd_param_1];
+	add.u64 	%rd38, %SP, 0;
+	cvta.to.local.u64 	%rd1, %rd38;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r1}, %fd4;
+	}
+	and.b32  	%r40, %r1, -2147483648;
+	shr.u32 	%r3, %r1, 20;
+	bfe.u32 	%r4, %r1, 20, 11;
+	setp.eq.s32	%p1, %r4, 2047;
+	@%p1 bra 	BB36_13;
+
+	add.s32 	%r16, %r4, -1024;
+	shr.u32 	%r17, %r16, 6;
+	mov.u32 	%r18, 16;
+	sub.s32 	%r5, %r18, %r17;
+	mov.u32 	%r19, 19;
+	sub.s32 	%r20, %r19, %r17;
+	mov.u32 	%r21, 18;
+	min.s32 	%r6, %r21, %r20;
+	setp.gt.s32	%p2, %r5, %r6;
+	mov.u64 	%rd94, 0;
+	mov.u64 	%rd93, %rd1;
+	@%p2 bra 	BB36_4;
+
+	mov.b64 	 %rd41, %fd4;
+	shl.b64 	%rd42, %rd41, 11;
+	or.b64  	%rd3, %rd42, -9223372036854775808;
+	add.s32 	%r7, %r5, -1;
+	mov.u64 	%rd92, %rd1;
+	bfe.u32 	%r22, %r1, 20, 11;
+	add.s32 	%r23, %r22, -1024;
+	shr.u32 	%r24, %r23, 6;
+	neg.s32 	%r25, %r24;
+	mul.wide.s32 	%rd43, %r25, 8;
+	mov.u64 	%rd44, __cudart_i2opi_d;
+	add.s64 	%rd45, %rd43, %rd44;
+	add.s64 	%rd90, %rd45, 120;
+	mov.u64 	%rd94, 0;
+	mov.u64 	%rd91, %rd1;
+	mov.u32 	%r39, %r7;
+
+BB36_3:
+	.pragma "nounroll";
+	mov.u32 	%r8, %r39;
+	mov.u64 	%rd7, %rd91;
+	ld.const.u64 	%rd48, [%rd90];
+	// inline asm
+	{
+	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
+	mov.b64         {alo,ahi}, %rd48;    
+	mov.b64         {blo,bhi}, %rd3;    
+	mov.b64         {clo,chi}, %rd94;    
+	mad.lo.cc.u32   r0, alo, blo, clo;
+	madc.hi.cc.u32  r1, alo, blo, chi;
+	madc.hi.u32     r2, alo, bhi,   0;
+	mad.lo.cc.u32   r1, alo, bhi,  r1;
+	madc.hi.cc.u32  r2, ahi, blo,  r2;
+	madc.hi.u32     r3, ahi, bhi,   0;
+	mad.lo.cc.u32   r1, ahi, blo,  r1;
+	madc.lo.cc.u32  r2, ahi, bhi,  r2;
+	addc.u32        r3,  r3,   0;     
+	mov.b64         %rd46, {r0,r1};      
+	mov.b64         %rd94, {r2,r3};      
+	}
+	// inline asm
+	st.local.u64 	[%rd92], %rd46;
+	add.s32 	%r9, %r8, 1;
+	sub.s32 	%r26, %r9, %r7;
+	mul.wide.s32 	%rd51, %r26, 8;
+	add.s64 	%rd92, %rd1, %rd51;
+	add.s64 	%rd13, %rd7, 8;
+	mov.u64 	%rd93, %rd13;
+	add.s64 	%rd90, %rd90, 8;
+	setp.lt.s32	%p3, %r9, %r6;
+	mov.u64 	%rd91, %rd13;
+	mov.u32 	%r39, %r9;
+	@%p3 bra 	BB36_3;
+
+BB36_4:
+	st.local.u64 	[%rd93], %rd94;
+	ld.local.u64 	%rd95, [%rd1+16];
+	ld.local.u64 	%rd96, [%rd1+24];
+	and.b32  	%r10, %r3, 63;
+	setp.eq.s32	%p4, %r10, 0;
+	@%p4 bra 	BB36_6;
+
+	mov.u32 	%r27, 64;
+	sub.s32 	%r28, %r27, %r10;
+	shl.b64 	%rd52, %rd96, %r10;
+	shr.u64 	%rd53, %rd95, %r28;
+	or.b64  	%rd96, %rd52, %rd53;
+	shl.b64 	%rd54, %rd95, %r10;
+	ld.local.u64 	%rd55, [%rd1+8];
+	shr.u64 	%rd56, %rd55, %r28;
+	or.b64  	%rd95, %rd56, %rd54;
+
+BB36_6:
+	cvta.to.local.u64 	%rd57, %rd37;
+	shr.u64 	%rd58, %rd96, 62;
+	cvt.u32.u64	%r29, %rd58;
+	shr.u64 	%rd59, %rd95, 62;
+	shl.b64 	%rd60, %rd96, 2;
+	or.b64  	%rd98, %rd60, %rd59;
+	shl.b64 	%rd97, %rd95, 2;
+	shr.u64 	%rd61, %rd96, 61;
+	cvt.u32.u64	%r30, %rd61;
+	and.b32  	%r31, %r30, 1;
+	add.s32 	%r32, %r31, %r29;
+	neg.s32 	%r33, %r32;
+	setp.eq.s32	%p5, %r40, 0;
+	selp.b32	%r34, %r32, %r33, %p5;
+	st.local.u32 	[%rd57], %r34;
+	setp.eq.s32	%p6, %r31, 0;
+	@%p6 bra 	BB36_8;
+
+	mov.u64 	%rd65, 0;
+	// inline asm
+	{
+	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+	mov.b64         {a0,a1}, %rd65;
+	mov.b64         {a2,a3}, %rd65;
+	mov.b64         {b0,b1}, %rd97;
+	mov.b64         {b2,b3}, %rd98;
+	sub.cc.u32      r0, a0, b0; 
+	subc.cc.u32     r1, a1, b1; 
+	subc.cc.u32     r2, a2, b2; 
+	subc.u32        r3, a3, b3; 
+	mov.b64         %rd97, {r0,r1};
+	mov.b64         %rd98, {r2,r3};
+	}
+	// inline asm
+	xor.b32  	%r40, %r40, -2147483648;
+
+BB36_8:
+	clz.b64 	%r41, %rd98;
+	setp.eq.s32	%p7, %r41, 0;
+	@%p7 bra 	BB36_10;
+
+	shl.b64 	%rd68, %rd98, %r41;
+	mov.u32 	%r35, 64;
+	sub.s32 	%r36, %r35, %r41;
+	shr.u64 	%rd69, %rd97, %r36;
+	or.b64  	%rd98, %rd69, %rd68;
+
+BB36_10:
+	mov.u64 	%rd73, -3958705157555305931;
+	// inline asm
+	{
+	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
+	mov.b64         {alo,ahi}, %rd98;   
+	mov.b64         {blo,bhi}, %rd73;   
+	mul.lo.u32      r0, alo, blo;    
+	mul.hi.u32      r1, alo, blo;    
+	mad.lo.cc.u32   r1, alo, bhi, r1;
+	madc.hi.u32     r2, alo, bhi,  0;
+	mad.lo.cc.u32   r1, ahi, blo, r1;
+	madc.hi.cc.u32  r2, ahi, blo, r2;
+	madc.hi.u32     r3, ahi, bhi,  0;
+	mad.lo.cc.u32   r2, ahi, bhi, r2;
+	addc.u32        r3, r3,  0;      
+	mov.b64         %rd70, {r0,r1};     
+	mov.b64         %rd99, {r2,r3};     
+	}
+	// inline asm
+	setp.lt.s64	%p8, %rd99, 1;
+	@%p8 bra 	BB36_12;
+
+	// inline asm
+	{
+	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+	mov.b64         {a0,a1}, %rd70;
+	mov.b64         {a2,a3}, %rd99;
+	mov.b64         {b0,b1}, %rd70;
+	mov.b64         {b2,b3}, %rd99;
+	add.cc.u32      r0, a0, b0; 
+	addc.cc.u32     r1, a1, b1; 
+	addc.cc.u32     r2, a2, b2; 
+	addc.u32        r3, a3, b3; 
+	mov.b64         %rd74, {r0,r1};
+	mov.b64         %rd99, {r2,r3};
+	}
+	// inline asm
+	add.s32 	%r41, %r41, 1;
+
+BB36_12:
+	cvt.u64.u32	%rd80, %r40;
+	shl.b64 	%rd81, %rd80, 32;
+	mov.u32 	%r37, 1022;
+	sub.s32 	%r38, %r37, %r41;
+	cvt.u64.u32	%rd82, %r38;
+	shl.b64 	%rd83, %rd82, 52;
+	add.s64 	%rd84, %rd99, 1;
+	shr.u64 	%rd85, %rd84, 10;
+	add.s64 	%rd86, %rd85, 1;
+	shr.u64 	%rd87, %rd86, 1;
+	add.s64 	%rd88, %rd87, %rd83;
+	or.b64  	%rd89, %rd88, %rd81;
+	mov.b64 	 %fd4, %rd89;
+
+BB36_13:
+	st.param.f64	[func_retval0+0], %fd4;
+	ret;
+}
+
+.func  (.param .b64 func_retval0) __internal_accurate_pow(
+	.param .b64 __internal_accurate_pow_param_0,
+	.param .b64 __internal_accurate_pow_param_1
+)
+{
+	.reg .pred 	%p<9>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<52>;
+	.reg .f64 	%fd<134>;
+
+
+	ld.param.f64 	%fd12, [__internal_accurate_pow_param_0];
+	ld.param.f64 	%fd13, [__internal_accurate_pow_param_1];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r49}, %fd12;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r48, %temp}, %fd12;
+	}
+	shr.u32 	%r50, %r49, 20;
+	setp.ne.s32	%p1, %r50, 0;
+	@%p1 bra 	BB37_2;
+
+	mul.f64 	%fd14, %fd12, 0d4350000000000000;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r49}, %fd14;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r48, %temp}, %fd14;
+	}
+	shr.u32 	%r16, %r49, 20;
+	add.s32 	%r50, %r16, -54;
+
+BB37_2:
+	add.s32 	%r51, %r50, -1023;
+	and.b32  	%r17, %r49, -2146435073;
+	or.b32  	%r18, %r17, 1072693248;
+	mov.b64 	%fd132, {%r48, %r18};
+	setp.lt.u32	%p2, %r18, 1073127583;
+	@%p2 bra 	BB37_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r19, %temp}, %fd132;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r20}, %fd132;
+	}
+	add.s32 	%r21, %r20, -1048576;
+	mov.b64 	%fd132, {%r19, %r21};
+	add.s32 	%r51, %r50, -1022;
+
+BB37_4:
+	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
+	// inline asm
+	rcp.approx.ftz.f64 %fd15,%fd16;
+	// inline asm
+	neg.f64 	%fd17, %fd16;
+	mov.f64 	%fd18, 0d3FF0000000000000;
+	fma.rn.f64 	%fd19, %fd17, %fd15, %fd18;
+	fma.rn.f64 	%fd20, %fd19, %fd19, %fd19;
+	fma.rn.f64 	%fd21, %fd20, %fd15, %fd15;
+	add.f64 	%fd22, %fd132, 0dBFF0000000000000;
+	mul.f64 	%fd23, %fd22, %fd21;
+	fma.rn.f64 	%fd24, %fd22, %fd21, %fd23;
+	mul.f64 	%fd25, %fd24, %fd24;
+	mov.f64 	%fd26, 0d3ED0F5D241AD3B5A;
+	mov.f64 	%fd27, 0d3EB0F5FF7D2CAFE2;
+	fma.rn.f64 	%fd28, %fd27, %fd25, %fd26;
+	mov.f64 	%fd29, 0d3EF3B20A75488A3F;
+	fma.rn.f64 	%fd30, %fd28, %fd25, %fd29;
 	mov.f64 	%fd31, 0d3F1745CDE4FAECD5;
 	fma.rn.f64 	%fd32, %fd30, %fd25, %fd31;
 	mov.f64 	%fd33, 0d3F3C71C7258A578B;
@@ -3499,13 +5071,13 @@ BB23_4:
 	mov.b32 	 %f2, %r35;
 	abs.f32 	%f1, %f2;
 	setp.lt.f32	%p4, %f1, 0f4086232B;
-	@%p4 bra 	BB23_7;
+	@%p4 bra 	BB37_7;
 
 	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
 	add.f64 	%fd129, %fd4, 0d7FF0000000000000;
 	selp.f64	%fd133, 0d0000000000000000, %fd129, %p5;
 	setp.geu.f32	%p6, %f1, 0f40874800;
-	@%p6 bra 	BB23_7;
+	@%p6 bra 	BB37_7;
 
 	shr.u32 	%r36, %r13, 31;
 	add.s32 	%r37, %r13, %r36;
@@ -3520,26 +5092,26 @@ BB23_4:
 	mov.b64 	%fd131, {%r44, %r43};
 	mul.f64 	%fd133, %fd130, %fd131;
 
-BB23_7:
+BB37_7:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r45}, %fd133;
 	}
 	and.b32  	%r46, %r45, 2147483647;
 	setp.ne.s32	%p7, %r46, 2146435072;
+	@%p7 bra 	BB37_9;
+
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r47, %temp}, %fd133;
 	}
-	setp.ne.s32	%p8, %r47, 0;
-	or.pred  	%p9, %p8, %p7;
-	@!%p9 bra 	BB23_9;
-	bra.uni 	BB23_8;
+	setp.eq.s32	%p8, %r47, 0;
+	@%p8 bra 	BB37_10;
 
-BB23_8:
+BB37_9:
 	fma.rn.f64 	%fd133, %fd133, %fd5, %fd133;
 
-BB23_9:
+BB37_10:
 	st.param.f64	[func_retval0+0], %fd133;
 	ret;
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index c75d0e0..451960b 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -157,8 +157,14 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 				else //default unary 
 				{
 					int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
-					if(_op == OpOp1.SELP || _op == OpOp1.EXP) {
-						et = findGPUExecTypeByMemEstimate(et);
+					switch(_op) {
+						case SELP:case EXP:case SQRT:case LOG:case ABS:
+						case ROUND:case FLOOR:case CEIL:
+						case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
+						case SIGN:
+							et = findGPUExecTypeByMemEstimate(et);
+							break;
+						default:
 					}
 					Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), 
 							                 getDataType(), getValueType(), et, k);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 4a45521..443d0eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -69,12 +69,27 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "^2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
 		String2GPUInstructionType.put( "*2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
 		String2GPUInstructionType.put( "-nz"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
-		String2GPUInstructionType.put( "+*"  	, GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "-*"  	, GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "+*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "-*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
 		
 		// Builtin functions
 		String2GPUInstructionType.put( "sel+"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
 		String2GPUInstructionType.put( "exp"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "log"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "abs"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sqrt"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "round"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "floor"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "ceil"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sin"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "cos"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "tan"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "asin"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "acos"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "atan"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sign"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
+
+
 
 		String2GPUInstructionType.put( "solve"  , GPUINSTRUCTION_TYPE.BuiltinBinary);