You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/05/17 17:56:37 UTC
[1/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs,
log, floor, ceil, trig funcs & sign for GPU
Repository: incubator-systemml
Updated Branches:
refs/heads/master 0d553e384 -> 1fc764b9b
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index f4c523b..48b7da6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,41 +61,55 @@ public abstract class GPUInstruction extends Instruction
public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update
// Other BLAS instructions
- public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy
- public final static String MISC_TIMER_QR_BUFFER = "qr_buffer"; // time spent in calculating buffer needed to perform QR
- public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR
- public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr
- public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm
+ public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy
+ public final static String MISC_TIMER_QR_BUFFER = "qr_buffer";// time spent in calculating buffer needed to perform QR
+ public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR
+ public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr
+ public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm
// Transpose
- public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
- public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
- public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output
+ public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
+ public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
+ public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output
// Custom kernels
- public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations
- public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel
- public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel
- public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv"; // time spent in the daxpy_matrix_vector kernel
- public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel
- public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel
- public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel
- public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel
- public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel
- public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel
+ public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations
+ public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel
+ public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel
+ public final static String MISC_TIMER_SQRT_KERNEL = "sqrtk"; // time spent in the sqrt kernel
+ public final static String MISC_TIMER_ROUND_KERNEL = "roundk"; // time spent in the round kernel
+ public final static String MISC_TIMER_ABS_KERNEL = "absk"; // time spent in the abs kernel
+ public final static String MISC_TIMER_LOG_KERNEL = "logk"; // time spent in the log kernel
+ public final static String MISC_TIMER_FLOOR_KERNEL = "floork"; // time spent in the floor kernel
+ public final static String MISC_TIMER_CEIL_KERNEL = "ceilk"; // time spent in the ceil kernel
+ public final static String MISC_TIMER_SIN_KERNEL = "sink"; // time spent in the sin kernel
+ public final static String MISC_TIMER_COS_KERNEL = "cosk"; // time spent in the cos kernel
+ public final static String MISC_TIMER_TAN_KERNEL = "tank"; // time spent in the tan kernel
+ public final static String MISC_TIMER_ASIN_KERNEL = "asink"; // time spent in the asin kernel
+ public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel
+ public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel
+ public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel
+
+ public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel
+ public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel
+ public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel
+ public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel
+ public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel
+ public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel
+ public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel
// Deep learning operators
- public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward
- public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward
- public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = "nncbf"; // time spent in cudnnConvolutionBackwardFilter
- public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData
- public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward
- public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward
- public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel
- public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel
- public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel
- public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call
- public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call
+ public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward
+ public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward
+ public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter
+ public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData
+ public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward
+ public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward
+ public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel
+ public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel
+ public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel
+ public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call
+ public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call
protected GPUINSTRUCTION_TYPE _gputype;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index ce25dec..7b50285 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -42,14 +42,39 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName());
ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns());
- if(opcode.equals("sel+")) {
- LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
-
- } else if (opcode.equals("exp")) {
- LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
- }
- else {
- throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
+ switch(opcode) {
+ case "sel+":
+ LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "exp":
+ LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "sqrt":
+ LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "log":
+ LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "round":
+ LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "floor":
+ LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "ceil":
+ LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "abs":
+ LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "sin":
+ LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "cos":
+ LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "tan":
+ LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "asin":
+ LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "acos":
+ LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "atan":
+ LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ case "sign":
+ LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ default:
+ throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
}
ec.releaseMatrixInputForGPUInstruction(_input.getName());
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 05257e5..0ff9d14 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -467,7 +467,7 @@ public class CSRPointer {
cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
//cudaDeviceSynchronize;
} else {
- LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null");
+ LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
}
return A;
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index d735e38..be3cc09 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -510,7 +510,9 @@ public class GPUObject {
setDenseMatrixCudaPointer(allocate(size));
addReadLock();
// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
- getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
+ // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
+ if (v != 0)
+ getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a99571a..074119b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2885,29 +2885,239 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
+ // e^0 = 1, create a dense block full of 1s
+ unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL);
+ }
+
+ /**
+ * Performs a "sqrt" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
+ // sqrt(0) = 0, create a dense block full of 0s
+ unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL);
+ }
+
+ /**
+ * Performs a "round" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
+ // round(0) = 0, create a dense block full of 0s
+ unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL);
+ }
+
+ /**
+ * Performs an "abs" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
+ // abs(0) = 0, create a dense block full of 0s
+ unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL);
+ }
+
+ /**
+ * Performs a "log" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
+ // log(0) = -Inf
+ unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL);
+ }
+
+ /**
+ * Performs a "floor" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
+ // floor(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL);
+ }
+
+ /**
+ * Performs a "ceil" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
+ // ceil(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL);
+ }
+
+ /**
+ * Performs a "sin" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
+ // sin(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL);
+ }
+
+ /**
+ * Performs a "cos" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
+ // cos(0) = 1
+ unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL);
+ }
+
+ /**
+ * Performs a "tan" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
+ // tan(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL);
+ }
+
+ /**
+ * Performs an "asin" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
+ // asin(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL);
+ }
+
+ /**
+ * Performs an "acos" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
+ // acos(0) = PI/2
+ unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL);
+ }
+
+ /**
+ * Performs an "atan" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
+ // atan(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL);
+ }
+
+ /**
+ * Performs a "sign" operation on a matrix on the GPU
+ * @param ec execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix
+ * @param outputName output matrix name
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+ LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
+ // sign(0) = 0
+ unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL);
+ }
+
+
+ /**
+ * A helper function for all Unary ops (sqrt, abs, sin.. etc)
+ * @param ec valid execution context
+ * @param gCtx a valid {@link GPUContext}
+ * @param in1 input matrix
+ * @param kernel name of CUDA kernel for the unary op to execute
+ * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
+ * @param outputName output matrix name
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param kernelTimer the name of the timer to measure the kernel invocation
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
if (ec.getGPUContext() != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
- LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
GPUObject in = in1.getGPUObject(gCtx);
boolean isSparseAndEmpty = in.isSparseAndEmpty();
long t1=0;
if (isSparseAndEmpty) {
- // e^0 = 1, create a dense block full of 1s
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName);
- out.getGPUObject(gCtx).allocateAndFillDense(1);
+ out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
} else {
// Dense
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
Pointer output = getDensePointer(gCtx, out, instName);
- // If the input is in sparse format, convert it to dense.
- // The output will always be dense, because for all x, exp(x) > 0
Pointer input = getDensePointer(gCtx, in1, instName);
int size = (int)(in1.getNumColumns() * in1.getNumRows());
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
- getCudaKernels(gCtx).launchKernel("matrix_exp", ExecutionConfig.getConfigForSimpleVectorOperations(size),
- input, output, size);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_EXP_KERNEL, System.nanoTime() - t1);
+ getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size),
+ input, output, size);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
}
}
[2/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs,
log, floor, ceil, trig funcs & sign for GPU
Posted by na...@apache.org.
[SYSTEMML-1344] sqrt,round,abs,log,floor,ceil,trig funcs & sign for GPU
Closes #503
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1fc764b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1fc764b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1fc764b9
Branch: refs/heads/master
Commit: 1fc764b9b099271822056a82e248acdbb785dc63
Parents: 0d553e3
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 17 10:55:51 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 17 10:55:51 2017 -0700
----------------------------------------------------------------------
src/main/cpp/kernels/Makefile | 28 +
src/main/cpp/kernels/SystemML.cu | 187 ++
src/main/cpp/kernels/SystemML.ptx | 2506 ++++++++++++++----
.../java/org/apache/sysml/hops/UnaryOp.java | 10 +-
.../instructions/GPUInstructionParser.java | 19 +-
.../instructions/gpu/GPUInstruction.java | 72 +-
.../gpu/MatrixBuiltinGPUInstruction.java | 41 +-
.../instructions/gpu/context/CSRPointer.java | 2 +-
.../instructions/gpu/context/GPUObject.java | 4 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 226 +-
10 files changed, 2577 insertions(+), 518 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/Makefile
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
new file mode 100644
index 0000000..0b003f3
--- /dev/null
+++ b/src/main/cpp/kernels/Makefile
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+NVCC=nvcc
+CUDAFLAGS= -ptx -c -arch=sm_30
+
+SystemML.o: SystemML.cu
+ $(NVCC) $(CUDAFLAGS) SystemML.cu
+
+all: SystemML.o
+ ;
+
+clean:
+ rm -rf SystemML.ptx
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 2651e4a..5b4574e 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -656,3 +656,190 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) {
C[index] = exp(A[index]);
}
}
+
+/**
+ * Do a sqrt over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = sqrt(A[index]);
+ }
+}
+
+/**
+ * Do a round over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_round(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = (double)llround(A[index]);
+ }
+}
+
+/**
+ * Do an abs over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_abs(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = (double)fabs(A[index]);
+ }
+}
+
+/**
+ * Do a log over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_log(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = log(A[index]);
+ }
+}
+
+/**
+ * Do a floor over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_floor(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = floor(A[index]);
+ }
+}
+
+/**
+ * Do a ceil over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_ceil(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = ceil(A[index]);
+ }
+}
+
+/**
+ * Do a sin over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sin(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = sin(A[index]);
+ }
+}
+
+/**
+ * Do a cos over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_cos(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = cos(A[index]);
+ }
+}
+
+/**
+ * Do a tan over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_tan(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = tan(A[index]);
+ }
+}
+
+/**
+ * Do an asin over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_asin(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = asin(A[index]);
+ }
+}
+
+/**
+ * Do an acos over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_acos(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = acos(A[index]);
+ }
+}
+
+/**
+ * Do an atan over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_atan(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ C[index] = atan(A[index]);
+ }
+}
+
+/**
+ * Do a sign over all the elements of a matrix
+ * Assign -1, 0 or 1 depending on the element being negative, 0 or positive
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sign(double *A, double *C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size){
+ if (A[index] == 0.0) {
+ C[index] = 0.0;
+ } else {
+ C[index] = copysign(1.0, A[index]);
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 50002f5..3229581 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
//
// Generated by NVIDIA NVVM Compiler
//
-// Compiler Build ID: CL-21112126
-// Cuda compilation tools, release 8.0, V8.0.43
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
// Based on LLVM 3.4svn
//
@@ -11,6 +11,12 @@
.address_size 64
// .globl copy_u2l_dense
+.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
+(
+ .param .b64 __internal_trig_reduction_slowpathd_param_0,
+ .param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+;
.func (.param .b64 func_retval0) __internal_accurate_pow
(
.param .b64 __internal_accurate_pow_param_0,
@@ -18,6 +24,8 @@
)
;
.extern .shared .align 8 .b8 sdata[];
+.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
+.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
.visible .entry copy_u2l_dense(
.param .u64 copy_u2l_dense_param_0,
@@ -442,9 +450,9 @@ BB6_6:
.param .u32 matrix_matrix_cellwise_op_param_7
)
{
- .reg .pred %p<54>;
- .reg .b32 %r<55>;
- .reg .f64 %fd<39>;
+ .reg .pred %p<52>;
+ .reg .b32 %r<56>;
+ .reg .f64 %fd<40>;
.reg .b64 %rd<15>;
@@ -467,40 +475,40 @@ BB6_6:
setp.lt.s32 %p2, %r1, %r14;
setp.lt.s32 %p3, %r2, %r10;
and.pred %p4, %p2, %p3;
- @!%p4 bra BB7_53;
+ @!%p4 bra BB7_55;
bra.uni BB7_1;
BB7_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
- mov.u32 %r53, %r1;
+ mov.u32 %r54, %r1;
@%p5 bra BB7_5;
setp.ne.s32 %p6, %r11, 2;
- mov.u32 %r54, %r3;
+ mov.u32 %r55, %r3;
@%p6 bra BB7_4;
- mov.u32 %r54, %r2;
+ mov.u32 %r55, %r2;
BB7_4:
- mov.u32 %r48, %r54;
- mov.u32 %r4, %r48;
- mov.u32 %r53, %r4;
+ mov.u32 %r49, %r55;
+ mov.u32 %r4, %r49;
+ mov.u32 %r54, %r4;
BB7_5:
- mov.u32 %r5, %r53;
+ mov.u32 %r5, %r54;
setp.eq.s32 %p7, %r12, 1;
- mov.u32 %r51, %r1;
+ mov.u32 %r52, %r1;
@%p7 bra BB7_9;
setp.ne.s32 %p8, %r12, 2;
- mov.u32 %r52, %r3;
+ mov.u32 %r53, %r3;
@%p8 bra BB7_8;
- mov.u32 %r52, %r2;
+ mov.u32 %r53, %r2;
BB7_8:
- mov.u32 %r51, %r52;
+ mov.u32 %r52, %r53;
BB7_9:
cvta.to.global.u64 %rd5, %rd3;
@@ -508,10 +516,10 @@ BB7_9:
mul.wide.s32 %rd7, %r5, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
- mul.wide.s32 %rd9, %r51, 8;
+ mul.wide.s32 %rd9, %r52, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
- mov.f64 %fd38, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd39, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r13, 5;
@%p9 bra BB7_19;
@@ -519,15 +527,15 @@ BB7_9:
@%p19 bra BB7_15;
setp.eq.s32 %p23, %r13, 0;
- @%p23 bra BB7_51;
+ @%p23 bra BB7_53;
setp.eq.s32 %p24, %r13, 1;
- @%p24 bra BB7_50;
+ @%p24 bra BB7_52;
bra.uni BB7_13;
-BB7_50:
- sub.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+BB7_52:
+ sub.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
BB7_19:
setp.gt.s32 %p10, %r13, 8;
@@ -542,12 +550,12 @@ BB7_19:
BB7_33:
setp.gt.f64 %p29, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
+ bra.uni BB7_54;
BB7_15:
setp.eq.s32 %p20, %r13, 3;
- @%p20 bra BB7_49;
+ @%p20 bra BB7_51;
setp.eq.s32 %p21, %r13, 4;
@%p21 bra BB7_35;
@@ -583,7 +591,7 @@ BB7_35:
param0,
param1
);
- ld.param.f64 %fd37, [retval0+0];
+ ld.param.f64 %fd38, [retval0+0];
//{
}// Callseq End 0
@@ -595,17 +603,17 @@ BB7_35:
BB7_36:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r23}, %fd37;
+ mov.b64 {%temp, %r23}, %fd38;
}
xor.b32 %r24, %r23, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r25, %temp}, %fd37;
+ mov.b64 {%r25, %temp}, %fd38;
}
- mov.b64 %fd37, {%r25, %r24};
+ mov.b64 %fd38, {%r25, %r24};
BB7_37:
- mov.f64 %fd36, %fd37;
+ mov.f64 %fd37, %fd38;
setp.eq.f64 %p34, %fd1, 0d0000000000000000;
@%p34 bra BB7_40;
bra.uni BB7_38;
@@ -616,7 +624,7 @@ BB7_40:
setp.lt.s32 %p38, %r9, 0;
selp.b32 %r28, %r27, %r26, %p38;
mov.u32 %r29, 0;
- mov.b64 %fd36, {%r29, %r28};
+ mov.b64 %fd37, {%r29, %r28};
bra.uni BB7_41;
BB7_24:
@@ -629,8 +637,8 @@ BB7_24:
BB7_32:
setp.eq.f64 %p27, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
+ bra.uni BB7_54;
BB7_28:
setp.eq.s32 %p12, %r13, 11;
@@ -638,67 +646,67 @@ BB7_28:
bra.uni BB7_29;
BB7_31:
- min.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+ min.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
-BB7_51:
- add.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+BB7_53:
+ add.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
BB7_13:
setp.eq.s32 %p25, %r13, 2;
@%p25 bra BB7_14;
- bra.uni BB7_52;
+ bra.uni BB7_54;
BB7_14:
- mul.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+ mul.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
BB7_34:
setp.le.f64 %p30, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
+ bra.uni BB7_54;
BB7_22:
setp.eq.s32 %p18, %r13, 8;
@%p18 bra BB7_23;
- bra.uni BB7_52;
+ bra.uni BB7_54;
BB7_23:
setp.ge.f64 %p28, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
+ bra.uni BB7_54;
-BB7_49:
- div.rn.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+BB7_51:
+ div.rn.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
BB7_17:
setp.eq.s32 %p22, %r13, 5;
@%p22 bra BB7_18;
- bra.uni BB7_52;
+ bra.uni BB7_54;
BB7_18:
setp.lt.f64 %p31, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
+ bra.uni BB7_54;
BB7_26:
setp.eq.s32 %p15, %r13, 10;
@%p15 bra BB7_27;
- bra.uni BB7_52;
+ bra.uni BB7_54;
BB7_27:
setp.neu.f64 %p26, %fd1, %fd2;
- selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB7_52;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
+ bra.uni BB7_54;
BB7_29:
setp.ne.s32 %p13, %r13, 12;
- @%p13 bra BB7_52;
+ @%p13 bra BB7_54;
- max.f64 %fd38, %fd1, %fd2;
- bra.uni BB7_52;
+ max.f64 %fd39, %fd1, %fd2;
+ bra.uni BB7_54;
BB7_38:
setp.gt.s32 %p35, %r8, -1;
@@ -706,10 +714,10 @@ BB7_38:
cvt.rzi.f64.f64 %fd29, %fd2;
setp.neu.f64 %p36, %fd29, %fd2;
- selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36;
+ selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36;
BB7_41:
- mov.f64 %fd17, %fd36;
+ mov.f64 %fd17, %fd37;
add.f64 %fd18, %fd1, %fd2;
{
.reg .b32 %temp;
@@ -717,77 +725,79 @@ BB7_41:
}
and.b32 %r31, %r30, 2146435072;
setp.ne.s32 %p39, %r31, 2146435072;
- mov.f64 %fd35, %fd17;
- @%p39 bra BB7_48;
+ mov.f64 %fd36, %fd17;
+ @%p39 bra BB7_50;
setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000;
- mov.f64 %fd35, %fd18;
- @%p40 bra BB7_48;
+ mov.f64 %fd36, %fd18;
+ @%p40 bra BB7_50;
abs.f64 %fd30, %fd2;
setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000;
- mov.f64 %fd34, %fd18;
- mov.f64 %fd35, %fd34;
- @%p41 bra BB7_48;
+ mov.f64 %fd35, %fd18;
+ mov.f64 %fd36, %fd35;
+ @%p41 bra BB7_50;
+
+ and.b32 %r32, %r9, 2147483647;
+ setp.ne.s32 %p42, %r32, 2146435072;
+ @%p42 bra BB7_46;
{
.reg .b32 %temp;
- mov.b64 {%r32, %temp}, %fd2;
+ mov.b64 {%r33, %temp}, %fd2;
}
- and.b32 %r33, %r9, 2147483647;
- setp.eq.s32 %p42, %r33, 2146435072;
- setp.eq.s32 %p43, %r32, 0;
- and.pred %p44, %p42, %p43;
- @%p44 bra BB7_47;
- bra.uni BB7_45;
+ setp.eq.s32 %p43, %r33, 0;
+ @%p43 bra BB7_49;
-BB7_47:
- setp.gt.f64 %p48, %fd11, 0d3FF0000000000000;
- selp.b32 %r41, 2146435072, 0, %p48;
- xor.b32 %r42, %r41, 2146435072;
- setp.lt.s32 %p49, %r9, 0;
- selp.b32 %r43, %r42, %r41, %p49;
- setp.eq.f64 %p50, %fd1, 0dBFF0000000000000;
- selp.b32 %r44, 1072693248, %r43, %p50;
- mov.u32 %r45, 0;
- mov.b64 %fd35, {%r45, %r44};
- bra.uni BB7_48;
+BB7_46:
+ and.b32 %r34, %r8, 2147483647;
+ setp.ne.s32 %p44, %r34, 2146435072;
+ mov.f64 %fd33, %fd17;
+ mov.f64 %fd36, %fd33;
+ @%p44 bra BB7_50;
-BB7_45:
{
.reg .b32 %temp;
- mov.b64 {%r34, %temp}, %fd1;
+ mov.b64 {%r35, %temp}, %fd1;
}
- and.b32 %r35, %r8, 2147483647;
- setp.eq.s32 %p45, %r35, 2146435072;
- setp.eq.s32 %p46, %r34, 0;
- and.pred %p47, %p45, %p46;
- mov.f64 %fd35, %fd17;
- @!%p47 bra BB7_48;
- bra.uni BB7_46;
+ setp.ne.s32 %p45, %r35, 0;
+ mov.f64 %fd36, %fd17;
+ @%p45 bra BB7_50;
-BB7_46:
shr.s32 %r36, %r9, 31;
and.b32 %r37, %r36, -2146435072;
- selp.b32 %r38, -1048576, 2146435072, %p1;
- add.s32 %r39, %r38, %r37;
- mov.u32 %r40, 0;
- mov.b64 %fd35, {%r40, %r39};
+ add.s32 %r38, %r37, 2146435072;
+ or.b32 %r39, %r38, -2147483648;
+ selp.b32 %r40, %r39, %r38, %p1;
+ mov.u32 %r41, 0;
+ mov.b64 %fd36, {%r41, %r40};
+ bra.uni BB7_50;
-BB7_48:
- setp.eq.f64 %p51, %fd2, 0d0000000000000000;
- setp.eq.f64 %p52, %fd1, 0d3FF0000000000000;
- or.pred %p53, %p52, %p51;
- selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53;
+BB7_49:
+ setp.gt.f64 %p46, %fd11, 0d3FF0000000000000;
+ selp.b32 %r42, 2146435072, 0, %p46;
+ xor.b32 %r43, %r42, 2146435072;
+ setp.lt.s32 %p47, %r9, 0;
+ selp.b32 %r44, %r43, %r42, %p47;
+ setp.eq.f64 %p48, %fd1, 0dBFF0000000000000;
+ selp.b32 %r45, 1072693248, %r44, %p48;
+ mov.u32 %r46, 0;
+ mov.b64 %fd36, {%r46, %r45};
-BB7_52:
+BB7_50:
+ setp.eq.f64 %p49, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p50, %fd1, 0d3FF0000000000000;
+ or.pred %p51, %p50, %p49;
+ selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51;
+
+BB7_54:
cvta.to.global.u64 %rd12, %rd4;
mul.wide.s32 %rd13, %r3, 8;
add.s64 %rd14, %rd12, %rd13;
- st.global.f64 [%rd14], %fd38;
+ st.global.f64 [%rd14], %fd39;
bar.sync 0;
-BB7_53:
+BB7_55:
ret;
}
@@ -801,9 +811,9 @@ BB7_53:
.param .u32 matrix_scalar_op_param_5
)
{
- .reg .pred %p<95>;
- .reg .b32 %r<62>;
- .reg .f64 %fd<75>;
+ .reg .pred %p<91>;
+ .reg .b32 %r<64>;
+ .reg .f64 %fd<77>;
.reg .b64 %rd<12>;
@@ -818,7 +828,7 @@ BB7_53:
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r10, %r9, %r11;
setp.ge.s32 %p3, %r1, %r8;
- @%p3 bra BB8_90;
+ @%p3 bra BB8_94;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
@@ -827,9 +837,9 @@ BB7_53:
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
- @%p4 bra BB8_46;
+ @%p4 bra BB8_48;
- mov.f64 %fd66, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 5;
@%p5 bra BB8_12;
@@ -837,34 +847,34 @@ BB7_53:
@%p15 bra BB8_8;
setp.eq.s32 %p19, %r6, 0;
- @%p19 bra BB8_44;
+ @%p19 bra BB8_46;
setp.eq.s32 %p20, %r6, 1;
- @%p20 bra BB8_43;
+ @%p20 bra BB8_45;
bra.uni BB8_6;
-BB8_43:
- sub.f64 %fd66, %fd52, %fd1;
- bra.uni BB8_45;
+BB8_45:
+ sub.f64 %fd67, %fd52, %fd1;
+ bra.uni BB8_47;
-BB8_46:
- mov.f64 %fd74, 0d7FEFFFFFFFFFFFFF;
- setp.gt.s32 %p50, %r6, 5;
- @%p50 bra BB8_56;
+BB8_48:
+ mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
+ setp.gt.s32 %p48, %r6, 5;
+ @%p48 bra BB8_58;
- setp.gt.s32 %p60, %r6, 2;
- @%p60 bra BB8_52;
+ setp.gt.s32 %p58, %r6, 2;
+ @%p58 bra BB8_54;
- setp.eq.s32 %p64, %r6, 0;
- @%p64 bra BB8_88;
+ setp.eq.s32 %p62, %r6, 0;
+ @%p62 bra BB8_92;
- setp.eq.s32 %p65, %r6, 1;
- @%p65 bra BB8_87;
- bra.uni BB8_50;
+ setp.eq.s32 %p63, %r6, 1;
+ @%p63 bra BB8_91;
+ bra.uni BB8_52;
-BB8_87:
- sub.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+BB8_91:
+ sub.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
BB8_12:
setp.gt.s32 %p6, %r6, 8;
@@ -879,28 +889,28 @@ BB8_12:
BB8_26:
setp.lt.f64 %p25, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+ bra.uni BB8_47;
-BB8_56:
- setp.gt.s32 %p51, %r6, 8;
- @%p51 bra BB8_61;
+BB8_58:
+ setp.gt.s32 %p49, %r6, 8;
+ @%p49 bra BB8_63;
- setp.eq.s32 %p57, %r6, 6;
- @%p57 bra BB8_71;
+ setp.eq.s32 %p55, %r6, 6;
+ @%p55 bra BB8_73;
- setp.eq.s32 %p58, %r6, 7;
- @%p58 bra BB8_70;
- bra.uni BB8_59;
+ setp.eq.s32 %p56, %r6, 7;
+ @%p56 bra BB8_72;
+ bra.uni BB8_61;
-BB8_70:
- setp.gt.f64 %p70, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
- bra.uni BB8_89;
+BB8_72:
+ setp.gt.f64 %p68, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
+ bra.uni BB8_93;
BB8_8:
setp.eq.s32 %p16, %r6, 3;
- @%p16 bra BB8_42;
+ @%p16 bra BB8_44;
setp.eq.s32 %p17, %r6, 4;
@%p17 bra BB8_28;
@@ -936,7 +946,7 @@ BB8_28:
param0,
param1
);
- ld.param.f64 %fd65, [retval0+0];
+ ld.param.f64 %fd66, [retval0+0];
//{
}// Callseq End 1
@@ -948,17 +958,17 @@ BB8_28:
BB8_29:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r14}, %fd65;
+ mov.b64 {%temp, %r14}, %fd66;
}
xor.b32 %r15, %r14, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r16, %temp}, %fd65;
+ mov.b64 {%r16, %temp}, %fd66;
}
- mov.b64 %fd65, {%r16, %r15};
+ mov.b64 %fd66, {%r16, %r15};
BB8_30:
- mov.f64 %fd64, %fd65;
+ mov.f64 %fd65, %fd66;
setp.eq.f64 %p30, %fd52, 0d0000000000000000;
@%p30 bra BB8_33;
bra.uni BB8_31;
@@ -969,7 +979,7 @@ BB8_33:
setp.lt.s32 %p34, %r3, 0;
selp.b32 %r19, %r18, %r17, %p34;
mov.u32 %r20, 0;
- mov.b64 %fd64, {%r20, %r19};
+ mov.b64 %fd65, {%r20, %r19};
bra.uni BB8_34;
BB8_17:
@@ -982,18 +992,18 @@ BB8_17:
BB8_25:
setp.eq.f64 %p23, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+ bra.uni BB8_47;
-BB8_52:
- setp.eq.s32 %p61, %r6, 3;
- @%p61 bra BB8_86;
+BB8_54:
+ setp.eq.s32 %p59, %r6, 3;
+ @%p59 bra BB8_90;
- setp.eq.s32 %p62, %r6, 4;
- @%p62 bra BB8_72;
- bra.uni BB8_54;
+ setp.eq.s32 %p60, %r6, 4;
+ @%p60 bra BB8_74;
+ bra.uni BB8_56;
-BB8_72:
+BB8_74:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
@@ -1002,11 +1012,11 @@ BB8_72:
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd52;
}
- bfe.u32 %r37, %r5, 20, 11;
- add.s32 %r38, %r37, -1012;
+ bfe.u32 %r38, %r5, 20, 11;
+ add.s32 %r39, %r38, -1012;
mov.b64 %rd11, %fd52;
- shl.b64 %rd3, %rd11, %r38;
- setp.eq.s64 %p73, %rd3, -9223372036854775808;
+ shl.b64 %rd3, %rd11, %r39;
+ setp.eq.s64 %p71, %rd3, -9223372036854775808;
abs.f64 %fd35, %fd1;
// Callseq Start 2
{
@@ -1023,54 +1033,54 @@ BB8_72:
param0,
param1
);
- ld.param.f64 %fd73, [retval0+0];
+ ld.param.f64 %fd75, [retval0+0];
//{
}// Callseq End 2
- setp.lt.s32 %p74, %r4, 0;
- and.pred %p2, %p74, %p73;
- @!%p2 bra BB8_74;
- bra.uni BB8_73;
+ setp.lt.s32 %p72, %r4, 0;
+ and.pred %p2, %p72, %p71;
+ @!%p2 bra BB8_76;
+ bra.uni BB8_75;
-BB8_73:
+BB8_75:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r39}, %fd73;
+ mov.b64 {%temp, %r40}, %fd75;
}
- xor.b32 %r40, %r39, -2147483648;
+ xor.b32 %r41, %r40, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r41, %temp}, %fd73;
+ mov.b64 {%r42, %temp}, %fd75;
}
- mov.b64 %fd73, {%r41, %r40};
-
-BB8_74:
- mov.f64 %fd72, %fd73;
- setp.eq.f64 %p75, %fd1, 0d0000000000000000;
- @%p75 bra BB8_77;
- bra.uni BB8_75;
+ mov.b64 %fd75, {%r42, %r41};
-BB8_77:
- selp.b32 %r42, %r4, 0, %p73;
- or.b32 %r43, %r42, 2146435072;
- setp.lt.s32 %p79, %r5, 0;
- selp.b32 %r44, %r43, %r42, %p79;
- mov.u32 %r45, 0;
- mov.b64 %fd72, {%r45, %r44};
- bra.uni BB8_78;
+BB8_76:
+ mov.f64 %fd74, %fd75;
+ setp.eq.f64 %p73, %fd1, 0d0000000000000000;
+ @%p73 bra BB8_79;
+ bra.uni BB8_77;
+
+BB8_79:
+ selp.b32 %r43, %r4, 0, %p71;
+ or.b32 %r44, %r43, 2146435072;
+ setp.lt.s32 %p77, %r5, 0;
+ selp.b32 %r45, %r44, %r43, %p77;
+ mov.u32 %r46, 0;
+ mov.b64 %fd74, {%r46, %r45};
+ bra.uni BB8_80;
-BB8_61:
- setp.gt.s32 %p52, %r6, 10;
- @%p52 bra BB8_65;
+BB8_63:
+ setp.gt.s32 %p50, %r6, 10;
+ @%p50 bra BB8_67;
- setp.eq.s32 %p55, %r6, 9;
- @%p55 bra BB8_69;
- bra.uni BB8_63;
+ setp.eq.s32 %p53, %r6, 9;
+ @%p53 bra BB8_71;
+ bra.uni BB8_65;
-BB8_69:
- setp.eq.f64 %p68, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
- bra.uni BB8_89;
+BB8_71:
+ setp.eq.f64 %p66, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+ bra.uni BB8_93;
BB8_21:
setp.eq.s32 %p8, %r6, 11;
@@ -1078,135 +1088,135 @@ BB8_21:
bra.uni BB8_22;
BB8_24:
- min.f64 %fd66, %fd52, %fd1;
- bra.uni BB8_45;
+ min.f64 %fd67, %fd52, %fd1;
+ bra.uni BB8_47;
-BB8_44:
- add.f64 %fd66, %fd1, %fd52;
- bra.uni BB8_45;
+BB8_46:
+ add.f64 %fd67, %fd1, %fd52;
+ bra.uni BB8_47;
BB8_6:
setp.eq.s32 %p21, %r6, 2;
@%p21 bra BB8_7;
- bra.uni BB8_45;
+ bra.uni BB8_47;
BB8_7:
- mul.f64 %fd66, %fd1, %fd52;
- bra.uni BB8_45;
+ mul.f64 %fd67, %fd1, %fd52;
+ bra.uni BB8_47;
BB8_27:
setp.ge.f64 %p26, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+ bra.uni BB8_47;
BB8_15:
setp.eq.s32 %p14, %r6, 8;
@%p14 bra BB8_16;
- bra.uni BB8_45;
+ bra.uni BB8_47;
BB8_16:
setp.le.f64 %p24, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+ bra.uni BB8_47;
-BB8_42:
- div.rn.f64 %fd66, %fd52, %fd1;
- bra.uni BB8_45;
+BB8_44:
+ div.rn.f64 %fd67, %fd52, %fd1;
+ bra.uni BB8_47;
BB8_10:
setp.eq.s32 %p18, %r6, 5;
@%p18 bra BB8_11;
- bra.uni BB8_45;
+ bra.uni BB8_47;
BB8_11:
setp.gt.f64 %p27, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+ bra.uni BB8_47;
-BB8_65:
- setp.eq.s32 %p53, %r6, 11;
- @%p53 bra BB8_68;
- bra.uni BB8_66;
+BB8_67:
+ setp.eq.s32 %p51, %r6, 11;
+ @%p51 bra BB8_70;
+ bra.uni BB8_68;
-BB8_68:
- min.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+BB8_70:
+ min.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
BB8_19:
setp.eq.s32 %p11, %r6, 10;
@%p11 bra BB8_20;
- bra.uni BB8_45;
+ bra.uni BB8_47;
BB8_20:
setp.neu.f64 %p22, %fd1, %fd52;
- selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
- bra.uni BB8_45;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+ bra.uni BB8_47;
BB8_22:
setp.ne.s32 %p9, %r6, 12;
- @%p9 bra BB8_45;
-
- max.f64 %fd66, %fd52, %fd1;
- bra.uni BB8_45;
+ @%p9 bra BB8_47;
-BB8_88:
- add.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+ max.f64 %fd67, %fd52, %fd1;
+ bra.uni BB8_47;
-BB8_50:
- setp.eq.s32 %p66, %r6, 2;
- @%p66 bra BB8_51;
- bra.uni BB8_89;
+BB8_92:
+ add.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
-BB8_51:
- mul.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+BB8_52:
+ setp.eq.s32 %p64, %r6, 2;
+ @%p64 bra BB8_53;
+ bra.uni BB8_93;
-BB8_71:
- setp.le.f64 %p71, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
- bra.uni BB8_89;
+BB8_53:
+ mul.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
-BB8_59:
- setp.eq.s32 %p59, %r6, 8;
- @%p59 bra BB8_60;
- bra.uni BB8_89;
+BB8_73:
+ setp.le.f64 %p69, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+ bra.uni BB8_93;
-BB8_60:
- setp.ge.f64 %p69, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
- bra.uni BB8_89;
+BB8_61:
+ setp.eq.s32 %p57, %r6, 8;
+ @%p57 bra BB8_62;
+ bra.uni BB8_93;
-BB8_86:
- div.rn.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+BB8_62:
+ setp.ge.f64 %p67, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
+ bra.uni BB8_93;
-BB8_54:
- setp.eq.s32 %p63, %r6, 5;
- @%p63 bra BB8_55;
- bra.uni BB8_89;
+BB8_90:
+ div.rn.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
-BB8_55:
- setp.lt.f64 %p72, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
- bra.uni BB8_89;
+BB8_56:
+ setp.eq.s32 %p61, %r6, 5;
+ @%p61 bra BB8_57;
+ bra.uni BB8_93;
-BB8_63:
- setp.eq.s32 %p56, %r6, 10;
- @%p56 bra BB8_64;
- bra.uni BB8_89;
+BB8_57:
+ setp.lt.f64 %p70, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+ bra.uni BB8_93;
-BB8_64:
- setp.neu.f64 %p67, %fd1, %fd52;
- selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
- bra.uni BB8_89;
+BB8_65:
+ setp.eq.s32 %p54, %r6, 10;
+ @%p54 bra BB8_66;
+ bra.uni BB8_93;
BB8_66:
- setp.ne.s32 %p54, %r6, 12;
- @%p54 bra BB8_89;
+ setp.neu.f64 %p65, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+ bra.uni BB8_93;
- max.f64 %fd74, %fd1, %fd52;
- bra.uni BB8_89;
+BB8_68:
+ setp.ne.s32 %p52, %r6, 12;
+ @%p52 bra BB8_93;
+
+ max.f64 %fd76, %fd1, %fd52;
+ bra.uni BB8_93;
BB8_31:
setp.gt.s32 %p31, %r2, -1;
@@ -1214,10 +1224,10 @@ BB8_31:
cvt.rzi.f64.f64 %fd54, %fd1;
setp.neu.f64 %p32, %fd54, %fd1;
- selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p32;
+ selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p32;
BB8_34:
- mov.f64 %fd16, %fd64;
+ mov.f64 %fd16, %fd65;
add.f64 %fd17, %fd1, %fd52;
{
.reg .b32 %temp;
@@ -1225,157 +1235,161 @@ BB8_34:
}
and.b32 %r22, %r21, 2146435072;
setp.ne.s32 %p35, %r22, 2146435072;
- mov.f64 %fd63, %fd16;
- @%p35 bra BB8_41;
+ mov.f64 %fd64, %fd16;
+ @%p35 bra BB8_43;
setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000;
- mov.f64 %fd63, %fd17;
- @%p36 bra BB8_41;
+ mov.f64 %fd64, %fd17;
+ @%p36 bra BB8_43;
abs.f64 %fd55, %fd1;
setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000;
- mov.f64 %fd62, %fd17;
- mov.f64 %fd63, %fd62;
- @%p37 bra BB8_41;
+ mov.f64 %fd63, %fd17;
+ mov.f64 %fd64, %fd63;
+ @%p37 bra BB8_43;
+
+ and.b32 %r23, %r3, 2147483647;
+ setp.ne.s32 %p38, %r23, 2146435072;
+ @%p38 bra BB8_39;
{
.reg .b32 %temp;
- mov.b64 {%r23, %temp}, %fd1;
+ mov.b64 {%r24, %temp}, %fd1;
}
- and.b32 %r24, %r3, 2147483647;
- setp.eq.s32 %p38, %r24, 2146435072;
- setp.eq.s32 %p39, %r23, 0;
- and.pred %p40, %p38, %p39;
- @%p40 bra BB8_40;
- bra.uni BB8_38;
+ setp.eq.s32 %p39, %r24, 0;
+ @%p39 bra BB8_42;
-BB8_40:
- setp.gt.f64 %p44, %fd10, 0d3FF0000000000000;
- selp.b32 %r32, 2146435072, 0, %p44;
- xor.b32 %r33, %r32, 2146435072;
- setp.lt.s32 %p45, %r3, 0;
- selp.b32 %r34, %r33, %r32, %p45;
- setp.eq.f64 %p46, %fd52, 0dBFF0000000000000;
- selp.b32 %r35, 1072693248, %r34, %p46;
- mov.u32 %r36, 0;
- mov.b64 %fd63, {%r36, %r35};
- bra.uni BB8_41;
+BB8_39:
+ and.b32 %r25, %r2, 2147483647;
+ setp.ne.s32 %p40, %r25, 2146435072;
+ mov.f64 %fd61, %fd16;
+ mov.f64 %fd64, %fd61;
+ @%p40 bra BB8_43;
-BB8_75:
- setp.gt.s32 %p76, %r4, -1;
- @%p76 bra BB8_78;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r26, %temp}, %fd52;
+ }
+ setp.ne.s32 %p41, %r26, 0;
+ mov.f64 %fd64, %fd16;
+ @%p41 bra BB8_43;
+
+ shr.s32 %r27, %r3, 31;
+ and.b32 %r28, %r27, -2146435072;
+ add.s32 %r29, %r28, 2146435072;
+ or.b32 %r30, %r29, -2147483648;
+ selp.b32 %r31, %r30, %r29, %p1;
+ mov.u32 %r32, 0;
+ mov.b64 %fd64, {%r32, %r31};
+ bra.uni BB8_43;
+
+BB8_77:
+ setp.gt.s32 %p74, %r4, -1;
+ @%p74 bra BB8_80;
cvt.rzi.f64.f64 %fd57, %fd52;
- setp.neu.f64 %p77, %fd57, %fd52;
- selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p77;
+ setp.neu.f64 %p75, %fd57, %fd52;
+ selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p75;
-BB8_78:
- mov.f64 %fd41, %fd72;
+BB8_80:
+ mov.f64 %fd41, %fd74;
add.f64 %fd42, %fd1, %fd52;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r46}, %fd42;
+ mov.b64 {%temp, %r47}, %fd42;
}
- and.b32 %r47, %r46, 2146435072;
- setp.ne.s32 %p80, %r47, 2146435072;
- mov.f64 %fd71, %fd41;
- @%p80 bra BB8_85;
+ and.b32 %r48, %r47, 2146435072;
+ setp.ne.s32 %p78, %r48, 2146435072;
+ mov.f64 %fd73, %fd41;
+ @%p78 bra BB8_89;
- setp.gtu.f64 %p81, %fd35, 0d7FF0000000000000;
- mov.f64 %fd71, %fd42;
- @%p81 bra BB8_85;
+ setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000;
+ mov.f64 %fd73, %fd42;
+ @%p79 bra BB8_89;
abs.f64 %fd58, %fd52;
- setp.gtu.f64 %p82, %fd58, 0d7FF0000000000000;
- mov.f64 %fd70, %fd42;
- mov.f64 %fd71, %fd70;
- @%p82 bra BB8_85;
+ setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000;
+ mov.f64 %fd72, %fd42;
+ mov.f64 %fd73, %fd72;
+ @%p80 bra BB8_89;
+
+ and.b32 %r49, %r5, 2147483647;
+ setp.ne.s32 %p81, %r49, 2146435072;
+ @%p81 bra BB8_85;
{
.reg .b32 %temp;
- mov.b64 {%r48, %temp}, %fd52;
+ mov.b64 {%r50, %temp}, %fd52;
}
- and.b32 %r49, %r5, 2147483647;
- setp.eq.s32 %p83, %r49, 2146435072;
- setp.eq.s32 %p84, %r48, 0;
- and.pred %p85, %p83, %p84;
- @%p85 bra BB8_84;
- bra.uni BB8_82;
-
-BB8_84:
- setp.gt.f64 %p89, %fd35, 0d3FF0000000000000;
- selp.b32 %r57, 2146435072, 0, %p89;
- xor.b32 %r58, %r57, 2146435072;
- setp.lt.s32 %p90, %r5, 0;
- selp.b32 %r59, %r58, %r57, %p90;
- setp.eq.f64 %p91, %fd1, 0dBFF0000000000000;
- selp.b32 %r60, 1072693248, %r59, %p91;
- mov.u32 %r61, 0;
- mov.b64 %fd71, {%r61, %r60};
- bra.uni BB8_85;
-
-BB8_38:
- {
- .reg .b32 %temp;
- mov.b64 {%r25, %temp}, %fd52;
- }
- and.b32 %r26, %r2, 2147483647;
- setp.eq.s32 %p41, %r26, 2146435072;
- setp.eq.s32 %p42, %r25, 0;
- and.pred %p43, %p41, %p42;
- mov.f64 %fd63, %fd16;
- @!%p43 bra BB8_41;
- bra.uni BB8_39;
-
-BB8_39:
- shr.s32 %r27, %r3, 31;
- and.b32 %r28, %r27, -2146435072;
- selp.b32 %r29, -1048576, 2146435072, %p1;
- add.s32 %r30, %r29, %r28;
- mov.u32 %r31, 0;
- mov.b64 %fd63, {%r31, %r30};
+ setp.eq.s32 %p82, %r50, 0;
+ @%p82 bra BB8_88;
-BB8_41:
- setp.eq.f64 %p47, %fd1, 0d0000000000000000;
- setp.eq.f64 %p48, %fd52, 0d3FF0000000000000;
- or.pred %p49, %p48, %p47;
- selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p49;
-
-BB8_45:
- st.global.f64 [%rd1], %fd66;
- bra.uni BB8_90;
+BB8_85:
+ and.b32 %r51, %r4, 2147483647;
+ setp.ne.s32 %p83, %r51, 2146435072;
+ mov.f64 %fd70, %fd41;
+ mov.f64 %fd73, %fd70;
+ @%p83 bra BB8_89;
-BB8_82:
{
.reg .b32 %temp;
- mov.b64 {%r50, %temp}, %fd1;
+ mov.b64 {%r52, %temp}, %fd1;
}
- and.b32 %r51, %r4, 2147483647;
- setp.eq.s32 %p86, %r51, 2146435072;
- setp.eq.s32 %p87, %r50, 0;
- and.pred %p88, %p86, %p87;
- mov.f64 %fd71, %fd41;
- @!%p88 bra BB8_85;
- bra.uni BB8_83;
-
-BB8_83:
- shr.s32 %r52, %r5, 31;
- and.b32 %r53, %r52, -2146435072;
- selp.b32 %r54, -1048576, 2146435072, %p2;
- add.s32 %r55, %r54, %r53;
- mov.u32 %r56, 0;
- mov.b64 %fd71, {%r56, %r55};
+ setp.ne.s32 %p84, %r52, 0;
+ mov.f64 %fd73, %fd41;
+ @%p84 bra BB8_89;
+
+ shr.s32 %r53, %r5, 31;
+ and.b32 %r54, %r53, -2146435072;
+ add.s32 %r55, %r54, 2146435072;
+ or.b32 %r56, %r55, -2147483648;
+ selp.b32 %r57, %r56, %r55, %p2;
+ mov.u32 %r58, 0;
+ mov.b64 %fd73, {%r58, %r57};
+ bra.uni BB8_89;
-BB8_85:
- setp.eq.f64 %p92, %fd52, 0d0000000000000000;
- setp.eq.f64 %p93, %fd1, 0d3FF0000000000000;
- or.pred %p94, %p93, %p92;
- selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p94;
+BB8_42:
+ setp.gt.f64 %p42, %fd10, 0d3FF0000000000000;
+ selp.b32 %r33, 2146435072, 0, %p42;
+ xor.b32 %r34, %r33, 2146435072;
+ setp.lt.s32 %p43, %r3, 0;
+ selp.b32 %r35, %r34, %r33, %p43;
+ setp.eq.f64 %p44, %fd52, 0dBFF0000000000000;
+ selp.b32 %r36, 1072693248, %r35, %p44;
+ mov.u32 %r37, 0;
+ mov.b64 %fd64, {%r37, %r36};
+
+BB8_43:
+ setp.eq.f64 %p45, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p46, %fd52, 0d3FF0000000000000;
+ or.pred %p47, %p46, %p45;
+ selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB8_47:
+ st.global.f64 [%rd1], %fd67;
+ bra.uni BB8_94;
+
+BB8_88:
+ setp.gt.f64 %p85, %fd35, 0d3FF0000000000000;
+ selp.b32 %r59, 2146435072, 0, %p85;
+ xor.b32 %r60, %r59, 2146435072;
+ setp.lt.s32 %p86, %r5, 0;
+ selp.b32 %r61, %r60, %r59, %p86;
+ setp.eq.f64 %p87, %fd1, 0dBFF0000000000000;
+ selp.b32 %r62, 1072693248, %r61, %p87;
+ mov.u32 %r63, 0;
+ mov.b64 %fd73, {%r63, %r62};
BB8_89:
- st.global.f64 [%rd1], %fd74;
+ setp.eq.f64 %p88, %fd52, 0d0000000000000000;
+ setp.eq.f64 %p89, %fd1, 0d3FF0000000000000;
+ or.pred %p90, %p89, %p88;
+ selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p90;
-BB8_90:
+BB8_93:
+ st.global.f64 [%rd1], %fd76;
+
+BB8_94:
bar.sync 0;
ret;
}
@@ -2928,7 +2942,7 @@ BB19_35:
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<76>;
- .reg .b64 %rd<42>;
+ .reg .b64 %rd<43>;
ld.param.u64 %rd1, [reduce_row_mean_param_0];
@@ -3095,12 +3109,13 @@ BB20_33:
@%p19 bra BB20_35;
ld.shared.f64 %fd40, [sdata];
- cvt.rn.f64.s32 %fd41, %r4;
+ cvt.u64.u32 %rd39, %r4;
+ cvt.rn.f64.s64 %fd41, %rd39;
div.rn.f64 %fd42, %fd40, %fd41;
- cvta.to.global.u64 %rd39, %rd2;
- mul.wide.u32 %rd40, %r6, 8;
- add.s64 %rd41, %rd39, %rd40;
- st.global.f64 [%rd41], %fd42;
+ cvta.to.global.u64 %rd40, %rd2;
+ mul.wide.u32 %rd41, %r6, 8;
+ add.s64 %rd42, %rd40, %rd41;
+ st.global.f64 [%rd42], %fd42;
BB20_35:
ret;
@@ -3117,7 +3132,7 @@ BB20_35:
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<12>;
- .reg .b64 %rd<9>;
+ .reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_param_0];
@@ -3154,11 +3169,12 @@ BB21_3:
BB21_4:
cvta.to.global.u64 %rd6, %rd3;
- cvt.rn.f64.s32 %fd7, %r5;
+ cvt.u64.u32 %rd7, %r5;
+ cvt.rn.f64.s64 %fd7, %rd7;
div.rn.f64 %fd8, %fd10, %fd7;
- mul.wide.u32 %rd7, %r1, 8;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f64 [%rd8], %fd8;
+ mul.wide.u32 %rd8, %r1, 8;
+ add.s64 %rd9, %rd6, %rd8;
+ st.global.f64 [%rd9], %fd8;
BB21_5:
ret;
@@ -3277,82 +3293,1638 @@ BB22_5:
ret;
}
-.func (.param .b64 func_retval0) __internal_accurate_pow(
- .param .b64 __internal_accurate_pow_param_0,
- .param .b64 __internal_accurate_pow_param_1
+ // .globl matrix_sqrt
+.visible .entry matrix_sqrt(
+ .param .u64 matrix_sqrt_param_0,
+ .param .u64 matrix_sqrt_param_1,
+ .param .u32 matrix_sqrt_param_2
)
{
- .reg .pred %p<10>;
- .reg .f32 %f<3>;
- .reg .b32 %r<52>;
- .reg .f64 %fd<134>;
+ .reg .pred %p<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<8>;
- ld.param.f64 %fd12, [__internal_accurate_pow_param_0];
- ld.param.f64 %fd13, [__internal_accurate_pow_param_1];
+ ld.param.u64 %rd1, [matrix_sqrt_param_0];
+ ld.param.u64 %rd2, [matrix_sqrt_param_1];
+ ld.param.u32 %r2, [matrix_sqrt_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB23_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ sqrt.rn.f64 %fd2, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd2;
+
+BB23_2:
+ ret;
+}
+
+ // .globl matrix_round
+.visible .entry matrix_round(
+ .param .u64 matrix_round_param_0,
+ .param .u64 matrix_round_param_1,
+ .param .u32 matrix_round_param_2
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<10>;
+ .reg .b64 %rd<11>;
+
+
+ ld.param.u64 %rd2, [matrix_round_param_0];
+ ld.param.u64 %rd3, [matrix_round_param_1];
+ ld.param.u32 %r2, [matrix_round_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB24_4;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd9, [%rd6];
+ abs.f64 %fd2, %fd9;
+ setp.ge.f64 %p2, %fd2, 0d4330000000000000;
+ @%p2 bra BB24_3;
+
+ add.f64 %fd5, %fd2, 0d3FE0000000000000;
+ cvt.rzi.f64.f64 %fd6, %fd5;
+ setp.lt.f64 %p3, %fd2, 0d3FE0000000000000;
+ selp.f64 %fd7, 0d0000000000000000, %fd6, %p3;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r49}, %fd12;
+ mov.b64 {%r6, %temp}, %fd7;
}
{
.reg .b32 %temp;
- mov.b64 {%r48, %temp}, %fd12;
+ mov.b64 {%temp, %r7}, %fd7;
}
- shr.u32 %r50, %r49, 20;
- setp.ne.s32 %p1, %r50, 0;
- @%p1 bra BB23_2;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd9;
+ }
+ and.b32 %r9, %r8, -2147483648;
+ or.b32 %r10, %r7, %r9;
+ mov.b64 %fd9, {%r6, %r10};
- mul.f64 %fd14, %fd12, 0d4350000000000000;
+BB24_3:
+ cvta.to.global.u64 %rd7, %rd3;
+ cvt.rzi.s64.f64 %rd8, %fd9;
+ cvt.rn.f64.s64 %fd8, %rd8;
+ shl.b64 %rd9, %rd1, 3;
+ add.s64 %rd10, %rd7, %rd9;
+ st.global.f64 [%rd10], %fd8;
+
+BB24_4:
+ ret;
+}
+
+ // .globl matrix_abs
+.visible .entry matrix_abs(
+ .param .u64 matrix_abs_param_0,
+ .param .u64 matrix_abs_param_1,
+ .param .u32 matrix_abs_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [matrix_abs_param_0];
+ ld.param.u64 %rd2, [matrix_abs_param_1];
+ ld.param.u32 %r2, [matrix_abs_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB25_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ abs.f64 %fd2, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd2;
+
+BB25_2:
+ ret;
+}
+
+ // .globl matrix_log
+.visible .entry matrix_log(
+ .param .u64 matrix_log_param_0,
+ .param .u64 matrix_log_param_1,
+ .param .u32 matrix_log_param_2
+)
+{
+ .reg .pred %p<6>;
+ .reg .f32 %f<2>;
+ .reg .b32 %r<33>;
+ .reg .f64 %fd<59>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_log_param_0];
+ ld.param.u64 %rd3, [matrix_log_param_1];
+ ld.param.u32 %r12, [matrix_log_param_2];
+ mov.u32 %r13, %ctaid.x;
+ mov.u32 %r14, %ntid.x;
+ mov.u32 %r15, %tid.x;
+ mad.lo.s32 %r1, %r14, %r13, %r15;
+ setp.ge.u32 %p1, %r1, %r12;
+ @%p1 bra BB26_9;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd56, [%rd6];
{
.reg .b32 %temp;
- mov.b64 {%temp, %r49}, %fd14;
+ mov.b64 {%temp, %r29}, %fd56;
}
{
.reg .b32 %temp;
- mov.b64 {%r48, %temp}, %fd14;
+ mov.b64 {%r30, %temp}, %fd56;
}
- shr.u32 %r16, %r49, 20;
- add.s32 %r50, %r16, -54;
+ mov.u32 %r31, -1023;
+ setp.gt.s32 %p2, %r29, 1048575;
+ @%p2 bra BB26_3;
-BB23_2:
- add.s32 %r51, %r50, -1023;
- and.b32 %r17, %r49, -2146435073;
- or.b32 %r18, %r17, 1072693248;
- mov.b64 %fd132, {%r48, %r18};
- setp.lt.u32 %p2, %r18, 1073127583;
- @%p2 bra BB23_4;
+ mul.f64 %fd56, %fd56, 0d4350000000000000;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r29}, %fd56;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r30, %temp}, %fd56;
+ }
+ mov.u32 %r31, -1077;
+
+BB26_3:
+ add.s32 %r18, %r29, -1;
+ setp.lt.u32 %p3, %r18, 2146435071;
+ @%p3 bra BB26_5;
+ bra.uni BB26_4;
+
+BB26_5:
+ shr.u32 %r20, %r29, 20;
+ add.s32 %r32, %r31, %r20;
+ and.b32 %r21, %r29, -2146435073;
+ or.b32 %r22, %r21, 1072693248;
+ mov.b64 %fd57, {%r30, %r22};
+ setp.lt.s32 %p5, %r22, 1073127583;
+ @%p5 bra BB26_7;
{
.reg .b32 %temp;
- mov.b64 {%r19, %temp}, %fd132;
+ mov.b64 {%r23, %temp}, %fd57;
}
{
.reg .b32 %temp;
- mov.b64 {%temp, %r20}, %fd132;
+ mov.b64 {%temp, %r24}, %fd57;
}
- add.s32 %r21, %r20, -1048576;
- mov.b64 %fd132, {%r19, %r21};
- add.s32 %r51, %r50, -1022;
+ add.s32 %r25, %r24, -1048576;
+ mov.b64 %fd57, {%r23, %r25};
+ add.s32 %r32, %r32, 1;
-BB23_4:
- add.f64 %fd16, %fd132, 0d3FF0000000000000;
+BB26_7:
+ add.f64 %fd13, %fd57, 0d3FF0000000000000;
// inline asm
- rcp.approx.ftz.f64 %fd15,%fd16;
+ rcp.approx.ftz.f64 %fd12,%fd13;
// inline asm
- neg.f64 %fd17, %fd16;
- mov.f64 %fd18, 0d3FF0000000000000;
- fma.rn.f64 %fd19, %fd17, %fd15, %fd18;
- fma.rn.f64 %fd20, %fd19, %fd19, %fd19;
- fma.rn.f64 %fd21, %fd20, %fd15, %fd15;
- add.f64 %fd22, %fd132, 0dBFF0000000000000;
- mul.f64 %fd23, %fd22, %fd21;
- fma.rn.f64 %fd24, %fd22, %fd21, %fd23;
- mul.f64 %fd25, %fd24, %fd24;
- mov.f64 %fd26, 0d3ED0F5D241AD3B5A;
- mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2;
- fma.rn.f64 %fd28, %fd27, %fd25, %fd26;
- mov.f64 %fd29, 0d3EF3B20A75488A3F;
- fma.rn.f64 %fd30, %fd28, %fd25, %fd29;
+ neg.f64 %fd14, %fd13;
+ mov.f64 %fd15, 0d3FF0000000000000;
+ fma.rn.f64 %fd16, %fd14, %fd12, %fd15;
+ fma.rn.f64 %fd17, %fd16, %fd16, %fd16;
+ fma.rn.f64 %fd18, %fd17, %fd12, %fd12;
+ add.f64 %fd19, %fd57, 0dBFF0000000000000;
+ mul.f64 %fd20, %fd19, %fd18;
+ fma.rn.f64 %fd21, %fd19, %fd18, %fd20;
+ mul.f64 %fd22, %fd21, %fd21;
+ mov.f64 %fd23, 0d3ED0EE258B7A8B04;
+ mov.f64 %fd24, 0d3EB1380B3AE80F1E;
+ fma.rn.f64 %fd25, %fd24, %fd22, %fd23;
+ mov.f64 %fd26, 0d3EF3B2669F02676F;
+ fma.rn.f64 %fd27, %fd25, %fd22, %fd26;
+ mov.f64 %fd28, 0d3F1745CBA9AB0956;
+ fma.rn.f64 %fd29, %fd27, %fd22, %fd28;
+ mov.f64 %fd30, 0d3F3C71C72D1B5154;
+ fma.rn.f64 %fd31, %fd29, %fd22, %fd30;
+ mov.f64 %fd32, 0d3F624924923BE72D;
+ fma.rn.f64 %fd33, %fd31, %fd22, %fd32;
+ mov.f64 %fd34, 0d3F8999999999A3C4;
+ fma.rn.f64 %fd35, %fd33, %fd22, %fd34;
+ mov.f64 %fd36, 0d3FB5555555555554;
+ fma.rn.f64 %fd37, %fd35, %fd22, %fd36;
+ sub.f64 %fd38, %fd19, %fd21;
+ add.f64 %fd39, %fd38, %fd38;
+ neg.f64 %fd40, %fd21;
+ fma.rn.f64 %fd41, %fd40, %fd19, %fd39;
+ mul.f64 %fd42, %fd18, %fd41;
+ mul.f64 %fd43, %fd22, %fd37;
+ fma.rn.f64 %fd44, %fd43, %fd21, %fd42;
+ xor.b32 %r26, %r32, -2147483648;
+ mov.u32 %r27, 1127219200;
+ mov.b64 %fd45, {%r26, %r27};
+ mov.u32 %r28, -2147483648;
+ mov.b64 %fd46, {%r28, %r27};
+ sub.f64 %fd47, %fd45, %fd46;
+ mov.f64 %fd48, 0d3FE62E42FEFA39EF;
+ fma.rn.f64 %fd49, %fd47, %fd48, %fd21;
+ neg.f64 %fd50, %fd47;
+ fma.rn.f64 %fd51, %fd50, %fd48, %fd49;
+ sub.f64 %fd52, %fd51, %fd21;
+ sub.f64 %fd53, %fd44, %fd52;
+ mov.f64 %fd54, 0d3C7ABC9E3B39803F;
+ fma.rn.f64 %fd55, %fd47, %fd54, %fd53;
+ add.f64 %fd58, %fd49, %fd55;
+ bra.uni BB26_8;
+
+BB26_4:
+ mov.f64 %fd10, 0d7FF0000000000000;
+ fma.rn.f64 %fd11, %fd56, %fd10, %fd10;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r19}, %fd56;
+ }
+ mov.b32 %f1, %r19;
+ setp.eq.f32 %p4, %f1, 0f00000000;
+ selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4;
+
+BB26_8:
+ cvta.to.global.u64 %rd7, %rd3;
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd58;
+
+BB26_9:
+ ret;
+}
+
+ // .globl matrix_floor
+.visible .entry matrix_floor(
+ .param .u64 matrix_floor_param_0,
+ .param .u64 matrix_floor_param_1,
+ .param .u32 matrix_floor_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [matrix_floor_param_0];
+ ld.param.u64 %rd2, [matrix_floor_param_1];
+ ld.param.u32 %r2, [matrix_floor_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB27_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ cvt.rmi.f64.f64 %fd2, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd2;
+
+BB27_2:
+ ret;
+}
+
+ // .globl matrix_ceil
+.visible .entry matrix_ceil(
+ .param .u64 matrix_ceil_param_0,
+ .param .u64 matrix_ceil_param_1,
+ .param .u32 matrix_ceil_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [matrix_ceil_param_0];
+ ld.param.u64 %rd2, [matrix_ceil_param_1];
+ ld.param.u32 %r2, [matrix_ceil_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB28_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ cvt.rpi.f64.f64 %fd2, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd2;
+
+BB28_2:
+ ret;
+}
+
+ // .globl matrix_sin
+.visible .entry matrix_sin(
+ .param .u64 matrix_sin_param_0,
+ .param .u64 matrix_sin_param_1,
+ .param .u32 matrix_sin_param_2
+)
+{
+ .local .align 4 .b8 __local_depot29[4];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<7>;
+ .reg .b32 %r<18>;
+ .reg .f64 %fd<41>;
+ .reg .b64 %rd<17>;
+
+
+ mov.u64 %rd16, __local_depot29;
+ cvta.local.u64 %SP, %rd16;
+ ld.param.u64 %rd3, [matrix_sin_param_0];
+ ld.param.u64 %rd4, [matrix_sin_param_1];
+ ld.param.u32 %r5, [matrix_sin_param_2];
+ add.u64 %rd5, %SP, 0;
+ cvta.to.local.u64 %rd1, %rd5;
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %ctaid.x;
+ mov.u32 %r8, %tid.x;
+ mad.lo.s32 %r1, %r6, %r7, %r8;
+ setp.ge.u32 %p1, %r1, %r5;
+ @%p1 bra BB29_11;
+
+ cvta.to.global.u64 %rd6, %rd3;
+ cvt.s64.s32 %rd2, %r1;
+ mul.wide.s32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ ld.global.f64 %fd38, [%rd8];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r9}, %fd38;
+ }
+ and.b32 %r10, %r9, 2147483647;
+ setp.ne.s32 %p2, %r10, 2146435072;
+ @%p2 bra BB29_4;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r11, %temp}, %fd38;
+ }
+ setp.ne.s32 %p3, %r11, 0;
+ @%p3 bra BB29_4;
+
+ mov.f64 %fd14, 0d0000000000000000;
+ mul.rn.f64 %fd38, %fd38, %fd14;
+
+BB29_4:
+ mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
+ cvt.rni.s32.f64 %r17, %fd15;
+ st.local.u32 [%rd1], %r17;
+ cvt.rn.f64.s32 %fd16, %r17;
+ neg.f64 %fd17, %fd16;
+ mov.f64 %fd18, 0d3FF921FB54442D18;
+ fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
+ mov.f64 %fd20, 0d3C91A62633145C00;
+ fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
+ mov.f64 %fd22, 0d397B839A252049C0;
+ fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r12}, %fd38;
+ }
+ and.b32 %r13, %r12, 2145386496;
+ setp.lt.u32 %p4, %r13, 1105199104;
+ @%p4 bra BB29_6;
+
+ // Callseq Start 3
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.f64 [param0+0], %fd38;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd5;
+ .param .b64 retval0;
+ call.uni (retval0),
+ __internal_trig_reduction_slowpathd,
+ (
+ param0,
+ param1
+ );
+ ld.param.f64 %fd39, [retval0+0];
+
+ //{
+ }// Callseq End 3
+ ld.local.u32 %r17, [%rd1];
+
+BB29_6:
+ and.b32 %r14, %r17, 1;
+ shl.b32 %r15, %r14, 3;
+ setp.eq.s32 %p5, %r14, 0;
+ selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+ mul.wide.u32 %rd10, %r15, 8;
+ mov.u64 %rd11, __cudart_sin_cos_coeffs;
+ add.s64 %rd12, %rd10, %rd11;
+ ld.const.f64 %fd24, [%rd12+8];
+ mul.rn.f64 %fd7, %fd39, %fd39;
+ fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
+ ld.const.f64 %fd26, [%rd12+16];
+ fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
+ ld.const.f64 %fd28, [%rd12+24];
+ fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
+ ld.const.f64 %fd30, [%rd12+32];
+ fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
+ ld.const.f64 %fd32, [%rd12+40];
+ fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
+ ld.const.f64 %fd34, [%rd12+48];
+ fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
+ fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
+ @%p5 bra BB29_8;
+
+ mov.f64 %fd35, 0d3FF0000000000000;
+ fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
+
+BB29_8:
+ and.b32 %r16, %r17, 2;
+ setp.eq.s32 %p6, %r16, 0;
+ @%p6 bra BB29_10;
+
+ mov.f64 %fd36, 0d0000000000000000;
+ mov.f64 %fd37, 0dBFF0000000000000;
+ fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
+
+BB29_10:
+ cvta.to.global.u64 %rd13, %rd4;
+ shl.b64 %rd14, %rd2, 3;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd40;
+
+BB29_11:
+ ret;
+}
+
+ // .globl matrix_cos
+.visible .entry matrix_cos(
+ .param .u64 matrix_cos_param_0,
+ .param .u64 matrix_cos_param_1,
+ .param .u32 matrix_cos_param_2
+)
+{
+ .local .align 4 .b8 __local_depot30[4];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<7>;
+ .reg .b32 %r<19>;
+ .reg .f64 %fd<41>;
+ .reg .b64 %rd<17>;
+
+
+ mov.u64 %rd16, __local_depot30;
+ cvta.local.u64 %SP, %rd16;
+ ld.param.u64 %rd3, [matrix_cos_param_0];
+ ld.param.u64 %rd4, [matrix_cos_param_1];
+ ld.param.u32 %r6, [matrix_cos_param_2];
+ add.u64 %rd5, %SP, 0;
+ cvta.to.local.u64 %rd1, %rd5;
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB30_11;
+
+ cvta.to.global.u64 %rd6, %rd3;
+ cvt.s64.s32 %rd2, %r1;
+ mul.wide.s32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ ld.global.f64 %fd38, [%rd8];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r10}, %fd38;
+ }
+ and.b32 %r11, %r10, 2147483647;
+ setp.ne.s32 %p2, %r11, 2146435072;
+ @%p2 bra BB30_4;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r12, %temp}, %fd38;
+ }
+ setp.ne.s32 %p3, %r12, 0;
+ @%p3 bra BB30_4;
+
+ mov.f64 %fd14, 0d0000000000000000;
+ mul.rn.f64 %fd38, %fd38, %fd14;
+
+BB30_4:
+ mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
+ cvt.rni.s32.f64 %r18, %fd15;
+ st.local.u32 [%rd1], %r18;
+ cvt.rn.f64.s32 %fd16, %r18;
+ neg.f64 %fd17, %fd16;
+ mov.f64 %fd18, 0d3FF921FB54442D18;
+ fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
+ mov.f64 %fd20, 0d3C91A62633145C00;
+ fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
+ mov.f64 %fd22, 0d397B839A252049C0;
+ fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r13}, %fd38;
+ }
+ and.b32 %r14, %r13, 2145386496;
+ setp.lt.u32 %p4, %r14, 1105199104;
+ @%p4 bra BB30_6;
+
+ // Callseq Start 4
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.f64 [param0+0], %fd38;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd5;
+ .param .b64 retval0;
+ call.uni (retval0),
+ __internal_trig_reduction_slowpathd,
+ (
+ param0,
+ param1
+ );
+ ld.param.f64 %fd39, [retval0+0];
+
+ //{
+ }// Callseq End 4
+ ld.local.u32 %r18, [%rd1];
+
+BB30_6:
+ add.s32 %r5, %r18, 1;
+ and.b32 %r15, %r5, 1;
+ shl.b32 %r16, %r15, 3;
+ setp.eq.s32 %p5, %r15, 0;
+ selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+ mul.wide.u32 %rd10, %r16, 8;
+ mov.u64 %rd11, __cudart_sin_cos_coeffs;
+ add.s64 %rd12, %rd10, %rd11;
+ ld.const.f64 %fd24, [%rd12+8];
+ mul.rn.f64 %fd7, %fd39, %fd39;
+ fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
+ ld.const.f64 %fd26, [%rd12+16];
+ fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
+ ld.const.f64 %fd28, [%rd12+24];
+ fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
+ ld.const.f64 %fd30, [%rd12+32];
+ fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
+ ld.const.f64 %fd32, [%rd12+40];
+ fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
+ ld.const.f64 %fd34, [%rd12+48];
+ fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
+ fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
+ @%p5 bra BB30_8;
+
+ mov.f64 %fd35, 0d3FF0000000000000;
+ fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
+
+BB30_8:
+ and.b32 %r17, %r5, 2;
+ setp.eq.s32 %p6, %r17, 0;
+ @%p6 bra BB30_10;
+
+ mov.f64 %fd36, 0d0000000000000000;
+ mov.f64 %fd37, 0dBFF0000000000000;
+ fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
+
+BB30_10:
+ cvta.to.global.u64 %rd13, %rd4;
+ shl.b64 %rd14, %rd2, 3;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd40;
+
+BB30_11:
+ ret;
+}
+
+ // .globl matrix_tan
+.visible .entry matrix_tan(
+ .param .u64 matrix_tan_param_0,
+ .param .u64 matrix_tan_param_1,
+ .param .u32 matrix_tan_param_2
+)
+{
+ .local .align 4 .b8 __local_depot31[4];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<6>;
+ .reg .b32 %r<16>;
+ .reg .f64 %fd<66>;
+ .reg .b64 %rd<14>;
+
+
+ mov.u64 %rd13, __local_depot31;
+ cvta.local.u64 %SP, %rd13;
+ ld.param.u64 %rd3, [matrix_tan_param_0];
+ ld.param.u64 %rd4, [matrix_tan_param_1];
+ ld.param.u32 %r5, [matrix_tan_param_2];
+ add.u64 %rd5, %SP, 0;
+ cvta.to.local.u64 %rd1, %rd5;
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %ctaid.x;
+ mov.u32 %r8, %tid.x;
+ mad.lo.s32 %r1, %r6, %r7, %r8;
+ setp.ge.u32 %p1, %r1, %r5;
+ @%p1 bra BB31_9;
+
+ cvta.to.global.u64 %rd6, %rd3;
+ cvt.s64.s32 %rd2, %r1;
+ mul.wide.s32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ ld.global.f64 %fd63, [%rd8];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r9}, %fd63;
+ }
+ and.b32 %r10, %r9, 2147483647;
+ setp.ne.s32 %p2, %r10, 2146435072;
+ @%p2 bra BB31_4;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r11, %temp}, %fd63;
+ }
+ setp.ne.s32 %p3, %r11, 0;
+ @%p3 bra BB31_4;
+
+ mov.f64 %fd11, 0d0000000000000000;
+ mul.rn.f64 %fd63, %fd63, %fd11;
+
+BB31_4:
+ mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883;
+ cvt.rni.s32.f64 %r15, %fd12;
+ st.local.u32 [%rd1], %r15;
+ cvt.rn.f64.s32 %fd13, %r15;
+ neg.f64 %fd14, %fd13;
+ mov.f64 %fd15, 0d3FF921FB54442D18;
+ fma.rn.f64 %fd16, %fd14, %fd15, %fd63;
+ mov.f64 %fd17, 0d3C91A62633145C00;
+ fma.rn.f64 %fd18, %fd14, %fd17, %fd16;
+ mov.f64 %fd19, 0d397B839A252049C0;
+ fma.rn.f64 %fd64, %fd14, %fd19, %fd18;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r12}, %fd63;
+ }
+ and.b32 %r13, %r12, 2145386496;
+ setp.lt.u32 %p4, %r13, 1105199104;
+ @%p4 bra BB31_6;
+
+ // Callseq Start 5
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.f64 [param0+0], %fd63;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd5;
+ .param .b64 retval0;
+ call.uni (retval0),
+ __internal_trig_reduction_slowpathd,
+ (
+ param0,
+ param1
+ );
+ ld.param.f64 %fd64, [retval0+0];
+
+ //{
+ }// Callseq End 5
+ ld.local.u32 %r15, [%rd1];
+
+BB31_6:
+ mul.f64 %fd20, %fd64, %fd64;
+ mov.f64 %fd21, 0dBEF9757C5B27EBB1;
+ mov.f64 %fd22, 0d3EE48DAC2799BCB9;
+ fma.rn.f64 %fd23, %fd22, %fd20, %fd21;
+ mov.f64 %fd24, 0d3F0980E90FD91E04;
+ fma.rn.f64 %fd25, %fd23, %fd20, %fd24;
+ mov.f64 %fd26, 0dBEFAE2B0417D7E1D;
+ fma.rn.f64 %fd27, %fd25, %fd20, %fd26;
+ mov.f64 %fd28, 0d3F119F5341BFBA57;
+ fma.rn.f64 %fd29, %fd27, %fd20, %fd28;
+ mov.f64 %fd30, 0d3F15E791A00F6919;
+ fma.rn.f64 %fd31, %fd29, %fd20, %fd30;
+ mov.f64 %fd32, 0d3F2FF2E7FADEC73A;
+ fma.rn.f64 %fd33, %fd31, %fd20, %fd32;
+ mov.f64 %fd34, 0d3F434BC1B206DA62;
+ fma.rn.f64 %fd35, %fd33, %fd20, %fd34;
+ mov.f64 %fd36, 0d3F57DB18EF2F83F9;
+ fma.rn.f64 %fd37, %fd35, %fd20, %fd36;
+ mov.f64 %fd38, 0d3F6D6D2E7AE49FBC;
+ fma.rn.f64 %fd39, %fd37, %fd20, %fd38;
+ mov.f64 %fd40, 0d3F8226E3A816A776;
+ fma.rn.f64 %fd41, %fd39, %fd20, %fd40;
+ mov.f64 %fd42, 0d3F9664F485D25660;
+ fma.rn.f64 %fd43, %fd41, %fd20, %fd42;
+ mov.f64 %fd44, 0d3FABA1BA1BABF31D;
+ fma.rn.f64 %fd45, %fd43, %fd20, %fd44;
+ mov.f64 %fd46, 0d3FC11111111105D2;
+ fma.rn.f64 %fd47, %fd45, %fd20, %fd46;
+ mov.f64 %fd48, 0d3FD555555555555E;
+ fma.rn.f64 %fd49, %fd47, %fd20, %fd48;
+ mul.f64 %fd7, %fd20, %fd49;
+ fma.rn.f64 %fd65, %fd7, %fd64, %fd64;
+ and.b32 %r14, %r15, 1;
+ setp.eq.b32 %p5, %r14, 1;
+ @!%p5 bra BB31_8;
+ bra.uni BB31_7;
+
+BB31_7:
+ sub.f64 %fd52, %fd65, %fd64;
+ neg.f64 %fd53, %fd52;
+ fma.rn.f64 %fd54, %fd7, %fd64, %fd53;
+ // inline asm
+ rcp.approx.ftz.f64 %fd50,%fd65;
+ // inline asm
+ neg.f64 %fd55, %fd65;
+ mov.f64 %fd56, 0d3FF0000000000000;
+ fma.rn.f64 %fd57, %fd55, %fd50, %fd56;
+ fma.rn.f64 %fd58, %fd57, %fd57, %fd57;
+ fma.rn.f64 %fd59, %fd58, %fd50, %fd50;
+ neg.f64 %fd60, %fd59;
+ fma.rn.f64 %fd61, %fd65, %fd60, %fd56;
+ fma.rn.f64 %fd62, %fd60, %fd54, %fd61;
+ fma.rn.f64 %fd65, %fd62, %fd60, %fd60;
+
+BB31_8:
+ cvta.to.global.u64 %rd10, %rd4;
+ shl.b64 %rd11, %rd2, 3;
+ add.s64 %rd12, %rd10, %rd11;
+ st.global.f64 [%rd12], %fd65;
+
+BB31_9:
+ ret;
+}
+
+ // .globl matrix_asin
+.visible .entry matrix_asin(
+ .param .u64 matrix_asin_param_0,
+ .param .u64 matrix_asin_param_1,
+ .param .u32 matrix_asin_param_2
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<3>;
+ .reg .b32 %r<15>;
+ .reg .f64 %fd<83>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_asin_param_0];
+ ld.param.u64 %rd3, [matrix_asin_param_1];
+ ld.param.u32 %r3, [matrix_asin_param_2];
+ mov.u32 %r4, %ctaid.x;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r1, %r5, %r4, %r6;
+ setp.ge.u32 %p1, %r1, %r3;
+ @%p1 bra BB32_5;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r2}, %fd1;
+ }
+ mov.b32 %f1, %r2;
+ abs.f32 %f2, %f1;
+ setp.lt.f32 %p2, %f2, 0f3FE26666;
+ @%p2 bra BB32_3;
+ bra.uni BB32_2;
+
+BB32_3:
+ mul.f64 %fd55, %fd1, %fd1;
+ mov.f64 %fd56, 0dBFB3823B180754AF;
+ mov.f64 %fd57, 0d3FB0066BDC1895E9;
+ fma.rn.f64 %fd58, %fd57, %fd55, %fd56;
+ mov.f64 %fd59, 0d3FB11E52CC2F79AE;
+ fma.rn.f64 %fd60, %fd58, %fd55, %fd59;
+ mov.f64 %fd61, 0dBF924EAF3526861B;
+ fma.rn.f64 %fd62, %fd60, %fd55, %fd61;
+ mov.f64 %fd63, 0d3F91DF02A31E6CB7;
+ fma.rn.f64 %fd64, %fd62, %fd55, %fd63;
+ mov.f64 %fd65, 0d3F847D18B0EEC6CC;
+ fma.rn.f64 %fd66, %fd64, %fd55, %fd65;
+ mov.f64 %fd67, 0d3F8D0AF961BA53B0;
+ fma.rn.f64 %fd68, %fd66, %fd55, %fd67;
+ mov.f64 %fd69, 0d3F91BF7734CF1C48;
+ fma.rn.f64 %fd70, %fd68, %fd55, %fd69;
+ mov.f64 %fd71, 0d3F96E91483144EF7;
+ fma.rn.f64 %fd72, %fd70, %fd55, %fd71;
+ mov.f64 %fd73, 0d3F9F1C6E0A4F9F81;
+ fma.rn.f64 %fd74, %fd72, %fd55, %fd73;
+ mov.f64 %fd75, 0d3FA6DB6DC27FA92B;
+ fma.rn.f64 %fd76, %fd74, %fd55, %fd75;
+ mov.f64 %fd77, 0d3FB333333320F91B;
+ fma.rn.f64 %fd78, %fd76, %fd55, %fd77;
+ mov.f64 %fd79, 0d3FC5555555555F4D;
+ fma.rn.f64 %fd80, %fd78, %fd55, %fd79;
+ mul.f64 %fd81, %fd55, %fd80;
+ fma.rn.f64 %fd82, %fd81, %fd1, %fd1;
+ bra.uni BB32_4;
+
+BB32_2:
+ abs.f64 %fd7, %fd1;
+ mov.f64 %fd8, 0d3FE0000000000000;
+ mov.f64 %fd9, 0dBFE0000000000000;
+ fma.rn.f64 %fd6, %fd9, %fd7, %fd8;
+ // inline asm
+ rsqrt.approx.ftz.f64 %fd5, %fd6;
+ // inline asm
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r7, %temp}, %fd5;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd5;
+ }
+ add.s32 %r9, %r8, -1048576;
+ mov.b64 %fd10, {%r7, %r9};
+ mul.f64 %fd11, %fd6, %fd5;
+ neg.f64 %fd12, %fd11;
+ fma.rn.f64 %fd13, %fd11, %fd12, %fd6;
+ fma.rn.f64 %fd14, %fd13, %fd10, %fd11;
+ neg.f64 %fd15, %fd14;
+ mov.f64 %fd16, 0d3FF0000000000000;
+ fma.rn.f64 %fd17, %fd5, %fd15, %fd16;
+ fma.rn.f64 %fd18, %fd17, %fd10, %fd10;
+ fma.rn.f64 %fd19, %fd14, %fd15, %fd6;
+ fma.rn.f64 %fd20, %fd19, %fd18, %fd14;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r10}, %fd6;
+ }
+ setp.lt.s32 %p3, %r10, 0;
+ selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3;
+ setp.equ.f64 %p4, %fd6, 0d0000000000000000;
+ selp.f64 %fd22, %fd6, %fd21, %p4;
+ mov.f64 %fd23, 0dBFB3823B180754AF;
+ mov.f64 %fd24, 0d3FB0066BDC1895E9;
+ fma.rn.f64 %fd25, %fd24, %fd6, %fd23;
+ mov.f64 %fd26, 0d3FB11E52CC2F79AE;
+ fma.rn.f64 %fd27, %fd25, %fd6, %fd26;
+ mov.f64 %fd28, 0dBF924EAF3526861B;
+ fma.rn.f64 %fd29, %fd27, %fd6, %fd28;
+ mov.f64 %fd30, 0d3F91DF02A31E6CB7;
+ fma.rn.f64 %fd31, %fd29, %fd6, %fd30;
+ mov.f64 %fd32, 0d3F847D18B0EEC6CC;
+ fma.rn.f64 %fd33, %fd31, %fd6, %fd32;
+ mov.f64 %fd34, 0d3F8D0AF961BA53B0;
+ fma.rn.f64 %fd35, %fd33, %fd6, %fd34;
+ mov.f64 %fd36, 0d3F91BF7734CF1C48;
+ fma.rn.f64 %fd37, %fd35, %fd6, %fd36;
+ mov.f64 %fd38, 0d3F96E91483144EF7;
+ fma.rn.f64 %fd39, %fd37, %fd6, %fd38;
+ mov.f64 %fd40, 0d3F9F1C6E0A4F9F81;
+ fma.rn.f64 %fd41, %fd39, %fd6, %fd40;
+ mov.f64 %fd42, 0d3FA6DB6DC27FA92B;
+ fma.rn.f64 %fd43, %fd41, %fd6, %fd42;
+ mov.f64 %fd44, 0d3FB333333320F91B;
+ fma.rn.f64 %fd45, %fd43, %fd6, %fd44;
+ mov.f64 %fd46, 0d3FC5555555555F4D;
+ fma.rn.f64 %fd47, %fd45, %fd6, %fd46;
+ mul.f64 %fd48, %fd6, %fd47;
+ mul.f64 %fd49, %fd22, 0dC000000000000000;
+ mov.f64 %fd50, 0d3C91A62633145C07;
+ fma.rn.f64 %fd51, %fd49, %fd48, %fd50;
+ add.f64 %fd52, %fd49, 0d3FE921FB54442D18;
+ add.f64 %fd53, %fd52, %fd51;
+ add.f64 %fd54, %fd53, 0d3FE921FB54442D18;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r11, %temp}, %fd54;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r12}, %fd54;
+ }
+ and.b32 %r13, %r2, -2147483648;
+ or.b32 %r14, %r12, %r13;
+ mov.b64 %fd82, {%r11, %r14};
+
+BB32_4:
+ cvta.to.global.u64 %rd7, %rd3;
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd82;
+
+BB32_5:
+ ret;
+}
+
+ // .globl matrix_acos
+.visible .entry matrix_acos(
+ .param .u64 matrix_acos_param_0,
+ .param .u64 matrix_acos_param_1,
+ .param .u32 matrix_acos_param_2
+)
+{
+ .reg .pred %p<7>;
+ .reg .b32 %r<17>;
+ .reg .f64 %fd<95>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_acos_param_0];
+ ld.param.u64 %rd3, [matrix_acos_param_1];
+ ld.param.u32 %r4, [matrix_acos_param_2];
+ mov.u32 %r5, %ctaid.x;
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %tid.x;
+ mad.lo.s32 %r1, %r6, %r5, %r7;
+ setp.ge.u32 %p1, %r1, %r4;
+ @%p1 bra BB33_14;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd16, [%rd6];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r2}, %fd16;
+ }
+ abs.f64 %fd1, %fd16;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd1;
+ }
+ setp.lt.s32 %p2, %r8, 1071801958;
+ @%p2 bra BB33_9;
+ bra.uni BB33_2;
+
+BB33_9:
+ mul.f64 %fd62, %fd1, %fd1;
+ mov.f64 %fd63, 0dBFB3823B180754AF;
+ mov.f64 %fd64, 0d3FB0066BDC1895E9;
+ fma.rn.f64 %fd65, %fd64, %fd62, %fd63;
+ mov.f64 %fd66, 0d3FB11E52CC2F79AE;
+ fma.rn.f64 %fd67, %fd65, %fd62, %fd66;
+ mov.f64 %fd68, 0dBF924EAF3526861B;
+ fma.rn.f64 %fd69, %fd67, %fd62, %fd68;
+ mov.f64 %fd70, 0d3F91DF02A31E6CB7;
+ fma.rn.f64 %fd71, %fd69, %fd62, %fd70;
+ mov.f64 %fd72, 0d3F847D18B0EEC6CC;
+ fma.rn.f64 %fd73, %fd71, %fd62, %fd72;
+ mov.f64 %fd74, 0d3F8D0AF961BA53B0;
+ fma.rn.f64 %fd75, %fd73, %fd62, %fd74;
+ mov.f64 %fd76, 0d3F91BF7734CF1C48;
+ fma.rn.f64 %fd77, %fd75, %fd62, %fd76;
+ mov.f64 %fd78, 0d3F96E91483144EF7;
+ fma.rn.f64 %fd79, %fd77, %fd62, %fd78;
+ mov.f64 %fd80, 0d3F9F1C6E0A4F9F81;
+ fma.rn.f64 %fd81, %fd79, %fd62, %fd80;
+ mov.f64 %fd82, 0d3FA6DB6DC27FA92B;
+ fma.rn.f64 %fd83, %fd81, %fd62, %fd82;
+ mov.f64 %fd84, 0d3FB333333320F91B;
+ fma.rn.f64 %fd85, %fd83, %fd62, %fd84;
+ mov.f64 %fd86, 0d3FC5555555555F4D;
+ fma.rn.f64 %fd87, %fd85, %fd62, %fd86;
+ mul.f64 %fd88, %fd62, %fd87;
+ fma.rn.f64 %fd10, %fd88, %fd1, %fd1;
+ setp.lt.s32 %p6, %r2, 0;
+ @%p6 bra BB33_11;
+
+ mov.f64 %fd89, 0dBC91A62633145C07;
+ add.rn.f64 %fd90, %fd10, %fd89;
+ neg.f64 %fd93, %fd90;
+ bra.uni BB33_12;
+
+BB33_2:
+ mov.f64 %fd19, 0d3FF0000000000000;
+ sub.f64 %fd2, %fd19, %fd1;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r9, %temp}, %fd2;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r3}, %fd2;
+ }
+ add.s32 %r10, %r3, -1048576;
+ mov.b64 %fd18, {%r9, %r10};
+ // inline asm
+ rsqrt.approx.ftz.f64 %fd17, %fd18;
+ // inline asm
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r11, %temp}, %fd17;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r12}, %fd17;
+ }
+ add.s32 %r13, %r12, -1048576;
+ mov.b64 %fd20, {%r11, %r13};
+ mul.f64 %fd21, %fd18, %fd17;
+ neg.f64 %fd22, %fd21;
+ fma.rn.f64 %fd23, %fd21, %fd22, %fd18;
+ fma.rn.f64 %fd24, %fd23, %fd20, %fd21;
+ neg.f64 %fd25, %fd24;
+ fma.rn.f64 %fd26, %fd17, %fd25, %fd19;
+ fma.rn.f64 %fd27, %fd26, %fd20, %fd20;
+ fma.rn.f64 %fd28, %fd24, %fd25, %fd18;
+ fma.rn.f64 %fd3, %fd28, %fd27, %fd24;
+ setp.lt.s32 %p3, %r3, 1;
+ @%p3 bra BB33_4;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r14, %temp}, %fd3;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r15}, %fd3;
+ }
+ add.s32 %r16, %r15, 1048576;
+ mov.b64 %fd29, {%r14, %r16};
+ mov.f64 %fd30, 0dBEBAC2FE66FAAC4B;
+ mov.f64 %fd31, 0d3EC715B371155F70;
+ fma.rn.f64 %fd32, %fd31, %fd2, %fd30;
+ mov.f64 %fd33, 0d3ED9A9B88EFCD9B8;
+ fma.rn.f64 %fd34, %fd32, %fd2, %fd33;
+ mov.f64 %fd35, 0d3EDD0F40A8A0C4C3;
+ fma.rn.f64 %fd36, %fd34, %fd2, %fd35;
+ mov.f64 %fd37, 0d3EF46D4CFA9E0E1F;
+ fma.rn.f64 %fd38, %fd36, %fd2, %fd37;
+ mov.f64 %fd39, 0d3F079C168D1E2422;
+ fma.rn.f64 %fd40, %fd38, %fd2, %fd39;
+ mov.f64 %fd41, 0d3F1C9A88C3BCA540;
+ fma.rn.f64 %fd42, %fd40, %fd2, %fd41;
+ mov.f64 %fd43, 0d3F31C4E64BD476DF;
+ fma.rn.f64 %fd44, %fd42, %fd2, %fd43;
+ mov.f64 %fd45, 0d3F46E8BA60009C8F;
+ fma.rn.f64 %fd46, %fd44, %fd2, %fd45;
+ mov.f64 %fd47, 0d3F5F1C71C62B05A2;
+ fma.rn.f64 %fd48, %fd46, %fd2, %fd47;
+ mov.f64 %fd49, 0d3F76DB6DB6DC9F2C;
+ fma.rn.f64 %fd50, %fd48, %fd2, %fd49;
+ mov.f64 %fd51, 0d3F9333333333329C;
+ fma.rn.f64 %fd52, %fd50, %fd2, %fd51;
+ mov.f64 %fd53, 0d3FB5555555555555;
+ fma.rn.f64 %fd54, %fd52, %fd2, %fd53;
+ mul.f64 %fd55, %fd2, %fd54;
+ fma.rn.f64 %fd94, %fd55, %fd29, %fd29;
+ bra.uni BB33_5;
+
+BB33_11:
+ mov.f64 %fd91, 0d3C91A62633145C07;
+ add.rn.f64 %fd93, %fd10, %fd91;
+
+BB33_12:
+ mov.f64 %fd92, 0d3FF921FB54442D18;
+ add.rn.f64 %fd94, %fd92, %fd93;
+ bra.uni BB33_13;
+
+BB33_4:
+ mov.f64 %fd56, 0d0000000000000000;
+ mul.rn.f64 %fd94, %fd1, %fd56;
+
+BB33_5:
+ setp.gt.s32 %p4, %r3, -1;
+ @%p4 bra BB33_7;
+
+ mov.f64 %fd57, 0d7FF0000000000000;
+ mul.rn.f64 %fd94, %fd94, %fd57;
+
+BB33_7:
+ setp.gt.s32 %p5, %r2, -1;
+ @%p5 bra BB33_13;
+
+ mov.f64 %fd58, 0dBCA1A62633145C07;
+ add.rn.f64 %fd59, %fd94, %fd58;
+ neg.f64 %fd60, %fd59;
+ mov.f64 %fd61, 0d400921FB54442D18;
+ add.rn.f64 %fd94, %fd61, %fd60;
+
+BB33_13:
+ cvta.to.global.u64 %rd7, %rd3;
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd94;
+
+BB33_14:
+ ret;
+}
+
+ // .globl matrix_atan
+.visible .entry matrix_atan(
+ .param .u64 matrix_atan_param_0,
+ .param .u64 matrix_atan_param_1,
+ .param .u32 matrix_atan_param_2
+)
+{
+ .reg .pred %p<5>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<57>;
+ .reg .b64 %rd<10>;
+
+
+ ld.param.u64 %rd2, [matrix_atan_param_0];
+ ld.param.u64 %rd3, [matrix_atan_param_1];
+ ld.param.u32 %r2, [matrix_atan_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB34_4;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ cvt.s64.s32 %rd1, %r1;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6];
+ abs.f64 %fd2, %fd1;
+ setp.leu.f64 %p2, %fd2, 0d3FF0000000000000;
+ mov.f64 %fd56, %fd2;
+ @%p2 bra BB34_3;
+
+ // inline asm
+ rcp.approx.ftz.f64 %fd5,%fd2;
+ // inline asm
+ neg.f64 %fd7, %fd2;
+ mov.f64 %fd8, 0d3FF0000000000000;
+ fma.rn.f64 %fd9, %fd7, %fd5, %fd8;
+ fma.rn.f64 %fd10, %fd9, %fd9, %fd9;
+ fma.rn.f64 %fd11, %fd10, %fd5, %fd5;
+ setp.eq.f64 %p3, %fd2, 0d7FF0000000000000;
+ selp.f64 %fd3, 0d0000000000000000, %fd11, %p3;
+ mov.f64 %fd56, %fd3;
+
+BB34_3:
+ mov.f64 %fd4, %fd56;
+ cvta.to.global.u64 %rd7, %rd3;
+ mul.f64 %fd12, %fd4, %fd4;
+ mov.f64 %fd13, 0d3F2D3B63DBB65B49;
+ mov.f64 %fd14, 0dBEF53E1D2A25FF7E;
+ fma.rn.f64 %fd15, %fd14, %fd12, %fd13;
+ mov.f64 %fd16, 0dBF5312788DDE082E;
+ fma.rn.f64 %fd17, %fd15, %fd12, %fd16;
+ mov.f64 %fd18, 0d3F6F9690C8249315;
+ fma.rn.f64 %fd19, %fd17, %fd12, %fd18;
+ mov.f64 %fd20, 0dBF82CF5AABC7CF0D;
+ fma.rn.f64 %fd21, %fd19, %fd12, %fd20;
+ mov.f64 %fd22, 0d3F9162B0B2A3BFDE;
+ fma.rn.f64 %fd23, %fd21, %fd12, %fd22;
+ mov.f64 %fd24, 0dBF9A7256FEB6FC6B;
+ fma.rn.f64 %fd25, %fd23, %fd12, %fd24;
+ mov.f64 %fd26, 0d3FA171560CE4A489;
+ fma.rn.f64 %fd27, %fd25, %fd12, %fd26;
+ mov.f64 %fd28, 0dBFA4F44D841450E4;
+ fma.rn.f64 %fd29, %fd27, %fd12, %fd28;
+ mov.f64 %fd30, 0d3FA7EE3D3F36BB95;
+ fma.rn.f64 %fd31, %fd29, %fd12, %fd30;
+ mov.f64 %fd32, 0dBFAAD32AE04A9FD1;
+ fma.rn.f64 %fd33, %fd31, %fd12, %fd32;
+ mov.f64 %fd34, 0d3FAE17813D66954F;
+ fma.rn.f64 %fd35, %fd33, %fd12, %fd34;
+ mov.f64 %fd36, 0dBFB11089CA9A5BCD;
+ fma.rn.f64 %fd37, %fd35, %fd12, %fd36;
+ mov.f64 %fd38, 0d3FB3B12B2DB51738;
+ fma.rn.f64 %fd39, %fd37, %fd12, %fd38;
+ mov.f64 %fd40, 0dBFB745D022F8DC5C;
+ fma.rn.f64 %fd41, %fd39, %fd12, %fd40;
+ mov.f64 %fd42, 0d3FBC71C709DFE927;
+ fma.rn.f64 %fd43, %fd41, %fd12, %fd42;
+ mov.f64 %fd44, 0dBFC2492491FA1744;
+ fma.rn.f64 %fd45, %fd43, %fd12, %fd44;
+ mov.f64 %fd46, 0d3FC99999999840D2;
+ fma.rn.f64 %fd47, %fd45, %fd12, %fd46;
+ mov.f64 %fd48, 0dBFD555555555544C;
+ fma.rn.f64 %fd49, %fd47, %fd12, %fd48;
+ mul.f64 %fd50, %fd12, %fd49;
+ fma.rn.f64 %fd51, %fd50, %fd4, %fd4;
+ mov.f64 %fd52, 0d3FF921FB54442D18;
+ sub.f64 %fd53, %fd52, %fd51;
+ setp.gt.f64 %p4, %fd2, 0d3FF0000000000000;
+ selp.f64 %fd54, %fd53, %fd51, %p4;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r6, %temp}, %fd54;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r7}, %fd54;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd1;
+ }
+ and.b32 %r9, %r8, -2147483648;
+ or.b32 %r10, %r7, %r9;
+ mov.b64 %fd55, {%r6, %r10};
+ shl.b64 %rd8, %rd1, 3;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd55;
+
+BB34_4:
+ ret;
+}
+
+ // .globl matrix_sign
+.visible .entry matrix_sign(
+ .param .u64 matrix_sign_param_0,
+ .param .u64 matrix_sign_param_1,
+ .param .u32 matrix_sign_param_2
+)
+{
+ .reg .pred %p<3>;
+ .reg .b32 %r<12>;
+ .reg .f64 %fd<4>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [matrix_sign_param_0];
+ ld.param.u64 %rd3, [matrix_sign_param_1];
+ ld.param.u32 %r2, [matrix_sign_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.u32 %p1, %r1, %r2;
+ @%p1 bra BB35_4;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6];
+ setp.eq.f64 %p2, %fd1, 0d0000000000000000;
+ cvta.to.global.u64 %rd7, %rd3;
+ add.s64 %rd1, %rd7, %rd5;
+ @%p2 bra BB35_3;
+ bra.uni BB35_2;
+
+BB35_3:
+ mov.u64 %rd8, 0;
+ st.global.u64 [%rd1], %rd8;
+ bra.uni BB35_4;
+
+BB35_2:
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r6}, %fd1;
+ }
+ and.b32 %r7, %r6, -2147483648;
+ mov.f64 %fd2, 0d3FF0000000000000;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd2;
+ }
+ and.b32 %r9, %r8, 2147483647;
+ or.b32 %r10, %r9, %r7;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r11, %temp}, %fd2;
+ }
+ mov.b64 %fd3, {%r11, %r10};
+ st.global.f64 [%rd1], %fd3;
+
+BB35_4:
+ ret;
+}
+
+.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
+ .param .b64 __internal_trig_reduction_slowpathd_param_0,
+ .param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+{
+ .local .align 8 .b8 __local_depot36[40];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<9>;
+ .reg .b32 %r<42>;
+ .reg .f64 %fd<5>;
+ .reg .b64 %rd<101>;
+
+
+ mov.u64 %rd100, __local_depot36;
+ cvta.local.u64 %SP, %rd100;
+ ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0];
+ ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1];
+ add.u64 %rd38, %SP, 0;
+ cvta.to.local.u64 %rd1, %rd38;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r1}, %fd4;
+ }
+ and.b32 %r40, %r1, -2147483648;
+ shr.u32 %r3, %r1, 20;
+ bfe.u32 %r4, %r1, 20, 11;
+ setp.eq.s32 %p1, %r4, 2047;
+ @%p1 bra BB36_13;
+
+ add.s32 %r16, %r4, -1024;
+ shr.u32 %r17, %r16, 6;
+ mov.u32 %r18, 16;
+ sub.s32 %r5, %r18, %r17;
+ mov.u32 %r19, 19;
+ sub.s32 %r20, %r19, %r17;
+ mov.u32 %r21, 18;
+ min.s32 %r6, %r21, %r20;
+ setp.gt.s32 %p2, %r5, %r6;
+ mov.u64 %rd94, 0;
+ mov.u64 %rd93, %rd1;
+ @%p2 bra BB36_4;
+
+ mov.b64 %rd41, %fd4;
+ shl.b64 %rd42, %rd41, 11;
+ or.b64 %rd3, %rd42, -9223372036854775808;
+ add.s32 %r7, %r5, -1;
+ mov.u64 %rd92, %rd1;
+ bfe.u32 %r22, %r1, 20, 11;
+ add.s32 %r23, %r22, -1024;
+ shr.u32 %r24, %r23, 6;
+ neg.s32 %r25, %r24;
+ mul.wide.s32 %rd43, %r25, 8;
+ mov.u64 %rd44, __cudart_i2opi_d;
+ add.s64 %rd45, %rd43, %rd44;
+ add.s64 %rd90, %rd45, 120;
+ mov.u64 %rd94, 0;
+ mov.u64 %rd91, %rd1;
+ mov.u32 %r39, %r7;
+
+BB36_3:
+ .pragma "nounroll";
+ mov.u32 %r8, %r39;
+ mov.u64 %rd7, %rd91;
+ ld.const.u64 %rd48, [%rd90];
+ // inline asm
+ {
+ .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
+ mov.b64 {alo,ahi}, %rd48;
+ mov.b64 {blo,bhi}, %rd3;
+ mov.b64 {clo,chi}, %rd94;
+ mad.lo.cc.u32 r0, alo, blo, clo;
+ madc.hi.cc.u32 r1, alo, blo, chi;
+ madc.hi.u32 r2, alo, bhi, 0;
+ mad.lo.cc.u32 r1, alo, bhi, r1;
+ madc.hi.cc.u32 r2, ahi, blo, r2;
+ madc.hi.u32 r3, ahi, bhi, 0;
+ mad.lo.cc.u32 r1, ahi, blo, r1;
+ madc.lo.cc.u32 r2, ahi, bhi, r2;
+ addc.u32 r3, r3, 0;
+ mov.b64 %rd46, {r0,r1};
+ mov.b64 %rd94, {r2,r3};
+ }
+ // inline asm
+ st.local.u64 [%rd92], %rd46;
+ add.s32 %r9, %r8, 1;
+ sub.s32 %r26, %r9, %r7;
+ mul.wide.s32 %rd51, %r26, 8;
+ add.s64 %rd92, %rd1, %rd51;
+ add.s64 %rd13, %rd7, 8;
+ mov.u64 %rd93, %rd13;
+ add.s64 %rd90, %rd90, 8;
+ setp.lt.s32 %p3, %r9, %r6;
+ mov.u64 %rd91, %rd13;
+ mov.u32 %r39, %r9;
+ @%p3 bra BB36_3;
+
+BB36_4:
+ st.local.u64 [%rd93], %rd94;
+ ld.local.u64 %rd95, [%rd1+16];
+ ld.local.u64 %rd96, [%rd1+24];
+ and.b32 %r10, %r3, 63;
+ setp.eq.s32 %p4, %r10, 0;
+ @%p4 bra BB36_6;
+
+ mov.u32 %r27, 64;
+ sub.s32 %r28, %r27, %r10;
+ shl.b64 %rd52, %rd96, %r10;
+ shr.u64 %rd53, %rd95, %r28;
+ or.b64 %rd96, %rd52, %rd53;
+ shl.b64 %rd54, %rd95, %r10;
+ ld.local.u64 %rd55, [%rd1+8];
+ shr.u64 %rd56, %rd55, %r28;
+ or.b64 %rd95, %rd56, %rd54;
+
+BB36_6:
+ cvta.to.local.u64 %rd57, %rd37;
+ shr.u64 %rd58, %rd96, 62;
+ cvt.u32.u64 %r29, %rd58;
+ shr.u64 %rd59, %rd95, 62;
+ shl.b64 %rd60, %rd96, 2;
+ or.b64 %rd98, %rd60, %rd59;
+ shl.b64 %rd97, %rd95, 2;
+ shr.u64 %rd61, %rd96, 61;
+ cvt.u32.u64 %r30, %rd61;
+ and.b32 %r31, %r30, 1;
+ add.s32 %r32, %r31, %r29;
+ neg.s32 %r33, %r32;
+ setp.eq.s32 %p5, %r40, 0;
+ selp.b32 %r34, %r32, %r33, %p5;
+ st.local.u32 [%rd57], %r34;
+ setp.eq.s32 %p6, %r31, 0;
+ @%p6 bra BB36_8;
+
+ mov.u64 %rd65, 0;
+ // inline asm
+ {
+ .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+ mov.b64 {a0,a1}, %rd65;
+ mov.b64 {a2,a3}, %rd65;
+ mov.b64 {b0,b1}, %rd97;
+ mov.b64 {b2,b3}, %rd98;
+ sub.cc.u32 r0, a0, b0;
+ subc.cc.u32 r1, a1, b1;
+ subc.cc.u32 r2, a2, b2;
+ subc.u32 r3, a3, b3;
+ mov.b64 %rd97, {r0,r1};
+ mov.b64 %rd98, {r2,r3};
+ }
+ // inline asm
+ xor.b32 %r40, %r40, -2147483648;
+
+BB36_8:
+ clz.b64 %r41, %rd98;
+ setp.eq.s32 %p7, %r41, 0;
+ @%p7 bra BB36_10;
+
+ shl.b64 %rd68, %rd98, %r41;
+ mov.u32 %r35, 64;
+ sub.s32 %r36, %r35, %r41;
+ shr.u64 %rd69, %rd97, %r36;
+ or.b64 %rd98, %rd69, %rd68;
+
+BB36_10:
+ mov.u64 %rd73, -3958705157555305931;
+ // inline asm
+ {
+ .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
+ mov.b64 {alo,ahi}, %rd98;
+ mov.b64 {blo,bhi}, %rd73;
+ mul.lo.u32 r0, alo, blo;
+ mul.hi.u32 r1, alo, blo;
+ mad.lo.cc.u32 r1, alo, bhi, r1;
+ madc.hi.u32 r2, alo, bhi, 0;
+ mad.lo.cc.u32 r1, ahi, blo, r1;
+ madc.hi.cc.u32 r2, ahi, blo, r2;
+ madc.hi.u32 r3, ahi, bhi, 0;
+ mad.lo.cc.u32 r2, ahi, bhi, r2;
+ addc.u32 r3, r3, 0;
+ mov.b64 %rd70, {r0,r1};
+ mov.b64 %rd99, {r2,r3};
+ }
+ // inline asm
+ setp.lt.s64 %p8, %rd99, 1;
+ @%p8 bra BB36_12;
+
+ // inline asm
+ {
+ .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+ mov.b64 {a0,a1}, %rd70;
+ mov.b64 {a2,a3}, %rd99;
+ mov.b64 {b0,b1}, %rd70;
+ mov.b64 {b2,b3}, %rd99;
+ add.cc.u32 r0, a0, b0;
+ addc.cc.u32 r1, a1, b1;
+ addc.cc.u32 r2, a2, b2;
+ addc.u32 r3, a3, b3;
+ mov.b64 %rd74, {r0,r1};
+ mov.b64 %rd99, {r2,r3};
+ }
+ // inline asm
+ add.s32 %r41, %r41, 1;
+
+BB36_12:
+ cvt.u64.u32 %rd80, %r40;
+ shl.b64 %rd81, %rd80, 32;
+ mov.u32 %r37, 1022;
+ sub.s32 %r38, %r37, %r41;
+ cvt.u64.u32 %rd82, %r38;
+ shl.b64 %rd83, %rd82, 52;
+ add.s64 %rd84, %rd99, 1;
+ shr.u64 %rd85, %rd84, 10;
+ add.s64 %rd86, %rd85, 1;
+ shr.u64 %rd87, %rd86, 1;
+ add.s64 %rd88, %rd87, %rd83;
+ or.b64 %rd89, %rd88, %rd81;
+ mov.b64 %fd4, %rd89;
+
+BB36_13:
+ st.param.f64 [func_retval0+0], %fd4;
+ ret;
+}
+
+.func (.param .b64 func_retval0) __internal_accurate_pow(
+ .param .b64 __internal_accurate_pow_param_0,
+ .param .b64 __internal_accurate_pow_param_1
+)
+{
+ .reg .pred %p<9>;
+ .reg .f32 %f<3>;
+ .reg .b32 %r<52>;
+ .reg .f64 %fd<134>;
+
+
+ ld.param.f64 %fd12, [__internal_accurate_pow_param_0];
+ ld.param.f64 %fd13, [__internal_accurate_pow_param_1];
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r49}, %fd12;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r48, %temp}, %fd12;
+ }
+ shr.u32 %r50, %r49, 20;
+ setp.ne.s32 %p1, %r50, 0;
+ @%p1 bra BB37_2;
+
+ mul.f64 %fd14, %fd12, 0d4350000000000000;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r49}, %fd14;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r48, %temp}, %fd14;
+ }
+ shr.u32 %r16, %r49, 20;
+ add.s32 %r50, %r16, -54;
+
+BB37_2:
+ add.s32 %r51, %r50, -1023;
+ and.b32 %r17, %r49, -2146435073;
+ or.b32 %r18, %r17, 1072693248;
+ mov.b64 %fd132, {%r48, %r18};
+ setp.lt.u32 %p2, %r18, 1073127583;
+ @%p2 bra BB37_4;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r19, %temp}, %fd132;
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r20}, %fd132;
+ }
+ add.s32 %r21, %r20, -1048576;
+ mov.b64 %fd132, {%r19, %r21};
+ add.s32 %r51, %r50, -1022;
+
+BB37_4:
+ add.f64 %fd16, %fd132, 0d3FF0000000000000;
+ // inline asm
+ rcp.approx.ftz.f64 %fd15,%fd16;
+ // inline asm
+ neg.f64 %fd17, %fd16;
+ mov.f64 %fd18, 0d3FF0000000000000;
+ fma.rn.f64 %fd19, %fd17, %fd15, %fd18;
+ fma.rn.f64 %fd20, %fd19, %fd19, %fd19;
+ fma.rn.f64 %fd21, %fd20, %fd15, %fd15;
+ add.f64 %fd22, %fd132, 0dBFF0000000000000;
+ mul.f64 %fd23, %fd22, %fd21;
+ fma.rn.f64 %fd24, %fd22, %fd21, %fd23;
+ mul.f64 %fd25, %fd24, %fd24;
+ mov.f64 %fd26, 0d3ED0F5D241AD3B5A;
+ mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2;
+ fma.rn.f64 %fd28, %fd27, %fd25, %fd26;
+ mov.f64 %fd29, 0d3EF3B20A75488A3F;
+ fma.rn.f64 %fd30, %fd28, %fd25, %fd29;
mov.f64 %fd31, 0d3F1745CDE4FAECD5;
fma.rn.f64 %fd32, %fd30, %fd25, %fd31;
mov.f64 %fd33, 0d3F3C71C7258A578B;
@@ -3499,13 +5071,13 @@ BB23_4:
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
- @%p4 bra BB23_7;
+ @%p4 bra BB37_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd133, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
- @%p6 bra BB23_7;
+ @%p6 bra BB37_7;
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
@@ -3520,26 +5092,26 @@ BB23_4:
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd133, %fd130, %fd131;
-BB23_7:
+BB37_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd133;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
+ @%p7 bra BB37_9;
+
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd133;
}
- setp.ne.s32 %p8, %r47, 0;
- or.pred %p9, %p8, %p7;
- @!%p9 bra BB23_9;
- bra.uni BB23_8;
+ setp.eq.s32 %p8, %r47, 0;
+ @%p8 bra BB37_10;
-BB23_8:
+BB37_9:
fma.rn.f64 %fd133, %fd133, %fd5, %fd133;
-BB23_9:
+BB37_10:
st.param.f64 [func_retval0+0], %fd133;
ret;
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index c75d0e0..451960b 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -157,8 +157,14 @@ public class UnaryOp extends Hop implements MultiThreadedHop
else //default unary
{
int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
- if(_op == OpOp1.SELP || _op == OpOp1.EXP) {
- et = findGPUExecTypeByMemEstimate(et);
+ switch(_op) {
+ case SELP:case EXP:case SQRT:case LOG:case ABS:
+ case ROUND:case FLOOR:case CEIL:
+ case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
+ case SIGN:
+ et = findGPUExecTypeByMemEstimate(et);
+ break;
+ default:
}
Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op),
getDataType(), getValueType(), et, k);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 4a45521..443d0eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -69,12 +69,27 @@ public class GPUInstructionParser extends InstructionParser
String2GPUInstructionType.put( "^2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
String2GPUInstructionType.put( "*2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
String2GPUInstructionType.put( "-nz" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
- String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
// Builtin functions
String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary);
String2GPUInstructionType.put( "exp" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "log" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "abs" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sqrt" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "round" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "floor" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "ceil" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sin" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "cos" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "tan" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "asin" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "acos" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "atan" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sign" , GPUINSTRUCTION_TYPE.BuiltinUnary);
+
+
String2GPUInstructionType.put( "solve" , GPUINSTRUCTION_TYPE.BuiltinBinary);