You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/21 17:08:28 UTC

[2/2] systemml git commit: [SYSTEMML-1923] Support sinh, cosh and tanh as built-in functions

[SYSTEMML-1923] Support sinh, cosh and tanh as built-in functions

- Added sinh, cosh and tanh builtin functions to CP, Spark and GPU backend.
- Added these functions to Python DSL.
- Also, updated the DML language reference and Python reference documentation.

Closes #668.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/50a895f8
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/50a895f8
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/50a895f8

Branch: refs/heads/master
Commit: 50a895f86167d5766b249a3db89f9c2b06f7a0d0
Parents: aa15197
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 21 10:06:04 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 21 10:07:15 2017 -0700

----------------------------------------------------------------------
 docs/dml-language-reference.md                  |    2 +-
 docs/python-reference.md                        |    2 +-
 src/main/cpp/kernels/SystemML.cu                |   44 +-
 src/main/cpp/kernels/SystemML.ptx               | 2070 +++++++++++-------
 src/main/java/org/apache/sysml/hops/Hop.java    |   11 +-
 .../java/org/apache/sysml/hops/UnaryOp.java     |   11 +-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |    1 +
 .../codegen/opt/PlanSelectionFuseCostBased.java |    3 +
 .../opt/PlanSelectionFuseCostBasedV2.java       |    3 +
 src/main/java/org/apache/sysml/lops/Unary.java  |    8 +-
 .../java/org/apache/sysml/lops/UnaryCP.java     |   11 +-
 .../sysml/parser/BuiltinFunctionExpression.java |   12 +
 .../org/apache/sysml/parser/DMLTranslator.java  |   12 +
 .../org/apache/sysml/parser/Expression.java     |    3 +
 .../sysml/runtime/functionobjects/Builtin.java  |   24 +-
 .../instructions/CPInstructionParser.java       |    3 +
 .../instructions/GPUInstructionParser.java      |    3 +
 .../instructions/SPInstructionParser.java       |    3 +
 .../instructions/gpu/GPUInstruction.java        |    3 +
 .../gpu/MatrixBuiltinGPUInstruction.java        |    6 +
 .../runtime/matrix/data/LibMatrixCUDA.java      |   51 +
 .../runtime/matrix/operators/UnaryOperator.java |    2 +
 src/main/python/systemml/defmatrix.py           |    9 +
 .../org/apache/sysml/test/gpu/UnaryOpTests.java |   15 +
 24 files changed, 1474 insertions(+), 838 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/dml-language-reference.md
----------------------------------------------------------------------
diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md
index bd66a42..d8ca07f 100644
--- a/docs/dml-language-reference.md
+++ b/docs/dml-language-reference.md
@@ -822,7 +822,7 @@ is same as
 Function | Description | Parameters | Example
 -------- | ----------- | ---------- | -------
 exp(), log(), abs(), sqrt(), round(), floor(), ceil() | Apply mathematical function on input (cell wise if input is matrix) | Input: (&lt;matrix&gt;), or (&lt;scalar&gt;) <br/> Output: &lt;matrix&gt;, or &lt;scalar&gt; | sqrt(X) <br/> log(X,y) <br/> round(X) <br/> floor(X) <br/> ceil(X)
-sin(), cos(), tan(), asin(), acos(), atan() | Apply trigonometric function on input (cell wise if input is matrix) | Input: (&lt;matrix&gt;), or (&lt;scalar&gt;) <br/> Output: &lt;matrix&gt;, or &lt;scalar&gt; | sin(X)
+sin(), cos(), tan(), sinh(), cosh(), tanh(), asin(), acos(), atan() | Apply trigonometric or hyperbolic function on input (cell wise if input is matrix) | Input: (&lt;matrix&gt;), or (&lt;scalar&gt;) <br/> Output: &lt;matrix&gt;, or &lt;scalar&gt; | sin(X)
 sign() | Returns a matrix representing the signs of the input matrix elements, where 1 represents positive, 0 represents zero, and -1 represents negative | Input : (A &lt;matrix&gt;) <br/> Output : &lt;matrix&gt; | <span style="white-space: nowrap;">A = matrix("-5 0 3 -3",</span> rows=2, cols=2) <br/>signA = sign(A)<br/>Matrix signA: [[-1, 0], [1, -1]]
 
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/python-reference.md
----------------------------------------------------------------------
diff --git a/docs/python-reference.md b/docs/python-reference.md
index 119c1d0..4fd78fe 100644
--- a/docs/python-reference.md
+++ b/docs/python-reference.md
@@ -191,7 +191,7 @@ In addition to the above mentioned operators, following functions are supported.
 | argmax(self, axis=None)                              | Returns the indices of the maximum values along an axis.                                                                        | axis : int, optional (only axis=1, i.e. rowIndexMax is supported in this version)                                                                                                                                           |
 | cumsum(self, axis=None)                              | Returns the indices of the maximum values along an axis.                                                                        | axis : int, optional (only axis=0, i.e. cumsum along the rows is supported in this version)                                                                                                                                 |
 
-- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, asin, acos, atan, sign, solve
+- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, sinh, cosh, tanh, asin, acos, atan, sign, solve
 
 |                                                      | Description                                                                                                                     | Parameters                                                                                                                                                                                              |
 |------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 3e1a13a..c243564 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -990,6 +990,20 @@ __global__ void matrix_sin(double *A, double *C, unsigned int size) {
 }
 
 /**
+ * Do a sinh over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sinh(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = sinh(A[index]);
+    }
+}
+
+/**
  * Do an cos over all the elements of a matrix
  * @param A the input matrix (of length = size)
  * @param C the pre-allocated output matrix (of length = size)
@@ -1004,6 +1018,20 @@ __global__ void matrix_cos(double *A, double *C, unsigned int size) {
 }
 
 /**
+ * Do a cosh over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_cosh(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = cosh(A[index]);
+    }
+}
+
+/**
  * Do an tan over all the elements of a matrix
  * @param A the input matrix (of length = size)
  * @param C the pre-allocated output matrix (of length = size)
@@ -1018,6 +1046,20 @@ __global__ void matrix_tan(double *A, double *C, unsigned int size) {
 }
 
 /**
+ * Do a tanh over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_tanh(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = tanh(A[index]);
+    }
+}
+
+/**
  * Do an asin over all the elements of a matrix
  * @param A the input matrix (of length = size)
  * @param C the pre-allocated output matrix (of length = size)
@@ -1076,4 +1118,4 @@ __global__ void matrix_sign(double *A, double *C, unsigned int size) {
             C[index] = copysign(1.0, A[index]);
         }
     }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index b4a6559..73b057e 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
@@ -689,9 +689,9 @@ BB10_6:
 	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-	.reg .pred 	%p<73>;
-	.reg .b32 	%r<66>;
-	.reg .f64 	%fd<56>;
+	.reg .pred 	%p<77>;
+	.reg .b32 	%r<65>;
+	.reg .f64 	%fd<55>;
 	.reg .b64 	%rd<19>;
 
 
@@ -712,40 +712,40 @@ BB10_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB11_77;
+	@!%p4 bra 	BB11_73;
 	bra.uni 	BB11_1;
 
 BB11_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r64, %r1;
+	mov.u32 	%r63, %r1;
 	@%p5 bra 	BB11_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r65, %r3;
+	mov.u32 	%r64, %r3;
 	@%p6 bra 	BB11_4;
 
-	mov.u32 	%r65, %r2;
+	mov.u32 	%r64, %r2;
 
 BB11_4:
-	mov.u32 	%r59, %r65;
-	mov.u32 	%r4, %r59;
-	mov.u32 	%r64, %r4;
+	mov.u32 	%r58, %r64;
+	mov.u32 	%r4, %r58;
+	mov.u32 	%r63, %r4;
 
 BB11_5:
-	mov.u32 	%r5, %r64;
+	mov.u32 	%r5, %r63;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r62, %r1;
+	mov.u32 	%r61, %r1;
 	@%p7 bra 	BB11_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r63, %r3;
+	mov.u32 	%r62, %r3;
 	@%p8 bra 	BB11_8;
 
-	mov.u32 	%r63, %r2;
+	mov.u32 	%r62, %r2;
 
 BB11_8:
-	mov.u32 	%r62, %r63;
+	mov.u32 	%r61, %r62;
 
 BB11_9:
 	cvta.to.global.u64 	%rd5, %rd3;
@@ -753,10 +753,10 @@ BB11_9:
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r62, 8;
+	mul.wide.s32 	%rd9, %r61, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
 	@%p9 bra 	BB11_26;
 
@@ -767,12 +767,12 @@ BB11_9:
 	@%p30 bra 	BB11_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB11_75;
+	@%p33 bra 	BB11_71;
 	bra.uni 	BB11_13;
 
-BB11_75:
-	add.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+BB11_71:
+	add.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
 BB11_26:
 	setp.gt.s32	%p10, %r13, 13;
@@ -782,23 +782,23 @@ BB11_26:
 	@%p17 bra 	BB11_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB11_55;
+	@%p21 bra 	BB11_53;
 	bra.uni 	BB11_29;
 
-BB11_55:
-	setp.eq.f64	%p48, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB11_76;
+BB11_53:
+	setp.eq.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB11_72;
 
 BB11_18:
 	setp.gt.s32	%p24, %r13, 5;
 	@%p24 bra 	BB11_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB11_58;
+	@%p28 bra 	BB11_56;
 	bra.uni 	BB11_20;
 
-BB11_58:
+BB11_56:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -811,7 +811,7 @@ BB11_58:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p53, %rd1, -9223372036854775808;
+	setp.eq.s64	%p55, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -828,192 +828,192 @@ BB11_58:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd54, [retval0+0];
+	ld.param.f64	%fd53, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p54, %r8, 0;
-	and.pred  	%p1, %p54, %p53;
-	@!%p1 bra 	BB11_60;
-	bra.uni 	BB11_59;
+	setp.lt.s32	%p56, %r8, 0;
+	and.pred  	%p1, %p56, %p55;
+	@!%p1 bra 	BB11_58;
+	bra.uni 	BB11_57;
 
-BB11_59:
+BB11_57:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd54;
+	mov.b64 	{%temp, %r33}, %fd53;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd54;
+	mov.b64 	{%r35, %temp}, %fd53;
 	}
-	mov.b64 	%fd54, {%r35, %r34};
+	mov.b64 	%fd53, {%r35, %r34};
 
-BB11_60:
-	mov.f64 	%fd53, %fd54;
-	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
-	@%p55 bra 	BB11_63;
-	bra.uni 	BB11_61;
+BB11_58:
+	mov.f64 	%fd52, %fd53;
+	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
+	@%p57 bra 	BB11_61;
+	bra.uni 	BB11_59;
 
-BB11_63:
-	selp.b32	%r36, %r8, 0, %p53;
+BB11_61:
+	selp.b32	%r36, %r8, 0, %p55;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p59, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p59;
+	setp.lt.s32	%p61, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p61;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd53, {%r39, %r38};
-	bra.uni 	BB11_64;
+	mov.b64 	%fd52, {%r39, %r38};
+	bra.uni 	BB11_62;
 
 BB11_35:
 	setp.gt.s32	%p11, %r13, 15;
 	@%p11 bra 	BB11_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB11_52;
+	@%p15 bra 	BB11_50;
 	bra.uni 	BB11_37;
 
-BB11_52:
+BB11_50:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p45, %r27, 0;
-	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
-	bra.uni 	BB11_76;
+	setp.eq.s32	%p47, %r27, 0;
+	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
+	bra.uni 	BB11_72;
 
 BB11_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB11_74;
+	@%p31 bra 	BB11_70;
 	bra.uni 	BB11_16;
 
-BB11_74:
-	mul.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+BB11_70:
+	mul.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
 BB11_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB11_54;
+	@%p18 bra 	BB11_52;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB11_53;
+	@%p19 bra 	BB11_51;
 	bra.uni 	BB11_33;
 
-BB11_53:
-	max.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+BB11_51:
+	max.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
 BB11_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB11_57;
+	@%p25 bra 	BB11_55;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB11_56;
+	@%p26 bra 	BB11_54;
 	bra.uni 	BB11_24;
 
-BB11_56:
-	setp.gt.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB11_76;
+BB11_54:
+	setp.gt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB11_72;
 
 BB11_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB11_51;
+	@%p12 bra 	BB11_49;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB11_46;
+	@%p13 bra 	BB11_45;
 	bra.uni 	BB11_41;
 
-BB11_46:
-	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
-	or.pred  	%p40, %p38, %p39;
-	mov.f64 	%fd55, 0d7FF8000000000000;
-	@%p40 bra 	BB11_76;
+BB11_45:
+	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
+	or.pred  	%p41, %p39, %p40;
+	mov.f64 	%fd54, 0d7FF8000000000000;
+	@%p41 bra 	BB11_72;
 
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	abs.f64 	%fd39, %fd55;
-	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
-	@%p41 bra 	BB11_76;
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	abs.f64 	%fd39, %fd54;
+	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
+	@%p42 bra 	BB11_72;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r22}, %fd55;
+	mov.b64 	{%r22, %temp}, %fd54;
 	}
-	and.b32  	%r23, %r22, 2147483647;
-	setp.ne.s32	%p42, %r23, 2146435072;
-	@%p42 bra 	BB11_50;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r24, %temp}, %fd55;
+	mov.b64 	{%temp, %r23}, %fd54;
 	}
-	setp.eq.s32	%p43, %r24, 0;
-	@%p43 bra 	BB11_76;
-
-BB11_50:
-	cvt.rmi.f64.f64	%fd40, %fd55;
+	and.b32  	%r24, %r23, 2147483647;
+	setp.ne.s32	%p43, %r24, 2146435072;
+	setp.ne.s32	%p44, %r22, 0;
+	or.pred  	%p45, %p43, %p44;
+	@!%p45 bra 	BB11_72;
+	bra.uni 	BB11_48;
+
+BB11_48:
+	cvt.rmi.f64.f64	%fd40, %fd54;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd55, %fd1, %fd41;
-	bra.uni 	BB11_76;
+	sub.f64 	%fd54, %fd1, %fd41;
+	bra.uni 	BB11_72;
 
 BB11_13:
 	setp.eq.s32	%p34, %r13, 1;
 	@%p34 bra 	BB11_14;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_14:
-	sub.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+	sub.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
 BB11_29:
 	setp.eq.s32	%p22, %r13, 10;
 	@%p22 bra 	BB11_30;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_30:
-	setp.neu.f64	%p47, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB11_76;
+	setp.neu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
+	bra.uni 	BB11_72;
 
 BB11_20:
 	setp.eq.s32	%p29, %r13, 5;
 	@%p29 bra 	BB11_21;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_21:
-	setp.lt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB11_76;
+	setp.lt.f64	%p54, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
+	bra.uni 	BB11_72;
 
 BB11_37:
 	setp.eq.s32	%p16, %r13, 15;
 	@%p16 bra 	BB11_38;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd55, %fd44, %fd43;
-	bra.uni 	BB11_76;
+	sub.f64 	%fd54, %fd44, %fd43;
+	bra.uni 	BB11_72;
 
 BB11_16:
 	setp.eq.s32	%p32, %r13, 3;
 	@%p32 bra 	BB11_17;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_17:
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
-BB11_54:
-	min.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB11_76;
+BB11_52:
+	min.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB11_72;
 
 BB11_33:
 	setp.eq.s32	%p20, %r13, 13;
 	@%p20 bra 	BB11_34;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
@@ -1021,149 +1021,147 @@ BB11_34:
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p46, %r30, 0;
-	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
-	bra.uni 	BB11_76;
+	setp.eq.s32	%p48, %r30, 0;
+	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
+	bra.uni 	BB11_72;
 
-BB11_57:
-	setp.le.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB11_76;
+BB11_55:
+	setp.le.f64	%p53, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
+	bra.uni 	BB11_72;
 
 BB11_24:
 	setp.eq.s32	%p27, %r13, 8;
 	@%p27 bra 	BB11_25;
-	bra.uni 	BB11_76;
+	bra.uni 	BB11_72;
 
 BB11_25:
-	setp.ge.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB11_76;
+	setp.ge.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
+	bra.uni 	BB11_72;
 
-BB11_51:
-	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
+BB11_49:
+	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
-	bra.uni 	BB11_76;
+	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
+	bra.uni 	BB11_72;
 
 BB11_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB11_76;
+	@%p14 bra 	BB11_72;
 
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	abs.f64 	%fd37, %fd55;
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	abs.f64 	%fd37, %fd54;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB11_76;
+	@%p35 bra 	BB11_72;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r19}, %fd55;
+	mov.b64 	{%r19, %temp}, %fd54;
 	}
-	and.b32  	%r20, %r19, 2147483647;
-	setp.ne.s32	%p36, %r20, 2146435072;
-	@%p36 bra 	BB11_45;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r21, %temp}, %fd55;
+	mov.b64 	{%temp, %r20}, %fd54;
 	}
-	setp.eq.s32	%p37, %r21, 0;
-	@%p37 bra 	BB11_76;
+	and.b32  	%r21, %r20, 2147483647;
+	setp.ne.s32	%p36, %r21, 2146435072;
+	setp.ne.s32	%p37, %r19, 0;
+	or.pred  	%p38, %p36, %p37;
+	@!%p38 bra 	BB11_72;
+	bra.uni 	BB11_44;
 
-BB11_45:
-	cvt.rmi.f64.f64	%fd55, %fd55;
-	bra.uni 	BB11_76;
+BB11_44:
+	cvt.rmi.f64.f64	%fd54, %fd54;
+	bra.uni 	BB11_72;
 
-BB11_61:
-	setp.gt.s32	%p56, %r8, -1;
-	@%p56 bra 	BB11_64;
+BB11_59:
+	setp.gt.s32	%p58, %r8, -1;
+	@%p58 bra 	BB11_62;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p57, %fd45, %fd2;
-	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
+	setp.neu.f64	%p59, %fd45, %fd2;
+	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
 
-BB11_64:
-	mov.f64 	%fd25, %fd53;
+BB11_62:
+	mov.f64 	%fd25, %fd52;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p60, %r41, 2146435072;
-	mov.f64 	%fd52, %fd25;
-	@%p60 bra 	BB11_73;
+	setp.ne.s32	%p62, %r41, 2146435072;
+	mov.f64 	%fd51, %fd25;
+	@%p62 bra 	BB11_69;
 
-	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd52, %fd26;
-	@%p61 bra 	BB11_73;
-
-	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
 	mov.f64 	%fd51, %fd26;
-	mov.f64 	%fd52, %fd51;
-	@%p62 bra 	BB11_73;
-
-	and.b32  	%r42, %r9, 2147483647;
-	setp.ne.s32	%p63, %r42, 2146435072;
 	@%p63 bra 	BB11_69;
 
+	abs.f64 	%fd46, %fd2;
+	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd50, %fd26;
+	mov.f64 	%fd51, %fd50;
+	@%p64 bra 	BB11_69;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r43, %temp}, %fd2;
+	mov.b64 	{%r42, %temp}, %fd2;
 	}
-	setp.eq.s32	%p64, %r43, 0;
-	@%p64 bra 	BB11_72;
-
-BB11_69:
-	and.b32  	%r44, %r8, 2147483647;
-	setp.ne.s32	%p65, %r44, 2146435072;
-	mov.f64 	%fd49, %fd25;
-	mov.f64 	%fd52, %fd49;
-	@%p65 bra 	BB11_73;
-
+	and.b32  	%r43, %r9, 2147483647;
+	setp.eq.s32	%p65, %r43, 2146435072;
+	setp.eq.s32	%p66, %r42, 0;
+	and.pred  	%p67, %p65, %p66;
+	@%p67 bra 	BB11_68;
+	bra.uni 	BB11_66;
+
+BB11_68:
+	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
+	selp.b32	%r51, 2146435072, 0, %p71;
+	xor.b32  	%r52, %r51, 2146435072;
+	setp.lt.s32	%p72, %r9, 0;
+	selp.b32	%r53, %r52, %r51, %p72;
+	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
+	selp.b32	%r54, 1072693248, %r53, %p73;
+	mov.u32 	%r55, 0;
+	mov.b64 	%fd51, {%r55, %r54};
+	bra.uni 	BB11_69;
+
+BB11_66:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r45, %temp}, %fd1;
+	mov.b64 	{%r44, %temp}, %fd1;
 	}
-	setp.ne.s32	%p66, %r45, 0;
-	mov.f64 	%fd52, %fd25;
-	@%p66 bra 	BB11_73;
-
+	and.b32  	%r45, %r8, 2147483647;
+	setp.eq.s32	%p68, %r45, 2146435072;
+	setp.eq.s32	%p69, %r44, 0;
+	and.pred  	%p70, %p68, %p69;
+	mov.f64 	%fd51, %fd25;
+	@!%p70 bra 	BB11_69;
+	bra.uni 	BB11_67;
+
+BB11_67:
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	add.s32 	%r48, %r47, 2146435072;
-	or.b32  	%r49, %r48, -2147483648;
-	selp.b32	%r50, %r49, %r48, %p1;
-	mov.u32 	%r51, 0;
-	mov.b64 	%fd52, {%r51, %r50};
-	bra.uni 	BB11_73;
-
-BB11_72:
-	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
-	selp.b32	%r52, 2146435072, 0, %p67;
-	xor.b32  	%r53, %r52, 2146435072;
-	setp.lt.s32	%p68, %r9, 0;
-	selp.b32	%r54, %r53, %r52, %p68;
-	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
-	selp.b32	%r55, 1072693248, %r54, %p69;
-	mov.u32 	%r56, 0;
-	mov.b64 	%fd52, {%r56, %r55};
+	selp.b32	%r48, -1048576, 2146435072, %p1;
+	add.s32 	%r49, %r48, %r47;
+	mov.u32 	%r50, 0;
+	mov.b64 	%fd51, {%r50, %r49};
 
-BB11_73:
-	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
-	or.pred  	%p72, %p71, %p70;
-	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+BB11_69:
+	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
+	or.pred  	%p76, %p75, %p74;
+	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
 
-BB11_76:
+BB11_72:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd55;
+	st.global.f64 	[%rd18], %fd54;
 	bar.sync 	0;
 
-BB11_77:
+BB11_73:
 	ret;
 }
 
@@ -1177,9 +1175,9 @@ BB11_77:
 	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<133>;
-	.reg .b32 	%r<88>;
-	.reg .f64 	%fd<109>;
+	.reg .pred 	%p<141>;
+	.reg .b32 	%r<86>;
+	.reg .f64 	%fd<107>;
 	.reg .b64 	%rd<20>;
 
 
@@ -1194,7 +1192,7 @@ BB11_77:
 	mov.u32 	%r11, %tid.x;
 	mad.lo.s32 	%r1, %r9, %r10, %r11;
 	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB12_138;
+	@%p3 bra 	BB12_130;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -1203,9 +1201,9 @@ BB11_77:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB12_70;
+	@%p4 bra 	BB12_66;
 
-	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p5, %r6, 8;
 	@%p5 bra 	BB12_19;
 
@@ -1216,31 +1214,31 @@ BB11_77:
 	@%p26 bra 	BB12_8;
 
 	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB12_68;
+	@%p29 bra 	BB12_64;
 	bra.uni 	BB12_6;
 
-BB12_68:
-	add.f64 	%fd99, %fd1, %fd68;
-	bra.uni 	BB12_69;
+BB12_64:
+	add.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB12_65;
 
-BB12_70:
-	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p69, %r6, 8;
-	@%p69 bra 	BB12_87;
+BB12_66:
+	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p73, %r6, 8;
+	@%p73 bra 	BB12_83;
 
-	setp.gt.s32	%p83, %r6, 3;
-	@%p83 bra 	BB12_79;
+	setp.gt.s32	%p87, %r6, 3;
+	@%p87 bra 	BB12_75;
 
-	setp.gt.s32	%p90, %r6, 1;
-	@%p90 bra 	BB12_76;
+	setp.gt.s32	%p94, %r6, 1;
+	@%p94 bra 	BB12_72;
 
-	setp.eq.s32	%p93, %r6, 0;
-	@%p93 bra 	BB12_136;
-	bra.uni 	BB12_74;
+	setp.eq.s32	%p97, %r6, 0;
+	@%p97 bra 	BB12_128;
+	bra.uni 	BB12_70;
 
-BB12_136:
-	add.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_128:
+	add.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
 BB12_19:
 	setp.gt.s32	%p6, %r6, 13;
@@ -1250,39 +1248,39 @@ BB12_19:
 	@%p13 bra 	BB12_24;
 
 	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB12_48;
+	@%p17 bra 	BB12_46;
 	bra.uni 	BB12_22;
 
-BB12_48:
-	setp.eq.f64	%p44, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
-	bra.uni 	BB12_69;
+BB12_46:
+	setp.eq.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB12_65;
 
-BB12_87:
-	setp.gt.s32	%p70, %r6, 13;
-	@%p70 bra 	BB12_96;
+BB12_83:
+	setp.gt.s32	%p74, %r6, 13;
+	@%p74 bra 	BB12_92;
 
-	setp.gt.s32	%p77, %r6, 10;
-	@%p77 bra 	BB12_92;
+	setp.gt.s32	%p81, %r6, 10;
+	@%p81 bra 	BB12_88;
 
-	setp.eq.s32	%p81, %r6, 9;
-	@%p81 bra 	BB12_116;
-	bra.uni 	BB12_90;
+	setp.eq.s32	%p85, %r6, 9;
+	@%p85 bra 	BB12_110;
+	bra.uni 	BB12_86;
 
-BB12_116:
-	setp.eq.f64	%p108, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
-	bra.uni 	BB12_137;
+BB12_110:
+	setp.eq.f64	%p114, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
+	bra.uni 	BB12_129;
 
 BB12_11:
 	setp.gt.s32	%p20, %r6, 5;
 	@%p20 bra 	BB12_15;
 
 	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB12_51;
+	@%p24 bra 	BB12_49;
 	bra.uni 	BB12_13;
 
-BB12_51:
+BB12_49:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd68;
@@ -1295,7 +1293,7 @@ BB12_51:
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p49, %rd2, -9223372036854775808;
+	setp.eq.s64	%p51, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1312,69 +1310,69 @@ BB12_51:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd98, [retval0+0];
+	ld.param.f64	%fd97, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p50, %r2, 0;
-	and.pred  	%p1, %p50, %p49;
-	@!%p1 bra 	BB12_53;
-	bra.uni 	BB12_52;
+	setp.lt.s32	%p52, %r2, 0;
+	and.pred  	%p1, %p52, %p51;
+	@!%p1 bra 	BB12_51;
+	bra.uni 	BB12_50;
 
-BB12_52:
+BB12_50:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd98;
+	mov.b64 	{%temp, %r26}, %fd97;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd98;
+	mov.b64 	{%r28, %temp}, %fd97;
 	}
-	mov.b64 	%fd98, {%r28, %r27};
+	mov.b64 	%fd97, {%r28, %r27};
 
-BB12_53:
-	mov.f64 	%fd97, %fd98;
-	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
-	@%p51 bra 	BB12_56;
-	bra.uni 	BB12_54;
+BB12_51:
+	mov.f64 	%fd96, %fd97;
+	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
+	@%p53 bra 	BB12_54;
+	bra.uni 	BB12_52;
 
-BB12_56:
-	selp.b32	%r29, %r2, 0, %p49;
+BB12_54:
+	selp.b32	%r29, %r2, 0, %p51;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p55, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p55;
+	setp.lt.s32	%p57, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p57;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd97, {%r32, %r31};
-	bra.uni 	BB12_57;
+	mov.b64 	%fd96, {%r32, %r31};
+	bra.uni 	BB12_55;
 
 BB12_28:
 	setp.gt.s32	%p7, %r6, 15;
 	@%p7 bra 	BB12_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB12_45;
+	@%p11 bra 	BB12_43;
 	bra.uni 	BB12_30;
 
-BB12_45:
+BB12_43:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p41, %r20, 0;
-	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
-	bra.uni 	BB12_69;
+	setp.eq.s32	%p43, %r20, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
+	bra.uni 	BB12_65;
 
-BB12_79:
-	setp.gt.s32	%p84, %r6, 5;
-	@%p84 bra 	BB12_83;
+BB12_75:
+	setp.gt.s32	%p88, %r6, 5;
+	@%p88 bra 	BB12_79;
 
-	setp.eq.s32	%p88, %r6, 4;
-	@%p88 bra 	BB12_119;
-	bra.uni 	BB12_81;
+	setp.eq.s32	%p92, %r6, 4;
+	@%p92 bra 	BB12_113;
+	bra.uni 	BB12_77;
 
-BB12_119:
+BB12_113:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1383,11 +1381,11 @@ BB12_119:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r62, %r5, 20, 11;
-	add.s32 	%r63, %r62, -1012;
+	bfe.u32 	%r61, %r5, 20, 11;
+	add.s32 	%r62, %r61, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r63;
-	setp.eq.s64	%p113, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r62;
+	setp.eq.s64	%p119, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1404,267 +1402,267 @@ BB12_119:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd107, [retval0+0];
+	ld.param.f64	%fd105, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p114, %r4, 0;
-	and.pred  	%p2, %p114, %p113;
-	@!%p2 bra 	BB12_121;
-	bra.uni 	BB12_120;
+	setp.lt.s32	%p120, %r4, 0;
+	and.pred  	%p2, %p120, %p119;
+	@!%p2 bra 	BB12_115;
+	bra.uni 	BB12_114;
 
-BB12_120:
+BB12_114:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r64}, %fd107;
+	mov.b64 	{%temp, %r63}, %fd105;
 	}
-	xor.b32  	%r65, %r64, -2147483648;
+	xor.b32  	%r64, %r63, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r66, %temp}, %fd107;
+	mov.b64 	{%r65, %temp}, %fd105;
 	}
-	mov.b64 	%fd107, {%r66, %r65};
+	mov.b64 	%fd105, {%r65, %r64};
 
-BB12_121:
-	mov.f64 	%fd106, %fd107;
-	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
-	@%p115 bra 	BB12_124;
-	bra.uni 	BB12_122;
+BB12_115:
+	mov.f64 	%fd104, %fd105;
+	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
+	@%p121 bra 	BB12_118;
+	bra.uni 	BB12_116;
 
-BB12_124:
-	selp.b32	%r67, %r4, 0, %p113;
-	or.b32  	%r68, %r67, 2146435072;
-	setp.lt.s32	%p119, %r5, 0;
-	selp.b32	%r69, %r68, %r67, %p119;
-	mov.u32 	%r70, 0;
-	mov.b64 	%fd106, {%r70, %r69};
-	bra.uni 	BB12_125;
+BB12_118:
+	selp.b32	%r66, %r4, 0, %p119;
+	or.b32  	%r67, %r66, 2146435072;
+	setp.lt.s32	%p125, %r5, 0;
+	selp.b32	%r68, %r67, %r66, %p125;
+	mov.u32 	%r69, 0;
+	mov.b64 	%fd104, {%r69, %r68};
+	bra.uni 	BB12_119;
 
-BB12_96:
-	setp.gt.s32	%p71, %r6, 15;
-	@%p71 bra 	BB12_100;
+BB12_92:
+	setp.gt.s32	%p75, %r6, 15;
+	@%p75 bra 	BB12_96;
 
-	setp.eq.s32	%p75, %r6, 14;
-	@%p75 bra 	BB12_113;
-	bra.uni 	BB12_98;
+	setp.eq.s32	%p79, %r6, 14;
+	@%p79 bra 	BB12_107;
+	bra.uni 	BB12_94;
 
-BB12_113:
+BB12_107:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r56, %rd15;
-	cvt.u32.u64	%r57, %rd16;
-	or.b32  	%r58, %r57, %r56;
-	setp.eq.s32	%p105, %r58, 0;
-	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
-	bra.uni 	BB12_137;
+	cvt.u32.u64	%r55, %rd15;
+	cvt.u32.u64	%r56, %rd16;
+	or.b32  	%r57, %r56, %r55;
+	setp.eq.s32	%p111, %r57, 0;
+	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
+	bra.uni 	BB12_129;
 
 BB12_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB12_67;
+	@%p27 bra 	BB12_63;
 	bra.uni 	BB12_9;
 
-BB12_67:
-	mul.f64 	%fd99, %fd1, %fd68;
-	bra.uni 	BB12_69;
+BB12_63:
+	mul.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB12_65;
 
 BB12_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB12_47;
+	@%p14 bra 	BB12_45;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB12_46;
+	@%p15 bra 	BB12_44;
 	bra.uni 	BB12_26;
 
-BB12_46:
-	max.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB12_69;
+BB12_44:
+	max.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB12_65;
 
 BB12_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB12_50;
+	@%p21 bra 	BB12_48;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB12_49;
+	@%p22 bra 	BB12_47;
 	bra.uni 	BB12_17;
 
-BB12_49:
-	setp.lt.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB12_69;
+BB12_47:
+	setp.lt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB12_65;
 
 BB12_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB12_44;
+	@%p8 bra 	BB12_42;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB12_39;
+	@%p9 bra 	BB12_38;
 	bra.uni 	BB12_34;
 
-BB12_39:
-	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
-	or.pred  	%p36, %p34, %p35;
-	mov.f64 	%fd99, 0d7FF8000000000000;
-	@%p36 bra 	BB12_69;
+BB12_38:
+	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
+	or.pred  	%p37, %p35, %p36;
+	mov.f64 	%fd98, 0d7FF8000000000000;
+	@%p37 bra 	BB12_65;
 
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	abs.f64 	%fd72, %fd99;
-	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
-	@%p37 bra 	BB12_69;
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	abs.f64 	%fd72, %fd98;
+	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
+	@%p38 bra 	BB12_65;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r15}, %fd99;
+	mov.b64 	{%r15, %temp}, %fd98;
 	}
-	and.b32  	%r16, %r15, 2147483647;
-	setp.ne.s32	%p38, %r16, 2146435072;
-	@%p38 bra 	BB12_43;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r17, %temp}, %fd99;
+	mov.b64 	{%temp, %r16}, %fd98;
 	}
-	setp.eq.s32	%p39, %r17, 0;
-	@%p39 bra 	BB12_69;
-
-BB12_43:
-	cvt.rmi.f64.f64	%fd73, %fd99;
+	and.b32  	%r17, %r16, 2147483647;
+	setp.ne.s32	%p39, %r17, 2146435072;
+	setp.ne.s32	%p40, %r15, 0;
+	or.pred  	%p41, %p39, %p40;
+	@!%p41 bra 	BB12_65;
+	bra.uni 	BB12_41;
+
+BB12_41:
+	cvt.rmi.f64.f64	%fd73, %fd98;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd99, %fd68, %fd74;
-	bra.uni 	BB12_69;
+	sub.f64 	%fd98, %fd68, %fd74;
+	bra.uni 	BB12_65;
 
-BB12_76:
-	setp.eq.s32	%p91, %r6, 2;
-	@%p91 bra 	BB12_135;
-	bra.uni 	BB12_77;
+BB12_72:
+	setp.eq.s32	%p95, %r6, 2;
+	@%p95 bra 	BB12_127;
+	bra.uni 	BB12_73;
 
-BB12_135:
-	mul.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_127:
+	mul.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
-BB12_92:
-	setp.eq.s32	%p78, %r6, 11;
-	@%p78 bra 	BB12_115;
+BB12_88:
+	setp.eq.s32	%p82, %r6, 11;
+	@%p82 bra 	BB12_109;
 
-	setp.eq.s32	%p79, %r6, 12;
-	@%p79 bra 	BB12_114;
-	bra.uni 	BB12_94;
+	setp.eq.s32	%p83, %r6, 12;
+	@%p83 bra 	BB12_108;
+	bra.uni 	BB12_90;
 
-BB12_114:
-	max.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_108:
+	max.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
-BB12_83:
-	setp.eq.s32	%p85, %r6, 6;
-	@%p85 bra 	BB12_118;
+BB12_79:
+	setp.eq.s32	%p89, %r6, 6;
+	@%p89 bra 	BB12_112;
 
-	setp.eq.s32	%p86, %r6, 7;
-	@%p86 bra 	BB12_117;
-	bra.uni 	BB12_85;
+	setp.eq.s32	%p90, %r6, 7;
+	@%p90 bra 	BB12_111;
+	bra.uni 	BB12_81;
 
-BB12_117:
-	setp.gt.f64	%p110, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
-	bra.uni 	BB12_137;
+BB12_111:
+	setp.gt.f64	%p116, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
+	bra.uni 	BB12_129;
 
-BB12_100:
-	setp.eq.s32	%p72, %r6, 16;
-	@%p72 bra 	BB12_112;
+BB12_96:
+	setp.eq.s32	%p76, %r6, 16;
+	@%p76 bra 	BB12_106;
 
-	setp.eq.s32	%p73, %r6, 17;
-	@%p73 bra 	BB12_107;
-	bra.uni 	BB12_102;
+	setp.eq.s32	%p77, %r6, 17;
+	@%p77 bra 	BB12_102;
+	bra.uni 	BB12_98;
 
-BB12_107:
-	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
-	or.pred  	%p100, %p98, %p99;
-	mov.f64 	%fd108, 0d7FF8000000000000;
-	@%p100 bra 	BB12_137;
+BB12_102:
+	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
+	or.pred  	%p105, %p103, %p104;
+	mov.f64 	%fd106, 0d7FF8000000000000;
+	@%p105 bra 	BB12_129;
 
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	abs.f64 	%fd83, %fd108;
-	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
-	@%p101 bra 	BB12_137;
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	abs.f64 	%fd83, %fd106;
+	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
+	@%p106 bra 	BB12_129;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd108;
+	mov.b64 	{%r52, %temp}, %fd106;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p102, %r54, 2146435072;
-	@%p102 bra 	BB12_111;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r55, %temp}, %fd108;
+	mov.b64 	{%temp, %r53}, %fd106;
 	}
-	setp.eq.s32	%p103, %r55, 0;
-	@%p103 bra 	BB12_137;
-
-BB12_111:
-	cvt.rmi.f64.f64	%fd84, %fd108;
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p107, %r54, 2146435072;
+	setp.ne.s32	%p108, %r52, 0;
+	or.pred  	%p109, %p107, %p108;
+	@!%p109 bra 	BB12_129;
+	bra.uni 	BB12_105;
+
+BB12_105:
+	cvt.rmi.f64.f64	%fd84, %fd106;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd108, %fd1, %fd85;
-	bra.uni 	BB12_137;
+	sub.f64 	%fd106, %fd1, %fd85;
+	bra.uni 	BB12_129;
 
 BB12_6:
 	setp.eq.s32	%p30, %r6, 1;
 	@%p30 bra 	BB12_7;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_7:
-	sub.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB12_69;
+	sub.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB12_65;
 
 BB12_22:
 	setp.eq.s32	%p18, %r6, 10;
 	@%p18 bra 	BB12_23;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_23:
-	setp.neu.f64	%p43, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
-	bra.uni 	BB12_69;
+	setp.neu.f64	%p45, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
+	bra.uni 	BB12_65;
 
 BB12_13:
 	setp.eq.s32	%p25, %r6, 5;
 	@%p25 bra 	BB12_14;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_14:
-	setp.gt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB12_69;
+	setp.gt.f64	%p50, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB12_65;
 
 BB12_30:
 	setp.eq.s32	%p12, %r6, 15;
 	@%p12 bra 	BB12_31;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd99, %fd77, %fd76;
-	bra.uni 	BB12_69;
+	sub.f64 	%fd98, %fd77, %fd76;
+	bra.uni 	BB12_65;
 
 BB12_9:
 	setp.eq.s32	%p28, %r6, 3;
 	@%p28 bra 	BB12_10;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_10:
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB12_69;
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB12_65;
 
-BB12_47:
-	min.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB12_69;
+BB12_45:
+	min.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB12_65;
 
 BB12_26:
 	setp.eq.s32	%p16, %r6, 13;
 	@%p16 bra 	BB12_27;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
@@ -1672,348 +1670,344 @@ BB12_27:
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p42, %r23, 0;
-	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
-	bra.uni 	BB12_69;
+	setp.eq.s32	%p44, %r23, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
+	bra.uni 	BB12_65;
 
-BB12_50:
-	setp.ge.f64	%p47, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB12_69;
+BB12_48:
+	setp.ge.f64	%p49, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
+	bra.uni 	BB12_65;
 
 BB12_17:
 	setp.eq.s32	%p23, %r6, 8;
 	@%p23 bra 	BB12_18;
-	bra.uni 	BB12_69;
+	bra.uni 	BB12_65;
 
 BB12_18:
-	setp.le.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB12_69;
+	setp.le.f64	%p47, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB12_65;
 
-BB12_44:
-	setp.neu.f64	%p40, %fd68, 0d0000000000000000;
+BB12_42:
+	setp.neu.f64	%p42, %fd68, 0d0000000000000000;
 	sub.f64 	%fd75, %fd68, %fd1;
-	selp.f64	%fd99, %fd75, 0d0000000000000000, %p40;
-	bra.uni 	BB12_69;
+	selp.f64	%fd98, %fd75, 0d0000000000000000, %p42;
+	bra.uni 	BB12_65;
 
 BB12_34:
 	setp.ne.s32	%p10, %r6, 18;
-	@%p10 bra 	BB12_69;
+	@%p10 bra 	BB12_65;
 
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	abs.f64 	%fd70, %fd99;
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	abs.f64 	%fd70, %fd98;
 	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
-	@%p31 bra 	BB12_69;
+	@%p31 bra 	BB12_65;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r12}, %fd99;
+	mov.b64 	{%r12, %temp}, %fd98;
 	}
-	and.b32  	%r13, %r12, 2147483647;
-	setp.ne.s32	%p32, %r13, 2146435072;
-	@%p32 bra 	BB12_38;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r14, %temp}, %fd99;
+	mov.b64 	{%temp, %r13}, %fd98;
 	}
-	setp.eq.s32	%p33, %r14, 0;
-	@%p33 bra 	BB12_69;
+	and.b32  	%r14, %r13, 2147483647;
+	setp.ne.s32	%p32, %r14, 2146435072;
+	setp.ne.s32	%p33, %r12, 0;
+	or.pred  	%p34, %p32, %p33;
+	@!%p34 bra 	BB12_65;
+	bra.uni 	BB12_37;
 
-BB12_38:
-	cvt.rmi.f64.f64	%fd99, %fd99;
-	bra.uni 	BB12_69;
+BB12_37:
+	cvt.rmi.f64.f64	%fd98, %fd98;
+	bra.uni 	BB12_65;
 
-BB12_74:
-	setp.eq.s32	%p94, %r6, 1;
-	@%p94 bra 	BB12_75;
-	bra.uni 	BB12_137;
+BB12_70:
+	setp.eq.s32	%p98, %r6, 1;
+	@%p98 bra 	BB12_71;
+	bra.uni 	BB12_129;
 
-BB12_75:
-	sub.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_71:
+	sub.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
-BB12_90:
-	setp.eq.s32	%p82, %r6, 10;
-	@%p82 bra 	BB12_91;
-	bra.uni 	BB12_137;
+BB12_86:
+	setp.eq.s32	%p86, %r6, 10;
+	@%p86 bra 	BB12_87;
+	bra.uni 	BB12_129;
 
-BB12_91:
-	setp.neu.f64	%p107, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
-	bra.uni 	BB12_137;
+BB12_87:
+	setp.neu.f64	%p113, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
+	bra.uni 	BB12_129;
 
-BB12_81:
-	setp.eq.s32	%p89, %r6, 5;
-	@%p89 bra 	BB12_82;
-	bra.uni 	BB12_137;
+BB12_77:
+	setp.eq.s32	%p93, %r6, 5;
+	@%p93 bra 	BB12_78;
+	bra.uni 	BB12_129;
 
-BB12_82:
-	setp.lt.f64	%p112, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
-	bra.uni 	BB12_137;
+BB12_78:
+	setp.lt.f64	%p118, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
+	bra.uni 	BB12_129;
 
-BB12_98:
-	setp.eq.s32	%p76, %r6, 15;
-	@%p76 bra 	BB12_99;
-	bra.uni 	BB12_137;
+BB12_94:
+	setp.eq.s32	%p80, %r6, 15;
+	@%p80 bra 	BB12_95;
+	bra.uni 	BB12_129;
 
-BB12_99:
+BB12_95:
 	mul.f64 	%fd87, %fd1, %fd68;
 	mov.f64 	%fd88, 0d3FF0000000000000;
-	sub.f64 	%fd108, %fd88, %fd87;
-	bra.uni 	BB12_137;
+	sub.f64 	%fd106, %fd88, %fd87;
+	bra.uni 	BB12_129;
 
-BB12_77:
-	setp.eq.s32	%p92, %r6, 3;
-	@%p92 bra 	BB12_78;
-	bra.uni 	BB12_137;
+BB12_73:
+	setp.eq.s32	%p96, %r6, 3;
+	@%p96 bra 	BB12_74;
+	bra.uni 	BB12_129;
 
-BB12_78:
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_74:
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
-BB12_115:
-	min.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB12_137;
+BB12_109:
+	min.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB12_129;
 
-BB12_94:
-	setp.eq.s32	%p80, %r6, 13;
-	@%p80 bra 	BB12_95;
-	bra.uni 	BB12_137;
+BB12_90:
+	setp.eq.s32	%p84, %r6, 13;
+	@%p84 bra 	BB12_91;
+	bra.uni 	BB12_129;
 
-BB12_95:
+BB12_91:
 	cvt.rni.s64.f64	%rd17, %fd1;
 	cvt.rni.s64.f64	%rd18, %fd68;
-	cvt.u32.u64	%r59, %rd17;
-	cvt.u32.u64	%r60, %rd18;
-	and.b32  	%r61, %r60, %r59;
-	setp.eq.s32	%p106, %r61, 0;
-	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
-	bra.uni 	BB12_137;
+	cvt.u32.u64	%r58, %rd17;
+	cvt.u32.u64	%r59, %rd18;
+	and.b32  	%r60, %r59, %r58;
+	setp.eq.s32	%p112, %r60, 0;
+	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
+	bra.uni 	BB12_129;
 
-BB12_118:
-	setp.le.f64	%p111, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
-	bra.uni 	BB12_137;
+BB12_112:
+	setp.le.f64	%p117, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
+	bra.uni 	BB12_129;
 
-BB12_85:
-	setp.eq.s32	%p87, %r6, 8;
-	@%p87 bra 	BB12_86;
-	bra.uni 	BB12_137;
+BB12_81:
+	setp.eq.s32	%p91, %r6, 8;
+	@%p91 bra 	BB12_82;
+	bra.uni 	BB12_129;
 
-BB12_86:
-	setp.ge.f64	%p109, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
-	bra.uni 	BB12_137;
+BB12_82:
+	setp.ge.f64	%p115, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
+	bra.uni 	BB12_129;
 
-BB12_112:
-	setp.neu.f64	%p104, %fd1, 0d0000000000000000;
+BB12_106:
+	setp.neu.f64	%p110, %fd1, 0d0000000000000000;
 	sub.f64 	%fd86, %fd1, %fd68;
-	selp.f64	%fd108, %fd86, 0d0000000000000000, %p104;
-	bra.uni 	BB12_137;
+	selp.f64	%fd106, %fd86, 0d0000000000000000, %p110;
+	bra.uni 	BB12_129;
 
-BB12_102:
-	setp.ne.s32	%p74, %r6, 18;
-	@%p74 bra 	BB12_137;
+BB12_98:
+	setp.ne.s32	%p78, %r6, 18;
+	@%p78 bra 	BB12_129;
 
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	abs.f64 	%fd81, %fd108;
-	setp.gtu.f64	%p95, %fd81, 0d7FF0000000000000;
-	@%p95 bra 	BB12_137;
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	abs.f64 	%fd81, %fd106;
+	setp.gtu.f64	%p99, %fd81, 0d7FF0000000000000;
+	@%p99 bra 	BB12_129;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r50}, %fd108;
+	mov.b64 	{%r49, %temp}, %fd106;
 	}
-	and.b32  	%r51, %r50, 2147483647;
-	setp.ne.s32	%p96, %r51, 2146435072;
-	@%p96 bra 	BB12_106;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd108;
+	mov.b64 	{%temp, %r50}, %fd106;
 	}
-	setp.eq.s32	%p97, %r52, 0;
-	@%p97 bra 	BB12_137;
+	and.b32  	%r51, %r50, 2147483647;
+	setp.ne.s32	%p100, %r51, 2146435072;
+	setp.ne.s32	%p101, %r49, 0;
+	or.pred  	%p102, %p100, %p101;
+	@!%p102 bra 	BB12_129;
+	bra.uni 	BB12_101;
 
-BB12_106:
-	cvt.rmi.f64.f64	%fd108, %fd108;
-	bra.uni 	BB12_137;
+BB12_101:
+	cvt.rmi.f64.f64	%fd106, %fd106;
+	bra.uni 	BB12_129;
 
-BB12_54:
-	setp.gt.s32	%p52, %r2, -1;
-	@%p52 bra 	BB12_57;
+BB12_52:
+	setp.gt.s32	%p54, %r2, -1;
+	@%p54 bra 	BB12_55;
 
 	cvt.rzi.f64.f64	%fd78, %fd1;
-	setp.neu.f64	%p53, %fd78, %fd1;
-	selp.f64	%fd97, 0dFFF8000000000000, %fd97, %p53;
+	setp.neu.f64	%p55, %fd78, %fd1;
+	selp.f64	%fd96, 0dFFF8000000000000, %fd96, %p55;
 
-BB12_57:
-	mov.f64 	%fd24, %fd97;
+BB12_55:
+	mov.f64 	%fd24, %fd96;
 	add.f64 	%fd25, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r33}, %fd25;
 	}
 	and.b32  	%r34, %r33, 2146435072;
-	setp.ne.s32	%p56, %r34, 2146435072;
-	mov.f64 	%fd96, %fd24;
-	@%p56 bra 	BB12_66;
+	setp.ne.s32	%p58, %r34, 2146435072;
+	mov.f64 	%fd95, %fd24;
+	@%p58 bra 	BB12_62;
 
-	setp.gtu.f64	%p57, %fd18, 0d7FF0000000000000;
-	mov.f64 	%fd96, %fd25;
-	@%p57 bra 	BB12_66;
-
-	abs.f64 	%fd79, %fd1;
-	setp.gtu.f64	%p58, %fd79, 0d7FF0000000000000;
+	setp.gtu.f64	%p59, %fd18, 0d7FF0000000000000;
 	mov.f64 	%fd95, %fd25;
-	mov.f64 	%fd96, %fd95;
-	@%p58 bra 	BB12_66;
-
-	and.b32  	%r35, %r3, 2147483647;
-	setp.ne.s32	%p59, %r35, 2146435072;
 	@%p59 bra 	BB12_62;
 
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r36, %temp}, %fd1;
-	}
-	setp.eq.s32	%p60, %r36, 0;
-	@%p60 bra 	BB12_65;
-
-BB12_62:
-	and.b32  	%r37, %r2, 2147483647;
-	setp.ne.s32	%p61, %r37, 2146435072;
-	mov.f64 	%fd93, %fd24;
-	mov.f64 	%fd96, %fd93;
-	@%p61 bra 	BB12_66;
+	abs.f64 	%fd79, %fd1;
+	setp.gtu.f64	%p60, %fd79, 0d7FF0000000000000;
+	mov.f64 	%fd94, %fd25;
+	mov.f64 	%fd95, %fd94;
+	@%p60 bra 	BB12_62;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r38, %temp}, %fd68;
+	mov.b64 	{%r35, %temp}, %fd1;
 	}
-	setp.ne.s32	%p62, %r38, 0;
-	mov.f64 	%fd96, %fd24;
-	@%p62 bra 	BB12_66;
-
-	shr.s32 	%r39, %r3, 31;
-	and.b32  	%r40, %r39, -2146435072;
-	add.s32 	%r41, %r40, 2146435072;
-	or.b32  	%r42, %r41, -2147483648;
-	selp.b32	%r43, %r42, %r41, %p1;
-	mov.u32 	%r44, 0;
-	mov.b64 	%fd96, {%r44, %r43};
-	bra.uni 	BB12_66;
+	and.b32  	%r36, %r3, 2147483647;
+	setp.eq.s32	%p61, %r36, 2146435072;
+	setp.eq.s32	%p62, %r35, 0;
+	and.pred  	%p63, %p61, %p62;
+	@%p63 bra 	BB12_61;
+	bra.uni 	BB12_59;
+
+BB12_61:
+	setp.gt.f64	%p67, %fd18, 0d3FF0000000000000;
+	selp.b32	%r44, 2146435072, 0, %p67;
+	xor.b32  	%r45, %r44, 2146435072;
+	setp.lt.s32	%p68, %r3, 0;
+	selp.b32	%r46, %r45, %r44, %p68;
+	setp.eq.f64	%p69, %fd68, 0dBFF0000000000000;
+	selp.b32	%r47, 1072693248, %r46, %p69;
+	mov.u32 	%r48, 0;
+	mov.b64 	%fd95, {%r48, %r47};
+	bra.uni 	BB12_62;
 
-BB12_122:
-	setp.gt.s32	%p116, %r4, -1;
-	@%p116 bra 	BB12_125;
+BB12_116:
+	setp.gt.s32	%p122, %r4, -1;
+	@%p122 bra 	BB12_119;
 
 	cvt.rzi.f64.f64	%fd89, %fd68;
-	setp.neu.f64	%p117, %fd89, %fd68;
-	selp.f64	%fd106, 0dFFF8000000000000, %fd106, %p117;
+	setp.neu.f64	%p123, %fd89, %fd68;
+	selp.f64	%fd104, 0dFFF8000000000000, %fd104, %p123;
 
-BB12_125:
-	mov.f64 	%fd57, %fd106;
+BB12_119:
+	mov.f64 	%fd57, %fd104;
 	add.f64 	%fd58, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r71}, %fd58;
+	mov.b64 	{%temp, %r70}, %fd58;
 	}
-	and.b32  	%r72, %r71, 2146435072;
-	setp.ne.s32	%p120, %r72, 2146435072;
-	mov.f64 	%fd105, %fd57;
-	@%p120 bra 	BB12_134;
+	and.b32  	%r71, %r70, 2146435072;
+	setp.ne.s32	%p126, %r71, 2146435072;
+	mov.f64 	%fd103, %fd57;
+	@%p126 bra 	BB12_126;
 
-	setp.gtu.f64	%p121, %fd51, 0d7FF0000000000000;
-	mov.f64 	%fd105, %fd58;
-	@%p121 bra 	BB12_134;
+	setp.gtu.f64	%p127, %fd51, 0d7FF0000000000000;
+	mov.f64 	%fd103, %fd58;
+	@%p127 bra 	BB12_126;
 
 	abs.f64 	%fd90, %fd68;
-	setp.gtu.f64	%p122, %fd90, 0d7FF0000000000000;
-	mov.f64 	%fd104, %fd58;
-	mov.f64 	%fd105, %fd104;
-	@%p122 bra 	BB12_134;
+	setp.gtu.f64	%p128, %fd90, 0d7FF0000000000000;
+	mov.f64 	%fd102, %fd58;
+	mov.f64 	%fd103, %fd102;
+	@%p128 bra 	BB12_126;
 
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r72, %temp}, %fd68;
+	}
 	and.b32  	%r73, %r5, 2147483647;
-	setp.ne.s32	%p123, %r73, 2146435072;
-	@%p123 bra 	BB12_130;
+	setp.eq.s32	%p129, %r73, 2146435072;
+	setp.eq.s32	%p130, %r72, 0;
+	and.pred  	%p131, %p129, %p130;
+	@%p131 bra 	BB12_125;
+	bra.uni 	BB12_123;
 
+BB12_125:
+	setp.gt.f64	%p135, %fd51, 0d3FF0000000000000;
+	selp.b32	%r81, 2146435072, 0, %p135;
+	xor.b32  	%r82, %r81, 2146435072;
+	setp.lt.s32	%p136, %r5, 0;
+	selp.b32	%r83, %r82, %r81, %p136;
+	setp.eq.f64	%p137, %fd1, 0dBFF0000000000000;
+	selp.b32	%r84, 1072693248, %r83, %p137;
+	mov.u32 	%r85, 0;
+	mov.b64 	%fd103, {%r85, %r84};
+	bra.uni 	BB12_126;
+
+BB12_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r74, %temp}, %fd68;
+	mov.b64 	{%r37, %temp}, %fd68;
 	}
-	setp.eq.s32	%p124, %r74, 0;
-	@%p124 bra 	BB12_133;
+	and.b32  	%r38, %r2, 2147483647;
+	setp.eq.s32	%p64, %r38, 2146435072;
+	setp.eq.s32	%p65, %r37, 0;
+	and.pred  	%p66, %p64, %p65;
+	mov.f64 	%fd95, %fd24;
+	@!%p66 bra 	BB12_62;
+	bra.uni 	BB12_60;
+
+BB12_60:
+	shr.s32 	%r39, %r3, 31;
+	and.b32  	%r40, %r39, -2146435072;
+	selp.b32	%r41, -1048576, 2146435072, %p1;
+	add.s32 	%r42, %r41, %r40;
+	mov.u32 	%r43, 0;
+	mov.b64 	%fd95, {%r43, %r42};
 
-BB12_130:
-	and.b32  	%r75, %r4, 2147483647;
-	setp.ne.s32	%p125, %r75, 2146435072;
-	mov.f64 	%fd102, %fd57;
-	mov.f64 	%fd105, %fd102;
-	@%p125 bra 	BB12_134;
+BB12_62:
+	setp.eq.f64	%p70, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd68, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd98, 0d3FF0000000000000, %fd95, %p72;
 
+BB12_65:
+	st.global.f64 	[%rd1], %fd98;
+	bra.uni 	BB12_130;
+
+BB12_123:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r76, %temp}, %fd1;
+	mov.b64 	{%r74, %temp}, %fd1;
 	}
-	setp.ne.s32	%p126, %r76, 0;
-	mov.f64 	%fd105, %fd57;
-	@%p126 bra 	BB12_134;
-
-	shr.s32 	%r77, %r5, 31;
-	and.b32  	%r78, %r77, -2146435072;
-	add.s32 	%r79, %r78, 2146435072;
-	or.b32  	%r80, %r79, -2147483648;
-	selp.b32	%r81, %r80, %r79, %p2;
-	mov.u32 	%r82, 0;
-	mov.b64 	%fd105, {%r82, %r81};
-	bra.uni 	BB12_134;
+	and.b32  	%r75, %r4, 2147483647;
+	setp.eq.s32	%p132, %r75, 2146435072;
+	setp.eq.s32	%p133, %r74, 0;
+	and.pred  	%p134, %p132, %p133;
+	mov.f64 	%fd103, %fd57;
+	@!%p134 bra 	BB12_126;
+	bra.uni 	BB12_124;
 
-BB12_65:
-	setp.gt.f64	%p63, %fd18, 0d3FF0000000000000;
-	selp.b32	%r45, 2146435072, 0, %p63;
-	xor.b32  	%r46, %r45, 2146435072;
-	setp.lt.s32	%p64, %r3, 0;
-	selp.b32	%r47, %r46, %r45, %p64;
-	setp.eq.f64	%p65, %fd68, 0dBFF0000000000000;
-	selp.b32	%r48, 1072693248, %r47, %p65;
-	mov.u32 	%r49, 0;
-	mov.b64 	%fd96, {%r49, %r48};
+BB12_124:
+	shr.s32 	%r76, %r5, 31;
+	and.b32  	%r77, %r76, -2146435072;
+	selp.b32	%r78, -1048576, 2146435072, %p2;
+	add.s32 	%r79, %r78, %r77;
+	mov.u32 	%r80, 0;
+	mov.b64 	%fd103, {%r80, %r79};
+
+BB12_126:
+	setp.eq.f64	%p138, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p139, %fd1, 0d3FF0000000000000;
+	or.pred  	%p140, %p139, %p138;
+	selp.f64	%fd106, 0d3FF0000000000000, %fd103, %p140;
+
+BB12_129:
+	st.global.f64 	[%rd1], %fd106;
 
-BB12_66:
-	setp.eq.f64	%p66, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p67, %fd68, 0d3FF0000000000000;
-	or.pred  	%p68, %p67, %p66;
-	selp.f64	%fd99, 0d3FF0000000000000, %fd96, %p68;
-
-BB12_69:
-	st.global.f64 	[%rd1], %fd99;
-	bra.uni 	BB12_138;
-
-BB12_133:
-	setp.gt.f64	%p127, %fd51, 0d3FF0000000000000;
-	selp.b32	%r83, 2146435072, 0, %p127;
-	xor.b32  	%r84, %r83, 2146435072;
-	setp.lt.s32	%p128, %r5, 0;
-	selp.b32	%r85, %r84, %r83, %p128;
-	setp.eq.f64	%p129, %fd1, 0dBFF0000000000000;
-	selp.b32	%r86, 1072693248, %r85, %p129;
-	mov.u32 	%r87, 0;
-	mov.b64 	%fd105, {%r87, %r86};
-
-BB12_134:
-	setp.eq.f64	%p130, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p131, %fd1, 0d3FF0000000000000;
-	or.pred  	%p132, %p131, %p130;
-	selp.f64	%fd108, 0d3FF0000000000000, %fd105, %p132;
-
-BB12_137:
-	st.global.f64 	[%rd1], %fd108;
-
-BB12_138:
+BB12_130:
 	bar.sync 	0;
 	ret;
 }
@@ -4431,7 +4425,7 @@ BB34_2:
 	.local .align 4 .b8 	__local_depot35[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<9>;
 	.reg .b32 	%r<18>;
 	.reg .f64 	%fd<41>;
 	.reg .b64 	%rd<17>;
@@ -4449,7 +4443,7 @@ BB34_2:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r7, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB35_11;
+	@%p1 bra 	BB35_10;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4458,23 +4452,24 @@ BB34_2:
 	ld.global.f64 	%fd38, [%rd8];
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r9}, %fd38;
+	mov.b64 	{%r9, %temp}, %fd38;
 	}
-	and.b32  	%r10, %r9, 2147483647;
-	setp.ne.s32	%p2, %r10, 2146435072;
-	@%p2 bra 	BB35_4;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r11, %temp}, %fd38;
+	mov.b64 	{%temp, %r10}, %fd38;
 	}
-	setp.ne.s32	%p3, %r11, 0;
-	@%p3 bra 	BB35_4;
+	and.b32  	%r11, %r10, 2147483647;
+	setp.eq.s32	%p2, %r11, 2146435072;
+	setp.eq.s32	%p3, %r9, 0;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB35_3;
+	bra.uni 	BB35_2;
 
+BB35_2:
 	mov.f64 	%fd14, 0d0000000000000000;
 	mul.rn.f64 	%fd38, %fd38, %fd14;
 
-BB35_4:
+BB35_3:
 	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r17, %fd15;
 	st.local.u32 	[%rd1], %r17;
@@ -4491,8 +4486,8 @@ BB35_4:
 	mov.b64 	{%temp, %r12}, %fd38;
 	}
 	and.b32  	%r13, %r12, 2145386496;
-	setp.lt.u32	%p4, %r13, 1105199104;
-	@%p4 bra 	BB35_6;
+	setp.lt.u32	%p5, %r13, 1105199104;
+	@%p5 bra 	BB35_5;
 
 	// Callseq Start 3
 	{
@@ -4515,11 +4510,11 @@ BB35_4:
 	}// Callseq End 3
 	ld.local.u32 	%r17, [%rd1];
 
-BB35_6:
+BB35_5:
 	and.b32  	%r14, %r17, 1;
 	shl.b32 	%r15, %r14, 3;
-	setp.eq.s32	%p5, %r14, 0;
-	selp.f64	%fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+	setp.eq.b32	%p6, %r14, 1;
+	selp.f64	%fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
 	mul.wide.u32 	%rd10, %r15, 8;
 	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
 	add.s64 	%rd12, %rd10, %rd11;
@@ -4537,27 +4532,179 @@ BB35_6:
 	ld.const.f64 	%fd34, [%rd12+48];
 	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
 	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
-	@%p5 bra 	BB35_8;
+	setp.eq.s32	%p7, %r14, 0;
+	@%p7 bra 	BB35_7;
 
 	mov.f64 	%fd35, 0d3FF0000000000000;
 	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
 
-BB35_8:
+BB35_7:
 	and.b32  	%r16, %r17, 2;
-	setp.eq.s32	%p6, %r16, 0;
-	@%p6 bra 	BB35_10;
+	setp.eq.s32	%p8, %r16, 0;
+	@%p8 bra 	BB35_9;
 
 	mov.f64 	%fd36, 0d0000000000000000;
 	mov.f64 	%fd37, 0dBFF0000000000000;
 	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
 
-BB35_10:
+BB35_9:
 	cvta.to.global.u64 	%rd13, %rd4;
 	shl.b64 	%rd14, %rd2, 3;
 	add.s64 	%rd15, %rd13, %rd14;
 	st.global.f64 	[%rd15], %fd40;
 
-BB35_11:
+BB35_10:
+	ret;
+}
+
+	// .globl	matrix_sinh
+.visible .entry matrix_sinh(
+	.param .u64 matrix_sinh_param_0,
+	.param .u64 matrix_sinh_param_1,
+	.param .u32 matrix_sinh_param_2
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<24>;
+	.reg .f64 	%fd<68>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_sinh_param_0];
+	ld.param.u64 	%rd3, [matrix_sinh_param_1];
+	ld.param.u32 	%r3, [matrix_sinh_param_2];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	setp.ge.u32	%p1, %r1, %r3;
+	@%p1 bra 	BB36_5;
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd5, [%rd6];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd5;
+	}
+	and.b32  	%r7, %r2, 2147483647;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r8, %temp}, %fd5;
+	}
+	mov.b64 	%fd1, {%r8, %r7};
+	setp.lt.u32	%p2, %r7, 1072693248;
+	@%p2 bra 	BB36_3;
+	bra.uni 	BB36_2;
+
+BB36_3:
+	mul.f64 	%fd51, %fd1, %fd1;
+	mov.f64 	%fd52, 0d3DE611A561D87DEF;
+	mov.f64 	%fd53, 0d3D6B4C75AB274C53;
+	fma.rn.f64 	%fd54, %fd53, %fd51, %fd52;
+	mov.f64 	%fd55, 0d3E5AE64671B18F5C;
+	fma.rn.f64 	%fd56, %fd54, %fd51, %fd55;
+	mov.f64 	%fd57, 0d3EC71DE3A465B1E4;
+	fma.rn.f64 	%fd58, %fd56, %fd51, %fd57;
+	mov.f64 	%fd59, 0d3F2A01A01A02899D;
+	fma.rn.f64 	%fd60, %fd58, %fd51, %fd59;
+	mov.f64 	%fd61, 0d3F811111111110A6;
+	fma.rn.f64 	%fd62, %fd60, %fd51, %fd61;
+	mov.f64 	%fd63, 0d3FC5555555555556;
+	fma.rn.f64 	%fd64, %fd62, %fd51, %fd63;
+	mul.f64 	%fd65, %fd51, %fd64;
+	fma.rn.f64 	%fd67, %fd65, %fd1, %fd1;
+	bra.uni 	BB36_4;
+
+BB36_2:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r9}, %fd1;
+	}
+	mov.f64 	%fd6, 0d4338000000000000;
+	mov.f64 	%fd7, 0d3FF71547652B82FE;
+	fma.rn.f64 	%fd8, %fd1, %fd7, %fd6;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r10, %temp}, %fd8;
+	}
+	add.s32 	%r11, %r10, -1;
+	mov.f64 	%fd9, 0dC338000000000000;
+	add.rn.f64 	%fd10, %fd8, %fd9;
+	mov.f64 	%fd11, 0dBFE62E42FEFA39EF;
+	fma.rn.f64 	%fd12, %fd10, %fd11, %fd1;
+	mov.f64 	%fd13, 0dBC7ABC9E3B39803F;
+	fma.rn.f64 	%fd14, %fd10, %fd13, %fd12;
+	add.s32 	%r12, %r9, %r9;
+	setp.lt.u32	%p3, %r12, 2142496327;
+	selp.b32	%r13, 0, %r11, %p3;
+	selp.f64	%fd15, %fd1, %fd14, %p3;
+	mov.f64 	%fd16, 0d3E5AF86D8EBD13CD;
+	mov.f64 	%fd17, 0d3E21F4076ACD15B6;
+	fma.rn.f64 	%fd18, %fd17, %fd15, %fd16;
+	mov.f64 	%fd19, 0d3E927E5092BA033D;
+	fma.rn.f64 	%fd20, %fd18, %fd15, %fd19;
+	mov.f64 	%fd21, 0d3EC71DDE6C5F9DA1;
+	fma.rn.f64 	%fd22, %fd20, %fd15, %fd21;
+	mov.f64 	%fd23, 0d3EFA01A018D034E6;
+	fma.rn.f64 	%fd24, %fd22, %fd15, %fd23;
+	mov.f64 	%fd25, 0d3F2A01A01B3B6940;
+	fma.rn.f64 	%fd26, %fd24, %fd15, %fd25;
+	mov.f64 	%fd27, 0d3F56C16C16C1B5DD;
+	fma.rn.f64 	%fd28, %fd26, %fd15, %fd27;
+	mov.f64 	%fd29, 0d3F8111111110F74D;
+	fma.rn.f64 	%fd30, %fd28, %fd15, %fd29;
+	mov.f64 	%fd31, 0d3FA555555555554D;
+	fma.rn.f64 	%fd32, %fd30, %fd15, %fd31;
+	mov.f64 	%fd33, 0d3FC5555555555557;
+	fma.rn.f64 	%fd34, %fd32, %fd15, %fd33;
+	mov.f64 	%fd35, 0d3FE0000000000000;
+	fma.rn.f64 	%fd36, %fd34, %fd15, %fd35;
+	mul.f64 	%fd37, %fd15, %fd36;
+	fma.rn.f64 	%fd38, %fd37, %fd15, %fd15;
+	setp.eq.s32	%p4, %r13, 1024;
+	selp.b32	%r14, -1, 0, %p4;
+	add.s32 	%r15, %r14, %r13;
+	shl.b32 	%r16, %r15, 20;
+	add.s32 	%r17, %r16, 1072693248;
+	mov.u32 	%r18, 0;
+	mov.b64 	%fd39, {%r18, %r17};
+	mov.u32 	%r19, 1071644672;
+	mov.b64 	%fd40, {%r18, %r19};
+	sub.f64 	%fd41, %fd39, %fd40;
+	fma.rn.f64 	%fd42, %fd38, %fd39, %fd41;
+	add.f64 	%fd43, %fd42, %fd42;
+	selp.f64	%fd44, %fd43, %fd42, %p4;
+	setp.eq.s32	%p5, %r12, 0;
+	selp.f64	%fd45, %fd15, %fd44, %p5;
+	mov.f64 	%fd46, 0d3FF0000000000000;
+	mov.f64 	%fd47, 0d4000000000000000;
+	fma.rn.f64 	%fd48, %fd47, %fd45, %fd46;
+	div.rn.f64 	%fd49, %fd45, %fd48;
+	add.f64 	%fd50, %fd49, %fd45;
+	setp.ge.f64	%p6, %fd1, 0d408633CE8FB9F87E;
+	selp.f64	%fd67, 0d7FF0000000000000, %fd50, %p6;
+
+BB36_4:
+	cvta.to.global.u64 	%rd7, %rd3;
+	and.b32  	%r20, %r2, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r21}, %fd67;
+	}
+	or.b32  	%r22, %r21, %r20;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r23, %temp}, %fd67;
+	}
+	mov.b64 	%fd66, {%r23, %r22};
+	shl.b64 	%rd8, %rd1, 3;
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd66;
+
+BB36_5:
 	ret;
 }
 
@@ -4568,16 +4715,16 @@ BB35_11:
 	.param .u32 matrix_cos_param_2
 )
 {
-	.local .align 4 .b8 	__local_depot36[4];
+	.local .align 4 .b8 	__local_depot37[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<9>;
 	.reg .b32 	%r<19>;
 	.reg .f64 	%fd<41>;
 	.reg .b64 	%rd<17>;
 
 
-	mov.u64 	%rd16, __local_depot36;
+	mov.u64 	%rd16, __local_depot37;
 	cvta.local.u64 	%SP, %rd16;
 	ld.param.u64 	%rd3, [matrix_cos_param_0];
 	ld.param.u64 	%rd4, [matrix_cos_param_1];
@@ -4589,7 +4736,7 @@ BB35_11:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB36_11;
+	@%p1 bra 	BB37_10;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4598,23 +4745,24 @@ BB35_11:
 	ld.global.f64 	%fd38, [%rd8];
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r10}, %fd38;
+	mov.b64 	{%r10, %temp}, %fd38;
 	}
-	and.b32  	%r11, %r10, 2147483647;
-	setp.ne.s32	%p2, %r11, 2146435072;
-	@%p2 bra 	BB36_4;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r12, %temp}, %fd38;
+	mov.b64 	{%temp, %r11}, %fd38;
 	}
-	setp.ne.s32	%p3, %r12, 0;
-	@%p3 bra 	BB36_4;
+	and.b32  	%r12, %r11, 2147483647;
+	setp.eq.s32	%p2, %r12, 2146435072;
+	setp.eq.s32	%p3, %r10, 0;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB37_3;
+	bra.uni 	BB37_2;
 
+BB37_2:
 	mov.f64 	%fd14, 0d0000000000000000;
 	mul.rn.f64 	%fd38, %fd38, %fd14;
 
-BB36_4:
+BB37_3:
 	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r18, %fd15;
 	st.local.u32 	[%rd1], %r18;
@@ -4631,8 +4779,8 @@ BB36_4:
 	mov.b64 	{%temp, %r13}, %fd38;
 	}
 	and.b32  	%r14, %r13, 2145386496;
-	setp.lt.u32	%p4, %r14, 1105199104;
-	@%p4 bra 	BB36_6;
+	setp.lt.u32	%p5, %r14, 1105199104;
+	@%p5 bra 	BB37_5;
 
 	// Callseq Start 4
 	{
@@ -4655,12 +4803,12 @@ BB36_4:
 	}// Callseq End 4
 	ld.local.u32 	%r18, [%rd1];
 
-BB36_6:
+BB37_5:
 	add.s32 	%r5, %r18, 1;
 	and.b32  	%r15, %r5, 1;
 	shl.b32 	%r16, %r15, 3;
-	setp.eq.s32	%p5, %r15, 0;
-	selp.f64	%fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+	setp.eq.b32	%p6, %r15, 1;
+	selp.f64	%fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
 	mul.wide.u32 	%rd10, %r16, 8;
 	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
 	add.s64 	%rd12, %rd10, %rd11;
@@ -4678,27 +4826,148 @@ BB36_6:
 	ld.const.f64 	%fd34, [%rd12+48];
 	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
 	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
-	@%p5 bra 	BB36_8;
+	setp.eq.s32	%p7, %r15, 0;
+	@%p7 bra 	BB37_7;
 
 	mov.f64 	%fd35, 0d3FF0000000000000;
 	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
 
-BB36_8:
+BB37_7:
 	and.b32  	%r17, %r5, 2;
-	setp.eq.s32	%p6, %r17, 0;
-	@%p6 bra 	BB36_10;
+	setp.eq.s32	%p8, %r17, 0;
+	@%p8 bra 	BB37_9;
 
 	mov.f64 	%fd36, 0d0000000000000000;
 	mov.f64 	%fd37, 0dBFF0000000000000;
 	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
 
-BB36_10:
+BB37_9:
 	cvta.to.global.u64 	%rd13, %rd4;
 	shl.b64 	%rd14, %rd2, 3;
 	add.s64 	%rd15, %rd13, %rd14;
 	st.global.f64 	[%rd15], %fd40;
 
-BB36_11:
+BB37_10:
+	ret;
+}
+
+	// .globl	matrix_cosh
+.visible .entry matrix_cosh(	// element-wise kernel: stores cosh(in[i]) as f64 into out[i]
+	.param .u64 matrix_cosh_param_0,
+	.param .u64 matrix_cosh_param_1,
+	.param .u32 matrix_cosh_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<16>;
+	.reg .f64 	%fd<46>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_cosh_param_0];	// base address of the values read below
+	ld.param.u64 	%rd3, [matrix_cosh_param_1];	// base address of the values written below
+	ld.param.u32 	%r2, [matrix_cosh_param_2];	// exclusive upper bound for the element index
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;	// index = ntid.x * ctaid.x + tid.x
+	setp.ge.u32	%p1, %r1, %r2;
+	@%p1 bra 	BB38_5;	// index out of range: nothing to do
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;	// byte offset = index * 8
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];	// x = in[index]
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r6}, %fd1;
+	}
+	and.b32  	%r7, %r6, 2147483647;	// clear the sign bit of the high word
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r8, %temp}, %fd1;
+	}
+	mov.b64 	%fd2, {%r8, %r7};	// fd2 = |x|
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r9}, %fd2;
+	}
+	setp.lt.u32	%p2, %r9, 1082536911;	// high word of |x| below 0x408633CF -> regular path
+	@%p2 bra 	BB38_3;
+	bra.uni 	BB38_2;
+
+BB38_3:
+	mov.f64 	%fd8, 0d4338000000000000;	// 2^52 + 2^51: adding it leaves the rounded integer in the low word
+	mov.f64 	%fd9, 0d3FF71547652B82FE;	// ~ log2(e)
+	fma.rn.f64 	%fd10, %fd2, %fd9, %fd8;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r10, %temp}, %fd10;	// r10 = n = round(|x| * log2(e))
+	}
+	mov.f64 	%fd11, 0dC338000000000000;
+	add.rn.f64 	%fd12, %fd10, %fd11;	// n as a double
+	mov.f64 	%fd13, 0dBFE62E42FEFA39EF;	// ~ -ln(2), leading part
+	fma.rn.f64 	%fd14, %fd12, %fd13, %fd2;
+	mov.f64 	%fd15, 0dBC7ABC9E3B39803F;	// small correction term applied to the same reduction
+	fma.rn.f64 	%fd16, %fd12, %fd15, %fd14;	// reduced argument r = |x| - n*ln(2)
+	mov.f64 	%fd17, 0d3E928AF3FCA213EA;
+	mov.f64 	%fd18, 0d3E5ADE1569CE2BDF;
+	fma.rn.f64 	%fd19, %fd18, %fd16, %fd17;	// Horner evaluation of a polynomial in r starts here
+	mov.f64 	%fd20, 0d3EC71DEE62401315;
+	fma.rn.f64 	%fd21, %fd19, %fd16, %fd20;
+	mov.f64 	%fd22, 0d3EFA01997C89EB71;
+	fma.rn.f64 	%fd23, %fd21, %fd16, %fd22;
+	mov.f64 	%fd24, 0d3F2A01A014761F65;
+	fma.rn.f64 	%fd25, %fd23, %fd16, %fd24;
+	mov.f64 	%fd26, 0d3F56C16C1852B7AF;
+	fma.rn.f64 	%fd27, %fd25, %fd16, %fd26;
+	mov.f64 	%fd28, 0d3F81111111122322;
+	fma.rn.f64 	%fd29, %fd27, %fd16, %fd28;
+	mov.f64 	%fd30, 0d3FA55555555502A1;
+	fma.rn.f64 	%fd31, %fd29, %fd16, %fd30;
+	mov.f64 	%fd32, 0d3FC5555555555511;
+	fma.rn.f64 	%fd33, %fd31, %fd16, %fd32;
+	mov.f64 	%fd34, 0d3FE000000000000B;
+	fma.rn.f64 	%fd35, %fd33, %fd16, %fd34;
+	mov.f64 	%fd36, 0d3FF0000000000000;
+	fma.rn.f64 	%fd37, %fd35, %fd16, %fd36;
+	fma.rn.f64 	%fd38, %fd37, %fd16, %fd36;	// fd38 ~ exp(r)
+	shl.b32 	%r11, %r10, 20;	// move n into the exponent-field position of the high word
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r12, %temp}, %fd38;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r13}, %fd38;
+	}
+	add.s32 	%r14, %r11, %r13;
+	add.s32 	%r15, %r14, -2097152;	// -(2 << 20): lower the exponent by 2
+	mov.b64 	%fd7, {%r12, %r15};	// fd7 = exp(r) * 2^(n-2) ~ exp(|x|) / 4
+	// inline asm
+	rcp.approx.ftz.f64 %fd6,%fd7;
+	// inline asm
+	neg.f64 	%fd39, %fd7;
+	fma.rn.f64 	%fd40, %fd39, %fd6, %fd36;	// e = 1 - fd7 * fd6 (error of the approximate reciprocal)
+	fma.rn.f64 	%fd41, %fd40, %fd40, %fd40;	// e + e^2
+	fma.rn.f64 	%fd42, %fd41, %fd6, %fd6;	// refined reciprocal: fd6 * (1 + e + e^2) ~ 1 / fd7
+	mov.f64 	%fd43, 0d3FB0000000000000;	// 2^-4
+	fma.rn.f64 	%fd45, %fd42, %fd43, %fd7;	// fd7 + (1/fd7)/16 ~ (exp(|x|) + exp(-|x|)) / 4
+	bra.uni 	BB38_4;
+
+BB38_2:
+	setp.le.f64	%p3, %fd1, 0d7FF0000000000000;	// ordered compare: false only when x is NaN
+	selp.f64	%fd45, 0d7FF0000000000000, %fd1, %p3;	// large |x| -> +inf, NaN -> x itself
+
+BB38_4:
+	cvta.to.global.u64 	%rd7, %rd3;
+	shl.b64 	%rd8, %rd1, 3;	// byte offset = index * 8
+	add.s64 	%rd9, %rd7, %rd8;
+	add.f64 	%fd44, %fd45, %fd45;	// final doubling: (exp(|x|) + exp(-|x|)) / 2
+	st.global.f64 	[%rd9], %fd44;	// out[index] = result
+
+BB38_5:
 	ret;
 }
 
@@ -4709,16 +4978,16 @@ BB36_11:
 	.param .u32 matrix_tan_param_2
 )
 {
-	.local .align 4 .b8 	__local_depot37[4];
+	.local .align 4 .b8 	__local_depot39[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
-	.reg .pred 	%p<6>;
+	.reg .pred 	%p<7>;
 	.reg .b32 	%r<16>;
 	.reg .f64 	%fd<66>;
 	.reg .b64 	%rd<14>;
 
 
-	mov.u64 	%rd13, __local_depot37;
+	mov.u64 	%rd13, __local_depot39;
 	cvta.local.u64 	%SP, %rd13;
 	ld.param.u64 	%rd3, [matrix_tan_param_0];
 	ld.param.u64 	%rd4, [matrix_tan_param_1];
@@ -4730,7 +4999,7 @@ BB36_11:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r7, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB37_9;
+	@%p1 bra 	BB39_8;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4739,23 +5008,24 @@ BB36_11:
 	ld.global.f64 	%fd63, [%rd8];
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r9}, %fd63;
+	mov.b64 	{%r9, %temp}, %fd63;
 	}
-	and.b32  	%r10, %r9, 2147483647;
-	setp.ne.s32	%p2, %r10, 2146435072;
-	@%p2 bra 	BB37_4;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r11, %temp}, %fd63;
+	mov.b64 	{%temp, %r10}, %fd63;
 	}
-	setp.ne.s32	%p3, %r11, 0;
-	@%p3 bra 	BB37_4;
+	and.b32  	%r11, %r10, 2147483647;
+	setp.eq.s32	%p2, %r11, 2146435072;
+	setp.eq.s32	%p3, %r9, 0;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB39_3;
+	bra.uni 	BB39_2;
 
+BB39_2:
 	mov.f64 	%fd11, 0d0000000000000000;
 	mul.rn.f64 	%fd63, %fd63, %fd11;
 
-BB37_4:
+BB39_3:
 	mul.f64 	%fd12, %fd63, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r15, %fd12;
 	st.local.u32 	[%rd1], %r15;
@@ -4772,8 +5042,8 @@ BB37_4:
 	mov.b64 	{%temp, %r12}, %fd63;
 	}
 	and.b32  	%r13, %r12, 2145386496;
-	setp.lt.u32	%p4, %r13, 1105199104;
-	@%p4 bra 	BB37_6;
+	setp.lt.u32	%p5, %r13, 1105199104;
+	@%p5 bra 	BB39_5;
 
 	// Callseq Start 5
 	{
@@ -4796,7 +5066,7 @@ BB37_4:
 	}// Callseq End 5
 	ld.local.u32 	%r15, [%rd1];
 
-BB37_6:
+BB39_5:
 	mul.f64 	%fd20, %fd64, %fd64;
 	mov.f64 	%fd21, 0dBEF9757C5B27EBB1;
 	mov.f64 	%fd22, 0d3EE48DAC2799BCB9;
@@ -4830,11 +5100,11 @@ BB37_6:
 	mul.f64 	%fd7, %fd20, %fd49;
 	fma.rn.f64 	%fd65, %fd7, %fd64, %fd64;
 	and.b32  	%r14, %r15, 1;
-	setp.eq.b32	%p5, %r14, 1;
-	@!%p5 bra 	BB37_8;
-	bra.uni 	BB37_7;
+	setp.eq.b32	%p6, %r14, 1;
+	@!%p6 bra 	BB39_7;
+	bra.uni 	BB39_6;
 
-BB37_7:
+BB39_6:
 	sub.f64 	%fd52, %fd65, %fd64;
 	neg.f64 	%fd53, %fd52;
 	fma.rn.f64 	%fd54, %fd7, %fd64, %fd53;
@@ -4851,13 +5121,159 @@ BB37_7:
 	fma.rn.f64 	%fd62, %fd60, %fd54, %fd61;
 	fma.rn.f64 	%fd65, %fd62, %fd60, %fd60;
 
-BB37_8:
+BB39_7:
 	cvta.to.global.u64 	%rd10, %rd4;
 	shl.b64 	%rd11, %rd2, 3;
 	add.s64 	%rd12, %rd10, %rd11;
 	st.global.f64 	[%rd12], %fd65;
 
-BB37_9:
+BB39_8:
+	ret;
+}
+
+	// .globl	matrix_tanh
+.visible .entry matrix_tanh(	// element-wise kernel: stores tanh(in[i]) as f64 into out[i]
+	.param .u64 matrix_tanh_param_0,
+	.param .u64 matrix_tanh_param_1,
+	.param .u32 matrix_tanh_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<17>;
+	.reg .f64 	%fd<74>;
+	.reg .b64 	%rd<10>;
+
+
+	ld.param.u64 	%rd2, [matrix_tanh_param_0];	// base address of the values read below
+	ld.param.u64 	%rd3, [matrix_tanh_param_1];	// base address of the values written below
+	ld.param.u32 	%r4, [matrix_tanh_param_2];	// exclusive upper bound for the element index
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;	// index = ntid.x * ctaid.x + tid.x
+	setp.ge.u32	%p1, %r1, %r4;
+	@%p1 bra 	BB40_5;	// index out of range: nothing to do
+
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd5, %r1, 8;	// byte offset = index * 8
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];	// x = in[index]
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd1;	// r2 = high word of x (carries the sign bit)
+	}
+	and.b32  	%r3, %r2, 2147483647;	// high word with the sign bit cleared
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r8, %temp}, %fd1;
+	}
+	mov.b64 	%fd2, {%r8, %r3};	// fd2 = |x|
+	setp.ltu.f64	%p2, %fd2, 0d3FE1C7A398201CD6;	// "less than or unordered": small |x| and NaN take the polynomial path
+	@%p2 bra 	BB40_3;
+	bra.uni 	BB40_2;
+
+BB40_3:
+	mul.f64 	%fd51, %fd1, %fd1;	// x^2
+	mov.f64 	%fd52, 0dBF2B9093D89F0E23;
+	mov.f64 	%fd53, 0d3F0ABFFC9B5786C4;
+	fma.rn.f64 	%fd54, %fd53, %fd51, %fd52;	// Horner evaluation of a polynomial P in x^2 starts here
+	mov.f64 	%fd55, 0d3F42FA2744C30B61;
+	fma.rn.f64 	%fd56, %fd54, %fd51, %fd55;
+	mov.f64 	%fd57, 0dBF57CF3B9C1E491D;
+	fma.rn.f64 	%fd58, %fd56, %fd51, %fd57;
+	mov.f64 	%fd59, 0d3F6D6C61D450119A;
+	fma.rn.f64 	%fd60, %fd58, %fd51, %fd59;
+	mov.f64 	%fd61, 0dBF8226DDD44294F5;
+	fma.rn.f64 	%fd62, %fd60, %fd51, %fd61;
+	mov.f64 	%fd63, 0d3F9664F45C2B04A6;
+	fma.rn.f64 	%fd64, %fd62, %fd51, %fd63;
+	mov.f64 	%fd65, 0dBFABA1BA1AD70754;
+	fma.rn.f64 	%fd66, %fd64, %fd51, %fd65;
+	mov.f64 	%fd67, 0d3FC111111110295E;
+	fma.rn.f64 	%fd68, %fd66, %fd51, %fd67;
+	mov.f64 	%fd69, 0dBFD555555555549F;
+	fma.rn.f64 	%fd70, %fd68, %fd51, %fd69;
+	mul.f64 	%fd71, %fd51, %fd70;
+	fma.rn.f64 	%fd73, %fd71, %fd1, %fd1;	// result = x + x * x^2 * P(x^2)
+	bra.uni 	BB40_4;
+
+BB40_2:
+	add.f64 	%fd8, %fd2, %fd2;	// 2 * |x|
+	mov.f64 	%fd9, 0d4338000000000000;	// 2^52 + 2^51: adding it leaves the rounded integer in the low word
+	mov.f64 	%fd10, 0d3FF71547652B82FE;	// ~ log2(e)
+	fma.rn.f64 	%fd11, %fd8, %fd10, %fd9;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r9, %temp}, %fd11;	// r9 = n = round(2|x| * log2(e))
+	}
+	mov.f64 	%fd12, 0dC338000000000000;
+	add.rn.f64 	%fd13, %fd11, %fd12;	// n as a double
+	mov.f64 	%fd14, 0dBFE62E42FEFA39EF;	// ~ -ln(2), leading part
+	fma.rn.f64 	%fd15, %fd13, %fd14, %fd8;
+	mov.f64 	%fd16, 0dBC7ABC9E3B39803F;	// small correction term applied to the same reduction
+	fma.rn.f64 	%fd17, %fd13, %fd16, %fd15;	// reduced argument r = 2|x| - n*ln(2)
+	mov.f64 	%fd18, 0d3E5AF86D8EBD13CD;
+	mov.f64 	%fd19, 0d3E21F4076ACD15B6;
+	fma.rn.f64 	%fd20, %fd19, %fd17, %fd18;	// Horner evaluation of a polynomial in r starts here
+	mov.f64 	%fd21, 0d3E927E5092BA033D;
+	fma.rn.f64 	%fd22, %fd20, %fd17, %fd21;
+	mov.f64 	%fd23, 0d3EC71DDE6C5F9DA1;
+	fma.rn.f64 	%fd24, %fd22, %fd17, %fd23;
+	mov.f64 	%fd25, 0d3EFA01A018D034E6;
+	fma.rn.f64 	%fd26, %fd24, %fd17, %fd25;
+	mov.f64 	%fd27, 0d3F2A01A01B3B6940;
+	fma.rn.f64 	%fd28, %fd26, %fd17, %fd27;
+	mov.f64 	%fd29, 0d3F56C16C16C1B5DD;
+	fma.rn.f64 	%fd30, %fd28, %fd17, %fd29;
+	mov.f64 	%fd31, 0d3F8111111110F74D;
+	fma.rn.f64 	%fd32, %fd30, %fd17, %fd31;
+	mov.f64 	%fd33, 0d3FA555555555554D;
+	fma.rn.f64 	%fd34, %fd32, %fd17, %fd33;
+	mov.f64 	%fd35, 0d3FC5555555555557;
+	fma.rn.f64 	%fd36, %fd34, %fd17, %fd35;
+	mov.f64 	%fd37, 0d3FE0000000000000;
+	fma.rn.f64 	%fd38, %fd36, %fd17, %fd37;
+	mul.f64 	%fd39, %fd17, %fd38;
+	fma.rn.f64 	%fd40, %fd39, %fd17, %fd17;	// fd40 = r + r^2 * (0.5 + ...) ~ exp(r) - 1
+	shl.b32 	%r10, %r9, 20;	// move n into the exponent-field position of a high word
+	add.s32 	%r11, %r10, 1072693248;	// + 0x3FF00000 (high word of 1.0)
+	mov.u32 	%r12, 0;
+	mov.b64 	%fd41, {%r12, %r11};	// fd41 = 2^n
+	fma.rn.f64 	%fd42, %fd40, %fd41, %fd41;	// 2^n * (1 + fd40) ~ exp(2|x|)
+	add.f64 	%fd7, %fd42, 0d3FF0000000000000;	// exp(2|x|) + 1
+	// inline asm
+	rcp.approx.ftz.f64 %fd6,%fd7;
+	// inline asm
+	neg.f64 	%fd43, %fd7;
+	mov.f64 	%fd44, 0d3FF0000000000000;
+	fma.rn.f64 	%fd45, %fd43, %fd6, %fd44;	// e = 1 - fd7 * fd6 (error of the approximate reciprocal)
+	fma.rn.f64 	%fd46, %fd45, %fd45, %fd45;	// e + e^2
+	fma.rn.f64 	%fd47, %fd46, %fd6, %fd6;	// refined reciprocal: fd6 * (1 + e + e^2) ~ 1 / fd7
+	neg.f64 	%fd48, %fd47;
+	mov.f64 	%fd49, 0d4000000000000000;
+	fma.rn.f64 	%fd50, %fd49, %fd48, %fd44;	// 1 - 2 / (exp(2|x|) + 1)
+	setp.gt.u32	%p3, %r3, 1077936127;	// high word of |x| above 0x403FFFFF, i.e. |x| >= 32
+	selp.f64	%fd73, 0d3FF0000000000000, %fd50, %p3;	// then the magnitude is exactly 1.0
+
+BB40_4:
+	cvta.to.global.u64 	%rd7, %rd3;
+	and.b32  	%r13, %r2, -2147483648;	// isolate the sign bit of x
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r14}, %fd73;
+	}
+	or.b32  	%r15, %r14, %r13;	// OR the sign bit of x into the result's high word
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r16, %temp}, %fd73;
+	}
+	mov.b64 	%fd72, {%r16, %r15};
+	shl.b64 	%rd8, %rd1, 3;	// byte offset = index * 8
+	add.s64 	%rd9, %rd7, %rd8;
+	st.global.f64 	[%rd9], %fd72;	// out[index] = result
+
+BB40_5:
 	ret;
 }
 
@@ -4883,7 +5299,7 @@ BB37_9:
 	mov.u32 	%r6, %tid.x;
 	mad.lo.s32 	%r1, %r5, %r4, %r6;
 	setp.ge.u32	%p1, %r1, %r3;
-	@%p1 bra 	BB38_5;
+	@%p1 bra 	BB41_5;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -4897,10 +5313,10 @@ BB37_9:
 	mov.b32 	 %f1, %r2;
 	abs.f32 	%f2, %f1;
 	setp.lt.f32	%p2, %f2, 0f3FE26666;
-	@%p2 bra 	BB38_3;
-	bra.uni 	BB38_2;
+	@%p2 bra 	BB41_3;
+	bra.uni 	BB41_2;
 
-BB38_3:
+BB41_3:
 	mul.f64 	%fd55, %fd1, %fd1;
 	mov.f64 	%fd56, 0dBFB3823B180754AF;
 	mov.f64 	%fd57, 0d3FB0066BDC1895E9;
@@ -4929,9 +5345,9 @@ BB38_3:
 	fma.rn.f64 	%fd80, %fd78, %fd55, %fd79;
 	mul.f64 	%fd81, %fd55, %fd80;
 	fma.rn.f64 	%fd82, %fd81, %fd1, %fd1;
-	bra.uni 	BB38_4;
+	bra.uni 	BB41_4;
 
-BB38_2:
+BB41_2:
 	abs.f64 	%fd7, %fd1;
 	mov.f64 	%fd8, 0d3FE0000000000000;
 	mov.f64 	%fd9, 0dBFE0000000000000;
@@ -4965,8 +5381,8 @@ BB38_2:
 	}
 	setp.lt.s32	%p3, %r10, 0;
 	selp.f64	%fd21, 0dFFF8000000000000, %fd20, %p3;
-	setp.equ.f64	%p4, %fd6, 0d0000000000000000;
-	selp.f64	%fd22, %fd6, %fd21, %p4;
+	setp.ne.f64	%p4, %fd6, 0d0000000000000000;
+	selp.f64	%fd22, %fd21, %fd6, %p4;
 	mov.f64 	%fd23, 0dBFB3823B180754AF;
 	mov.f64 	%fd24, 0d3FB0066BDC1895E9;
 	fma.rn.f64 	%fd25, %fd24, %fd6, %fd23;
@@ -5011,13 +5427,13 @@ BB38_2:
 	or.b32  	%r14, %r12, %r13;
 	mov.b64 	%fd82, {%r11, %r14};
 
-BB38_4:
+BB41_4:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd82;
 
-BB38_5:
+BB41_5:
 	ret;
 }
 
@@ -5042,7 +5458,7 @@ BB38_5:
 	mov.u32 	%r7, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r5, %r7;
 	setp.ge.u32	%p1, %r1, %r4;
-	@%p1 bra 	BB39_14;
+	@%p1 bra 	BB42_14;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -5059,10 +5475,10 @@ BB38_5:
 	mov.b64 	{%temp, %r8}, %fd1;
 	}
 	setp.lt.s32	%p2, %r8, 1071801958;
-	@%p2 bra 	BB39_9;
-	bra.uni 	BB39_2;
+	@%p2 bra 	BB42_9;
+	bra.uni 	BB42_2;
 
-BB39_9:
+BB42_9:
 	mul.f64 	%fd62, %fd1, %fd1;
 	mov.f64 	%fd63, 0dBFB3823B180754AF;
 	mov.f64 	%fd64, 0d3FB0066BDC1895E9;
@@ -5092,14 +5508,14 @@ BB39_9:
 	mul.f64 	%fd88, %fd62, %fd87;
 	fma.rn.f64 	%fd10, %fd88, %fd1, %fd1;
 	setp.lt.s32	%p6, %r2, 0;
-	@%p6 bra 	BB39_11;
+	@%p6 bra 	BB42_11;
 
 	mov.f64 	%fd89, 0dBC91A62633145C07;
 	add.rn.f64 	%fd90, %fd10, %fd89;
 	neg.f64 	%fd93, %fd90;
-	bra.uni 	BB39_12;
+	bra.uni 	BB42_12;
 
-BB39_2:
+BB42_2:
 	mov.f64 	%fd19, 0d3FF0000000000000;
 	sub.f64 	%fd2, %fd19, %fd1;
 	{
@@ -5135,7 +5551,7 @@ BB39_2:
 	fma.rn.f64 	%fd28, %fd24, %fd25, %fd18;
 	fma.rn.f64 	%fd3, %fd28, %fd27, %fd24;
 	setp.lt.s32	%p3, %r3, 1;
-	@%p3 bra 	BB39_4;
+	@%p3 bra 	BB42_4;
 
 	{
 	.reg .b32 %temp; 
@@ -5174,31 +5590,31 @@ BB39_2:
 	fma.rn.f64 	%fd54, %fd52, %fd2, %fd53;
 	mul.f64 	%fd55, %fd2, %fd54;
 	fma.rn.f64 	%fd94, %fd55, %fd29, %fd29;
-	bra.uni 	BB39_5;
+	bra.uni 	BB42_5;
 
-BB39_11:
+BB42_11:
 	mov.f64 	%fd91, 0d3C91A62633145C07;
 	add.rn.f64 	%fd93, %fd10, %fd91;
 
-BB39_12:
+BB42_12:
 	mov.f64 	%fd92, 0d3FF921FB54442D18;
 	add.rn.f64 	%fd94, %fd92, %fd93;
-	bra.uni 	BB39_13;
+	bra.uni 	BB42_13;
 
-BB39_4:
+BB42_4:
 	mov.f64 	%fd56, 0d0000000000000000;
 	mul.rn.f64 	%fd94, %fd1, %fd56;
 
-BB39_5:
+BB42_5:
 	setp.gt.s32	%p4, %r3, -1;
-	@%p4 bra 	BB39_7;
+	@%p4 bra 	BB42_7;
 
 	mov.f64 	%fd57, 0d7FF0000000000000;
 	mul.rn.f64 	%fd94, %fd94, %fd57;
 
-BB39_7:
+BB42_7:
 	setp.gt.s32	%p5, %r2, -1;
-	@%p5 bra 	BB39_13;
+	@%p5 bra 	BB42_13;
 
 	mov.f64 	%fd58, 0dBCA1A62633145C07;
 	add.rn.f64 	%fd59, %fd94, %fd58;
@@ -5206,13 +5622,13 @@ BB39_7:
 	mov.f64 	%fd61, 0d400921FB54442D18;
 	add.rn.f64 	%fd94, %fd61, %fd60;
 
-BB39_13:
+BB42_13:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd94;
 
-BB39_14:
+BB42_14:
 	ret;
 }
 
@@ -5237,7 +5653,7 @@ BB39_14:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB40_4;
+	@%p1 bra 	BB43_4;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -5247,7 +5663,7 @@ BB39_14:
 	abs.f64 	%fd2, %fd1;
 	setp.leu.f64	%p2, %fd2, 0d3FF0000000000000;
 	mov.f64 	%fd56, %fd2;
-	@%p2 bra 	BB40_3;
+	@%p2 bra 	BB43_3;
 
 	// inline asm
 	rcp.approx.ftz.f64 %fd5,%fd2;
@@ -5261,7 +5677,7 @@ BB39_14:
 	selp.f64	%fd3, 0d0000000000000000, %fd11, %p3;
 	mov.f64 	%fd56, %fd3;
 
-BB40_3:
+BB43_3:
 	mov.f64 	%fd4, %fd56;
 	cvta.to.global.u64 	%rd7, %rd3;
 	mul.f64 	%fd12, %fd4, %fd4;
@@ -5327,7 +5743,7 @@ BB40_3:
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd55;
 
-BB40_4:
+BB43_4:
 	ret;
 }
 
@@ -5352,7 +5768,7 @@ BB40_4:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB41_4;
+	@%p1 bra 	BB44_4;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -5361,15 +5777,15 @@ BB40_4:
 	setp.eq.f64	%p2, %fd1, 0d0000000000000000;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p2 bra 	BB41_3;
-	bra.uni 	BB41_2;
+	@%p2 bra 	BB44_3;
+	bra.uni 	BB44_2;
 
-BB41_3:
+BB44_3:
 	mov.u64 	%rd8, 0;
 	st.global.u64 	[%rd1], %rd8;
-	bra.uni 	BB41_4;
+	bra.uni 	BB44_4;
 
-BB41_2:
+BB44_2:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r6}, %fd1;
@@ -5389,7 +5805,7 @@ BB41_2:
 	mov.b64 	%fd3, {%r11, %r10};
 	st.global.f64 	[%rd1], %fd3;
 
-BB41_4:
+BB44_4:
 	ret;
 }
 
@@ -5398,7 +5814,7 @@ BB41_4:
 	.param .b64 __internal_trig_reduction_slowpathd_param_1
 )
 {
-	.local .align 8 .b8 	__local_depot42[40];
+	.local .align 8 .b8 	__local_depot45[40];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
 	.reg .pred 	%p<9>;
@@ -5407,7 +5823,7 @@ BB41_4:
 	.reg .b64 	%rd<101>;
 
 
-	mov.u64 	%rd100, __local_depot42;
+	mov.u64 	%rd100, __local_depot45;
 	cvta.local.u64 	%SP, %rd100;
 	ld.param.f64 	%fd4, [__internal_trig_reduction_slowpathd_param_0];
 	ld.param.u64 	%rd37, [__internal_trig_reduction_slowpathd_param_1];
@@ -5421,7 +5837,7 @@ BB41_4:
 	shr.u32 	%r3, %r1, 20;
 	bfe.u32 	%r4, %r1, 20, 11;
 	setp.eq.s32	%p1, %r4, 2047;
-	@%p1 bra 	BB42_13;
+	@%p1 bra 	BB45_13;
 
 	add.s32 	%r16, %r4, -1024;
 	shr.u32 	%r17, %r16, 6;
@@ -5434,7 +5850,7 @@ BB41_4:
 	setp.gt.s32	%p2, %r5, %r6;
 	mov.u64 	%rd94, 0;
 	mov.u64 	%rd93, %rd1;
-	@%p2 bra 	BB42_4;
+	@%p2 bra 	BB45_4;
 
 	mov.b64 	 %rd41, %fd4;
 	shl.b64 	%rd42, %rd41, 11;
@@ -5453,7 +5869,7 @@ BB41_4:
 	mov.u64 	%rd91, %rd1;
 	mov.u32 	%r39, %r7;
 
-BB42_3:
+BB45_3:
 	.pragma "nounroll";
 	mov.u32 	%r8, %r39;
 	mov.u64 	%rd7, %rd91;
@@ -5488,15 +5904,15 @@ BB42_3:
 	setp.lt.s32	%p3, %r9, %r6;
 	mov.u64 	%rd91, %rd13;
 	mov.u32 	%r39, %r9;
-	@%p3 bra 	BB42_3;
+	@%p3 bra 	BB45_3;
 
-BB42_4:
+BB45_4:
 	st.local.u64 	[%rd93], %rd94;
 	ld.local.u64 	%rd95, [%rd1+16];
 	ld.local.u64 	%rd96, [%rd1+24];
 	and.b32  	%r10, %r3, 63;
 	setp.eq.s32	%p4, %r10, 0;
-	@%p4 bra 	BB42_6;
+	@%p4 bra 	BB45_6;
 
 	mov.u32 	%r27, 64;
 	sub.s32 	%r28, %r27, %r10;
@@ -5508,7 +5924,7 @@ BB42_4:
 	shr.u64 	%rd56, %rd55, %r28;
 	or.b64  	%rd95, %rd56, %rd54;
 
-BB42_6:
+BB45_6:
 	cvta.to.local.u64 	%rd57, %rd37;
 	shr.u64 	%rd58, %rd96, 62;
 	cvt.u32.u64	%r29, %rd58;
@@ -5521,11 +5937,11 @@ BB42_6:
 	and.b32  	%r31, %r30, 1;
 	add.s32 	%r32, %r31, %r29;
 	neg.s32 	%r33, %r32;
-	setp.eq.s32	%p5, %r40, 0;
-	selp.b32	%r34, %r32, %r33, %p5;
+	setp.ne.s32	%p5, %r40, 0;
+	selp.b32	%r34, %r33, %r32, %p5;
 	st.local.u32 	[%rd57], %r34;
 	setp.eq.s32	%p6, %r31, 0;
-	@%p6 bra 	BB42_8;
+	@%p6 bra 	BB45_8;
 
 	mov.u64 	%rd65, 0;
 	// inline asm
@@ -5545,10 +5961,10 @@ BB42_6:
 	// inline asm
 	xor.b32  	%r40, %r40, -2147483648;
 
-BB42_8:
+BB45_8:
 	clz.b64 	%r41, %rd98;
 	setp.eq.s32	%p7, %r41, 0;
-	@%p7 bra 	BB42_10;
+	@%p7 bra 	BB45_10;
 
 	shl.b64 	%rd68, %rd98, %r41;
 	mov.u32 	%r35, 64;
@@ -5556,7 +5972,7 @@ BB42_8:
 	shr.u64 	%rd69, %rd97, %r36;
 	or.b64  	%rd98, %rd69, %rd68;
 
-BB42_10:
+BB45_10:
 	mov.u64 	%rd73, -3958705157555305931;
 	// inline asm
 	{
@@ -5577,7 +5993,7 @@ BB42_10:
 	}
 	// inline asm
 	setp.lt.s64	%p8, %rd99, 1;
-	@%p8 bra 	BB42_12;
+	@%p8 bra 	BB45_12;
 
 	// inline asm
 	{
@@ -5596,7 +6012,7 @@ BB42_10:
 	// inline asm
 	add.s32 	%r41, %r41, 1;
 
-BB42_12:
+BB45_12:
 	cvt.u64.u32	%rd80, %r40;
 	shl.b64 	%rd81, %rd80, 32;
 	mov.u32 	%r37, 1022;
@@ -5611,7 +6027,7 @@ BB42_12:
 	or.b64  	%rd89, %rd88, %rd81;
 	mov.b64 	 %fd4, %rd89;
 
-BB42_13:
+BB45_13:
 	st.param.f64	[func_retval0+0], %fd4;
 	ret;
 }
@@ -5621,7 +6037,7 @@ BB42_13:
 	.param .b64 __internal_accurate_pow_param_1
 )
 {
-	.reg .pred 	%p<9>;
+	.reg .pred 	%p<10>;
 	.reg .f32 	%f<3>;
 	.reg .b32 	%r<52>;
 	.reg .f64 	%fd<134>;
@@ -5639,7 +6055,7 @@ BB42_13:
 	}
 	shr.u32 	%r50, %r49, 20;
 	setp.ne.s32	%p1, %r50, 0;
-	@%p1 bra 	BB43_2;
+	@%p1 bra 	BB46_2;
 
 	mul.f64 	%fd14, %fd12, 0d4350000000000000;
 	{
@@ -5653,13 +6069,13 @@ BB42_13:
 	shr.u32 	%r16, %r49, 20;
 	add.s32 	%r50, %r16, -54;
 
-BB43_2:
+BB46_2:
 	add.s32 	%r51, %r50, -1023;
 	and.b32  	%r17, %r49, -2146435073;
 	or.b32  	%r18, %r17, 1072693248;
 	mov.b64 	%fd132, {%r48, %r18};
 	setp.lt.u32	%p2, %r18, 1073127583;
-	@%p2 bra 	BB43_4;
+	@%p2 bra 	BB46_4;
 
 	{
 	.reg .b32 %temp; 
@@ -5673,7 +6089,7 @@ BB43_2:
 	mov.b64 	%fd132, {%r19, %r21};
 	add.s32 	%r51, %r50, -1022;
 
-BB43_4:
+BB46_4:
 	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
 	// inline asm
 	rcp.approx.ftz.f64 %fd15,%fd16;
@@ -5838,13 +6254,13 @@ BB43_4:
 	mov.b32 	 %f2, %r35;
 	abs.f32 	%f1, %f2;
 	setp.lt.f32	%p4, %f1, 0f4086232B;
-	@%p4 bra 	BB43_7;
+	@%p4 bra 	BB46_7;
 
 	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
 	add.f64 	%fd129, %fd4, 0d7FF0000000000000;
 	selp.f64	%fd133, 0d0000000000000000, %fd129, %p5;
 	setp.geu.f32	%p6, %f1, 0f40874800;
-	@%p6 bra 	BB43_7;
+	@%p6 bra 	BB46_7;
 
 	shr.u32 	%r36, %r13, 31;
 	add.s32 	%r37, %r13, %r36;
@@ -5859,26 +6275,26 @@ BB43_4:
 	mov.b64 	%fd131, {%r44, %r43};
 	mul.f64 	%fd133, %fd130, %fd131;
 
-BB43_7:
+BB46_7:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r45}, %fd133;
 	}
 	and.b32  	%r46, %r45, 2147483647;
 	setp.ne.s32	%p7, %r46, 2146435072;
-	@%p7 bra 	BB43_9;
-
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r47, %temp}, %fd133;
 	}
-	setp.eq.s32	%p8, %r47, 0;
-	@%p8 bra 	BB43_10;
+	setp.ne.s32	%p8, %r47, 0;
+	or.pred  	%p9, %p8, %p7;
+	@!%p9 bra 	BB46_9;
+	bra.uni 	BB46_8;
 
-BB43_9:
+BB46_8:
 	fma.rn.f64 	%fd133, %fd133, %fd5, %fd133;
 
-BB43_10:
+BB46_9:
 	st.param.f64	[func_retval0+0], %fd133;
 	ret;
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 5ee0b56..f8ee068 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -1051,7 +1051,7 @@ public abstract class Hop implements ParseInfo
 	}
 
 	public enum OpOp1 {
-		NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SIGN, SQRT, LOG, EXP, 
+		NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SINH, COSH, TANH, SIGN, SQRT, LOG, EXP, 
 		CAST_AS_SCALAR, CAST_AS_MATRIX, CAST_AS_FRAME, CAST_AS_DOUBLE, CAST_AS_INT, CAST_AS_BOOLEAN,
 		PRINT, EIGEN, NROW, NCOL, LENGTH, ROUND, IQM, STOP, CEIL, FLOOR, MEDIAN, INVERSE, CHOLESKY,
 		SVD,
@@ -1276,6 +1276,9 @@ public abstract class Hop implements ParseInfo
 		HopsOpOp1LopsU.put(OpOp1.ASIN, org.apache.sysml.lops.Unary.OperationTypes.ASIN);
 		HopsOpOp1LopsU.put(OpOp1.ACOS, org.apache.sysml.lops.Unary.OperationTypes.ACOS);
 		HopsOpOp1LopsU.put(OpOp1.ATAN, org.apache.sysml.lops.Unary.OperationTypes.ATAN);
+		HopsOpOp1LopsU.put(OpOp1.SINH, org.apache.sysml.lops.Unary.OperationTypes.SINH);
+		HopsOpOp1LopsU.put(OpOp1.COSH, org.apache.sysml.lops.Unary.OperationTypes.COSH);
+		HopsOpOp1LopsU.put(OpOp1.TANH, org.apache.sysml.lops.Unary.OperationTypes.TANH);
 		HopsOpOp1LopsU.put(OpOp1.SIGN, org.apache.sysml.lops.Unary.OperationTypes.SIGN);
 		HopsOpOp1LopsU.put(OpOp1.SQRT, org.apache.sysml.lops.Unary.OperationTypes.SQRT);
 		HopsOpOp1LopsU.put(OpOp1.EXP, org.apache.sysml.lops.Unary.OperationTypes.EXP);
@@ -1310,6 +1313,9 @@ public abstract class Hop implements ParseInfo
 		HopsOpOp1LopsUS.put(OpOp1.ASIN, org.apache.sysml.lops.UnaryCP.OperationTypes.ASIN);
 		HopsOpOp1LopsUS.put(OpOp1.ACOS, org.apache.sysml.lops.UnaryCP.OperationTypes.ACOS);
 		HopsOpOp1LopsUS.put(OpOp1.ATAN, org.apache.sysml.lops.UnaryCP.OperationTypes.ATAN);
+		HopsOpOp1LopsUS.put(OpOp1.SINH, org.apache.sysml.lops.UnaryCP.OperationTypes.SINH);
+		HopsOpOp1LopsUS.put(OpOp1.COSH, org.apache.sysml.lops.UnaryCP.OperationTypes.COSH);
+		HopsOpOp1LopsUS.put(OpOp1.TANH, org.apache.sysml.lops.UnaryCP.OperationTypes.TANH);
 		HopsOpOp1LopsUS.put(OpOp1.SQRT, org.apache.sysml.lops.UnaryCP.OperationTypes.SQRT);
 		HopsOpOp1LopsUS.put(OpOp1.EXP, org.apache.sysml.lops.UnaryCP.OperationTypes.EXP);
 		HopsOpOp1LopsUS.put(OpOp1.LOG, org.apache.sysml.lops.UnaryCP.OperationTypes.LOG);
@@ -1365,6 +1371,9 @@ public abstract class Hop implements ParseInfo
 		HopsOpOp12String.put(OpOp1.ASIN, "asin");
 		HopsOpOp12String.put(OpOp1.ACOS, "acos");
 		HopsOpOp12String.put(OpOp1.ATAN, "atan");
+		HopsOpOp12String.put(OpOp1.SINH, "sinh");
+		HopsOpOp12String.put(OpOp1.COSH, "cosh");
+		HopsOpOp12String.put(OpOp1.TANH, "tanh");
 		HopsOpOp12String.put(OpOp1.STOP, "stop");
 		HopsOpOp12String.put(OpOp1.INVERSE, "inv");
 		HopsOpOp12String.put(OpOp1.SPROP, "sprop");

http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index ca1fc84..46cabcc 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -109,7 +109,9 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 			switch(_op) {
 				case SELP:case EXP:case SQRT:case LOG:case ABS:
 				case ROUND:case FLOOR:case CEIL:
-				case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
+				case SIN:case COS: case TAN:
+				case ASIN:case ACOS:case ATAN:
+				case SINH:case COSH: case TANH:
 				case SIGN:
 					return true;
 				default:
@@ -601,6 +603,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 		if( mc.dimsKnown() ) {
 			if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN 
 				|| _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN  
+				|| _op==OpOp1.SINH || _op==OpOp1.TANH //COSH omitted: cosh(0)=1 turns zeros into non-zeros
 				|| _op==OpOp1.SQRT || _op==OpOp1.ROUND  
 				|| _op==OpOp1.SPROP || _op==OpOp1.SELP ) //sparsity preserving
 			{
@@ -723,8 +726,10 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 			Hop input = getInput().get(0);
 			setDim1( input.getDim1() );
 			setDim2( input.getDim2() );
-			if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN  
-				|| _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN
+			// COS, COSH and ACOS are deliberately not listed: cosh(0)=cos(0)=1, acos(0)=1.5707963267948966, so zeros become non-zeros
+			if( _op==OpOp1.ABS || _op==OpOp1.SIN || _op==OpOp1.TAN  
+				|| _op==OpOp1.SINH || _op==OpOp1.TANH
+				|| _op==OpOp1.ASIN || _op==OpOp1.ATAN
 				|| _op==OpOp1.SQRT || _op==OpOp1.ROUND || _op==OpOp1.SPROP ) //sparsity preserving
 			{
 				setNnz( input.getNnz() );