Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/26 02:29:04 UTC

[3/4] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
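Note on the change: this diff regenerates SystemML.ptx after splitting each GPU kernel into a double-precision entry point (suffix "_d") and a single-precision one (suffix "_f"), and adds conversion kernels (double2float_f, float2double_f) so the runtime can move data between precisions. The CUDA source that produces these entry points is not shown in this part of the commit; the following is a minimal sketch reconstructed from the PTX below (parameter names and pointer types are inferred from the double2float_f/float2double_f signatures and instruction sequences, not copied from SystemML.cu):

    // Hypothetical CUDA source mirroring the generated PTX entry points.
    // One thread handles one element; threads past N do nothing.
    extern "C" __global__ void double2float_f(double *in, float *out, int N) {
        int tid = blockDim.x * blockIdx.x + threadIdx.x;  // mad.lo.s32 in the PTX
        if (tid < N)
            out[tid] = (float) in[tid];   // cvt.rn.f32.f64: round to nearest
    }

    extern "C" __global__ void float2double_f(float *in, double *out, int N) {
        int tid = blockDim.x * blockIdx.x + threadIdx.x;
        if (tid < N)
            out[tid] = (double) in[tid];  // cvt.f64.f32: exact widening
    }

Each renamed kernel pair (e.g. relu_d/relu_f, bias_add_d/bias_add_f) follows the same pattern: identical control flow, with 8-byte f64 loads/stores in the "_d" variant replaced by 4-byte f32 accesses in the "_f" variant.
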
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 73b057e..d382fc5 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-	// .globl	slice_sparse_dense_row
+	// .globl	double2float_f
 .func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
 (
 	.param .b64 __internal_trig_reduction_slowpathd_param_0,
@@ -23,20 +23,97 @@
 	.param .b64 __internal_accurate_pow_param_1
 )
 ;
-.extern .shared .align 8 .b8 sdata[];
+.extern .shared .align 1 .b8 my_sdata[];
+.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
-.visible .entry slice_sparse_dense_row(
-	.param .u64 slice_sparse_dense_row_param_0,
-	.param .u64 slice_sparse_dense_row_param_1,
-	.param .u64 slice_sparse_dense_row_param_2,
-	.param .u64 slice_sparse_dense_row_param_3,
-	.param .u32 slice_sparse_dense_row_param_4,
-	.param .u32 slice_sparse_dense_row_param_5,
-	.param .u32 slice_sparse_dense_row_param_6,
-	.param .u32 slice_sparse_dense_row_param_7,
-	.param .u32 slice_sparse_dense_row_param_8
+.visible .entry double2float_f(
+	.param .u64 double2float_f_param_0,
+	.param .u64 double2float_f_param_1,
+	.param .u32 double2float_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [double2float_f_param_0];
+	ld.param.u64 	%rd2, [double2float_f_param_1];
+	ld.param.u32 	%r2, [double2float_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rn.f32.f64	%f1, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB0_2:
+	ret;
+}
+
+	// .globl	float2double_f
+.visible .entry float2double_f(
+	.param .u64 float2double_f_param_0,
+	.param .u64 float2double_f_param_1,
+	.param .u32 float2double_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [float2double_f_param_0];
+	ld.param.u64 	%rd2, [float2double_f_param_1];
+	ld.param.u32 	%r2, [float2double_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB1_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd1;
+
+BB1_2:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_d
+.visible .entry slice_sparse_dense_row_d(
+	.param .u64 slice_sparse_dense_row_d_param_0,
+	.param .u64 slice_sparse_dense_row_d_param_1,
+	.param .u64 slice_sparse_dense_row_d_param_2,
+	.param .u64 slice_sparse_dense_row_d_param_3,
+	.param .u32 slice_sparse_dense_row_d_param_4,
+	.param .u32 slice_sparse_dense_row_d_param_5,
+	.param .u32 slice_sparse_dense_row_d_param_6,
+	.param .u32 slice_sparse_dense_row_d_param_7,
+	.param .u32 slice_sparse_dense_row_d_param_8
 )
 {
 	.reg .pred 	%p<7>;
@@ -45,22 +122,22 @@
 	.reg .b64 	%rd<23>;
 
 
-	ld.param.u64 	%rd9, [slice_sparse_dense_row_param_0];
-	ld.param.u64 	%rd10, [slice_sparse_dense_row_param_1];
-	ld.param.u64 	%rd11, [slice_sparse_dense_row_param_2];
-	ld.param.u64 	%rd12, [slice_sparse_dense_row_param_3];
-	ld.param.u32 	%r15, [slice_sparse_dense_row_param_4];
-	ld.param.u32 	%r16, [slice_sparse_dense_row_param_5];
-	ld.param.u32 	%r12, [slice_sparse_dense_row_param_6];
-	ld.param.u32 	%r13, [slice_sparse_dense_row_param_7];
-	ld.param.u32 	%r14, [slice_sparse_dense_row_param_8];
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_d_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_d_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_d_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_d_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_d_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_d_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_d_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_d_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_d_param_8];
 	mov.u32 	%r17, %ntid.x;
 	mov.u32 	%r18, %ctaid.x;
 	mov.u32 	%r19, %tid.x;
 	mad.lo.s32 	%r1, %r17, %r18, %r19;
 	add.s32 	%r2, %r1, %r15;
 	setp.gt.s32	%p1, %r2, %r16;
-	@%p1 bra 	BB0_6;
+	@%p1 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd13, %rd10;
 	mul.wide.s32 	%rd14, %r2, 4;
@@ -68,7 +145,7 @@
 	ld.global.u32 	%r23, [%rd1];
 	ld.global.u32 	%r22, [%rd1+4];
 	setp.ge.s32	%p2, %r23, %r22;
-	@%p2 bra 	BB0_6;
+	@%p2 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
 	cvta.to.global.u64 	%rd15, %rd9;
@@ -80,12 +157,12 @@
 	mul.wide.s32 	%rd18, %r23, 4;
 	add.s64 	%rd21, %rd16, %rd18;
 
-BB0_3:
+BB2_3:
 	ld.global.u32 	%r8, [%rd21];
 	setp.lt.s32	%p3, %r8, %r12;
 	setp.gt.s32	%p4, %r8, %r13;
 	or.pred  	%p5, %p3, %p4;
-	@%p5 bra 	BB0_5;
+	@%p5 bra 	BB2_5;
 
 	ld.global.f64 	%fd1, [%rd22];
 	add.s32 	%r21, %r5, %r8;
@@ -94,28 +171,106 @@ BB0_3:
 	st.global.f64 	[%rd20], %fd1;
 	ld.global.u32 	%r22, [%rd1+4];
 
-BB0_5:
+BB2_5:
 	add.s64 	%rd22, %rd22, 8;
 	add.s64 	%rd21, %rd21, 4;
 	add.s32 	%r23, %r23, 1;
 	setp.lt.s32	%p6, %r23, %r22;
-	@%p6 bra 	BB0_3;
+	@%p6 bra 	BB2_3;
+
+BB2_6:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_f
+.visible .entry slice_sparse_dense_row_f(
+	.param .u64 slice_sparse_dense_row_f_param_0,
+	.param .u64 slice_sparse_dense_row_f_param_1,
+	.param .u64 slice_sparse_dense_row_f_param_2,
+	.param .u64 slice_sparse_dense_row_f_param_3,
+	.param .u32 slice_sparse_dense_row_f_param_4,
+	.param .u32 slice_sparse_dense_row_f_param_5,
+	.param .u32 slice_sparse_dense_row_f_param_6,
+	.param .u32 slice_sparse_dense_row_f_param_7,
+	.param .u32 slice_sparse_dense_row_f_param_8
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_f_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_f_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_f_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_f_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_f_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_f_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_f_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_f_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_f_param_8];
+	mov.u32 	%r17, %ntid.x;
+	mov.u32 	%r18, %ctaid.x;
+	mov.u32 	%r19, %tid.x;
+	mad.lo.s32 	%r1, %r17, %r18, %r19;
+	add.s32 	%r2, %r1, %r15;
+	setp.gt.s32	%p1, %r2, %r16;
+	@%p1 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd13, %rd10;
+	mul.wide.s32 	%rd14, %r2, 4;
+	add.s64 	%rd1, %rd13, %rd14;
+	ld.global.u32 	%r23, [%rd1];
+	ld.global.u32 	%r22, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r22;
+	@%p2 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd2, %rd12;
+	cvta.to.global.u64 	%rd15, %rd9;
+	cvta.to.global.u64 	%rd16, %rd11;
+	mul.lo.s32 	%r20, %r1, %r14;
+	sub.s32 	%r5, %r20, %r12;
+	mul.wide.s32 	%rd17, %r23, 4;
+	add.s64 	%rd21, %rd15, %rd17;
+	add.s64 	%rd20, %rd16, %rd17;
+
+BB3_3:
+	ld.global.u32 	%r8, [%rd20];
+	setp.lt.s32	%p3, %r8, %r12;
+	setp.gt.s32	%p4, %r8, %r13;
+	or.pred  	%p5, %p3, %p4;
+	@%p5 bra 	BB3_5;
+
+	ld.global.f32 	%f1, [%rd21];
+	add.s32 	%r21, %r5, %r8;
+	mul.wide.s32 	%rd18, %r21, 4;
+	add.s64 	%rd19, %rd2, %rd18;
+	st.global.f32 	[%rd19], %f1;
+	ld.global.u32 	%r22, [%rd1+4];
+
+BB3_5:
+	add.s64 	%rd21, %rd21, 4;
+	add.s64 	%rd20, %rd20, 4;
+	add.s32 	%r23, %r23, 1;
+	setp.lt.s32	%p6, %r23, %r22;
+	@%p6 bra 	BB3_3;
 
-BB0_6:
+BB3_6:
 	ret;
 }
 
-	// .globl	slice_sparse_dense_nnz
-.visible .entry slice_sparse_dense_nnz(
-	.param .u64 slice_sparse_dense_nnz_param_0,
-	.param .u64 slice_sparse_dense_nnz_param_1,
-	.param .u64 slice_sparse_dense_nnz_param_2,
-	.param .u64 slice_sparse_dense_nnz_param_3,
-	.param .u32 slice_sparse_dense_nnz_param_4,
-	.param .u32 slice_sparse_dense_nnz_param_5,
-	.param .u32 slice_sparse_dense_nnz_param_6,
-	.param .u32 slice_sparse_dense_nnz_param_7,
-	.param .u32 slice_sparse_dense_nnz_param_8
+	// .globl	slice_sparse_dense_nnz_d
+.visible .entry slice_sparse_dense_nnz_d(
+	.param .u64 slice_sparse_dense_nnz_d_param_0,
+	.param .u64 slice_sparse_dense_nnz_d_param_1,
+	.param .u64 slice_sparse_dense_nnz_d_param_2,
+	.param .u64 slice_sparse_dense_nnz_d_param_3,
+	.param .u32 slice_sparse_dense_nnz_d_param_4,
+	.param .u32 slice_sparse_dense_nnz_d_param_5,
+	.param .u32 slice_sparse_dense_nnz_d_param_6,
+	.param .u32 slice_sparse_dense_nnz_d_param_7,
+	.param .u32 slice_sparse_dense_nnz_d_param_8
 )
 {
 	.reg .pred 	%p<6>;
@@ -124,15 +279,15 @@ BB0_6:
 	.reg .b64 	%rd<22>;
 
 
-	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_param_0];
-	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_param_1];
-	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_param_2];
-	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_param_3];
-	ld.param.u32 	%r5, [slice_sparse_dense_nnz_param_4];
-	ld.param.u32 	%r9, [slice_sparse_dense_nnz_param_5];
-	ld.param.u32 	%r6, [slice_sparse_dense_nnz_param_6];
-	ld.param.u32 	%r7, [slice_sparse_dense_nnz_param_7];
-	ld.param.u32 	%r8, [slice_sparse_dense_nnz_param_8];
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_d_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_d_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_d_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_d_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_d_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_d_param_8];
 	mov.u32 	%r10, %ntid.x;
 	mov.u32 	%r11, %ctaid.x;
 	mov.u32 	%r12, %tid.x;
@@ -146,7 +301,7 @@ BB0_6:
 	add.s64 	%rd12, %rd1, %rd11;
 	ld.global.u32 	%r15, [%rd12+4];
 	setp.ge.s32	%p1, %r1, %r15;
-	@%p1 bra 	BB1_5;
+	@%p1 bra 	BB4_5;
 
 	cvta.to.global.u64 	%rd2, %rd7;
 	cvta.to.global.u64 	%rd3, %rd5;
@@ -158,11 +313,11 @@ BB0_6:
 	setp.lt.s32	%p2, %r2, %r6;
 	setp.gt.s32	%p3, %r2, %r7;
 	or.pred  	%p4, %p2, %p3;
-	@%p4 bra 	BB1_5;
+	@%p4 bra 	BB4_5;
 
 	mov.u32 	%r21, %r5;
 
-BB1_3:
+BB4_3:
 	mov.u32 	%r3, %r21;
 	add.s32 	%r4, %r3, 1;
 	mul.wide.s32 	%rd16, %r4, 4;
@@ -170,7 +325,7 @@ BB1_3:
 	ld.global.u32 	%r16, [%rd17];
 	setp.le.s32	%p5, %r16, %r1;
 	mov.u32 	%r21, %r4;
-	@%p5 bra 	BB1_3;
+	@%p5 bra 	BB4_3;
 
 	shl.b64 	%rd18, %rd4, 3;
 	add.s64 	%rd19, %rd3, %rd18;
@@ -183,21 +338,103 @@ BB1_3:
 	add.s64 	%rd21, %rd2, %rd20;
 	st.global.f64 	[%rd21], %fd1;
 
-BB1_5:
+BB4_5:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_nnz_f
+.visible .entry slice_sparse_dense_nnz_f(
+	.param .u64 slice_sparse_dense_nnz_f_param_0,
+	.param .u64 slice_sparse_dense_nnz_f_param_1,
+	.param .u64 slice_sparse_dense_nnz_f_param_2,
+	.param .u64 slice_sparse_dense_nnz_f_param_3,
+	.param .u32 slice_sparse_dense_nnz_f_param_4,
+	.param .u32 slice_sparse_dense_nnz_f_param_5,
+	.param .u32 slice_sparse_dense_nnz_f_param_6,
+	.param .u32 slice_sparse_dense_nnz_f_param_7,
+	.param .u32 slice_sparse_dense_nnz_f_param_8
+)
+{
+	.reg .pred 	%p<6>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<22>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_f_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_f_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_f_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_f_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_f_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_f_param_8];
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %ctaid.x;
+	mov.u32 	%r12, %tid.x;
+	mad.lo.s32 	%r13, %r10, %r11, %r12;
+	cvta.to.global.u64 	%rd1, %rd8;
+	mul.wide.s32 	%rd9, %r5, 4;
+	add.s64 	%rd10, %rd1, %rd9;
+	ld.global.u32 	%r14, [%rd10];
+	add.s32 	%r1, %r13, %r14;
+	mul.wide.s32 	%rd11, %r9, 4;
+	add.s64 	%rd12, %rd1, %rd11;
+	ld.global.u32 	%r15, [%rd12+4];
+	setp.ge.s32	%p1, %r1, %r15;
+	@%p1 bra 	BB5_5;
+
+	cvta.to.global.u64 	%rd2, %rd7;
+	cvta.to.global.u64 	%rd3, %rd5;
+	cvta.to.global.u64 	%rd13, %rd6;
+	cvt.s64.s32	%rd4, %r1;
+	mul.wide.s32 	%rd14, %r1, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	ld.global.u32 	%r2, [%rd15];
+	setp.lt.s32	%p2, %r2, %r6;
+	setp.gt.s32	%p3, %r2, %r7;
+	or.pred  	%p4, %p2, %p3;
+	@%p4 bra 	BB5_5;
+
+	mov.u32 	%r21, %r5;
+
+BB5_3:
+	mov.u32 	%r3, %r21;
+	add.s32 	%r4, %r3, 1;
+	mul.wide.s32 	%rd16, %r4, 4;
+	add.s64 	%rd17, %rd1, %rd16;
+	ld.global.u32 	%r16, [%rd17];
+	setp.le.s32	%p5, %r16, %r1;
+	mov.u32 	%r21, %r4;
+	@%p5 bra 	BB5_3;
+
+	shl.b64 	%rd18, %rd4, 2;
+	add.s64 	%rd19, %rd3, %rd18;
+	ld.global.f32 	%f1, [%rd19];
+	sub.s32 	%r17, %r3, %r5;
+	mul.lo.s32 	%r18, %r17, %r8;
+	sub.s32 	%r19, %r18, %r6;
+	add.s32 	%r20, %r19, %r2;
+	mul.wide.s32 	%rd20, %r20, 4;
+	add.s64 	%rd21, %rd2, %rd20;
+	st.global.f32 	[%rd21], %f1;
+
+BB5_5:
 	ret;
 }
 
-	// .globl	slice_dense_dense
-.visible .entry slice_dense_dense(
-	.param .u64 slice_dense_dense_param_0,
-	.param .u64 slice_dense_dense_param_1,
-	.param .u32 slice_dense_dense_param_2,
-	.param .u32 slice_dense_dense_param_3,
-	.param .u32 slice_dense_dense_param_4,
-	.param .u32 slice_dense_dense_param_5,
-	.param .u32 slice_dense_dense_param_6,
-	.param .u32 slice_dense_dense_param_7,
-	.param .u32 slice_dense_dense_param_8
+	// .globl	slice_dense_dense_d
+.visible .entry slice_dense_dense_d(
+	.param .u64 slice_dense_dense_d_param_0,
+	.param .u64 slice_dense_dense_d_param_1,
+	.param .u32 slice_dense_dense_d_param_2,
+	.param .u32 slice_dense_dense_d_param_3,
+	.param .u32 slice_dense_dense_d_param_4,
+	.param .u32 slice_dense_dense_d_param_5,
+	.param .u32 slice_dense_dense_d_param_6,
+	.param .u32 slice_dense_dense_d_param_7,
+	.param .u32 slice_dense_dense_d_param_8
 )
 {
 	.reg .pred 	%p<4>;
@@ -206,13 +443,13 @@ BB1_5:
 	.reg .b64 	%rd<9>;
 
 
-	ld.param.u64 	%rd1, [slice_dense_dense_param_0];
-	ld.param.u64 	%rd2, [slice_dense_dense_param_1];
-	ld.param.u32 	%r3, [slice_dense_dense_param_2];
-	ld.param.u32 	%r4, [slice_dense_dense_param_4];
-	ld.param.u32 	%r5, [slice_dense_dense_param_6];
-	ld.param.u32 	%r7, [slice_dense_dense_param_7];
-	ld.param.u32 	%r6, [slice_dense_dense_param_8];
+	ld.param.u64 	%rd1, [slice_dense_dense_d_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_d_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_d_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_d_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_d_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_d_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_d_param_8];
 	mov.u32 	%r8, %ctaid.x;
 	mov.u32 	%r9, %ntid.x;
 	mov.u32 	%r10, %tid.x;
@@ -221,10 +458,10 @@ BB1_5:
 	setp.lt.s32	%p1, %r2, %r7;
 	setp.gt.s32	%p2, %r6, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB2_2;
-	bra.uni 	BB2_1;
+	@!%p3 bra 	BB6_2;
+	bra.uni 	BB6_1;
 
-BB2_1:
+BB6_1:
 	rem.s32 	%r11, %r1, %r6;
 	cvta.to.global.u64 	%rd3, %rd1;
 	add.s32 	%r12, %r2, %r3;
@@ -238,15 +475,70 @@ BB2_1:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd1;
 
-BB2_2:
+BB6_2:
+	ret;
+}
+
+	// .globl	slice_dense_dense_f
+.visible .entry slice_dense_dense_f(
+	.param .u64 slice_dense_dense_f_param_0,
+	.param .u64 slice_dense_dense_f_param_1,
+	.param .u32 slice_dense_dense_f_param_2,
+	.param .u32 slice_dense_dense_f_param_3,
+	.param .u32 slice_dense_dense_f_param_4,
+	.param .u32 slice_dense_dense_f_param_5,
+	.param .u32 slice_dense_dense_f_param_6,
+	.param .u32 slice_dense_dense_f_param_7,
+	.param .u32 slice_dense_dense_f_param_8
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<15>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [slice_dense_dense_f_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_f_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_f_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_f_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_f_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_f_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_f_param_8];
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r8, %r10;
+	div.s32 	%r2, %r1, %r6;
+	setp.lt.s32	%p1, %r2, %r7;
+	setp.gt.s32	%p2, %r6, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB7_2;
+	bra.uni 	BB7_1;
+
+BB7_1:
+	rem.s32 	%r11, %r1, %r6;
+	cvta.to.global.u64 	%rd3, %rd1;
+	add.s32 	%r12, %r2, %r3;
+	add.s32 	%r13, %r11, %r4;
+	mad.lo.s32 	%r14, %r12, %r5, %r13;
+	mul.wide.s32 	%rd4, %r14, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB7_2:
 	ret;
 }
 
-	// .globl	copy_u2l_dense
-.visible .entry copy_u2l_dense(
-	.param .u64 copy_u2l_dense_param_0,
-	.param .u32 copy_u2l_dense_param_1,
-	.param .u32 copy_u2l_dense_param_2
+	// .globl	copy_u2l_dense_d
+.visible .entry copy_u2l_dense_d(
+	.param .u64 copy_u2l_dense_d_param_0,
+	.param .u32 copy_u2l_dense_d_param_1,
+	.param .u32 copy_u2l_dense_d_param_2
 )
 {
 	.reg .pred 	%p<4>;
@@ -255,9 +547,9 @@ BB2_2:
 	.reg .b64 	%rd<7>;
 
 
-	ld.param.u64 	%rd1, [copy_u2l_dense_param_0];
-	ld.param.u32 	%r3, [copy_u2l_dense_param_1];
-	ld.param.u32 	%r4, [copy_u2l_dense_param_2];
+	ld.param.u64 	%rd1, [copy_u2l_dense_d_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_d_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_d_param_2];
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %ctaid.x;
 	mov.u32 	%r7, %tid.x;
@@ -268,10 +560,10 @@ BB2_2:
 	setp.gt.s32	%p1, %r9, %r8;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB3_2;
-	bra.uni 	BB3_1;
+	@!%p3 bra 	BB8_2;
+	bra.uni 	BB8_1;
 
-BB3_1:
+BB8_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
@@ -280,16 +572,58 @@ BB3_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB3_2:
+BB8_2:
+	ret;
+}
+
+	// .globl	copy_u2l_dense_f
+.visible .entry copy_u2l_dense_f(
+	.param .u64 copy_u2l_dense_f_param_0,
+	.param .u32 copy_u2l_dense_f_param_1,
+	.param .u32 copy_u2l_dense_f_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<10>;
+	.reg .b64 	%rd<7>;
+
+
+	ld.param.u64 	%rd1, [copy_u2l_dense_f_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_f_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_f_param_2];
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %ctaid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r6, %r7;
+	div.s32 	%r8, %r1, %r3;
+	rem.s32 	%r9, %r1, %r3;
+	mad.lo.s32 	%r2, %r9, %r3, %r8;
+	setp.gt.s32	%p1, %r9, %r8;
+	setp.lt.s32	%p2, %r2, %r4;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB9_2;
+	bra.uni 	BB9_1;
+
+BB9_1:
+	cvta.to.global.u64 	%rd2, %rd1;
+	mul.wide.s32 	%rd3, %r1, 4;
+	add.s64 	%rd4, %rd2, %rd3;
+	ld.global.f32 	%f1, [%rd4];
+	mul.wide.s32 	%rd5, %r2, 4;
+	add.s64 	%rd6, %rd2, %rd5;
+	st.global.f32 	[%rd6], %f1;
+
+BB9_2:
 	ret;
 }
 
-	// .globl	relu
-.visible .entry relu(
-	.param .u64 relu_param_0,
-	.param .u64 relu_param_1,
-	.param .u32 relu_param_2,
-	.param .u32 relu_param_3
+	// .globl	relu_d
+.visible .entry relu_d(
+	.param .u64 relu_d_param_0,
+	.param .u64 relu_d_param_1,
+	.param .u32 relu_d_param_2,
+	.param .u32 relu_d_param_3
 )
 {
 	.reg .pred 	%p<4>;
@@ -298,10 +632,10 @@ BB3_2:
 	.reg .b64 	%rd<8>;
 
 
-	ld.param.u64 	%rd1, [relu_param_0];
-	ld.param.u64 	%rd2, [relu_param_1];
-	ld.param.u32 	%r2, [relu_param_2];
-	ld.param.u32 	%r3, [relu_param_3];
+	ld.param.u64 	%rd1, [relu_d_param_0];
+	ld.param.u64 	%rd2, [relu_d_param_1];
+	ld.param.u32 	%r2, [relu_d_param_2];
+	ld.param.u32 	%r3, [relu_d_param_3];
 	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
@@ -310,10 +644,10 @@ BB3_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB4_2;
-	bra.uni 	BB4_1;
+	@!%p3 bra 	BB10_2;
+	bra.uni 	BB10_1;
 
-BB4_1:
+BB10_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -324,17 +658,64 @@ BB4_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB4_2:
+BB10_2:
+	ret;
+}
+
+	// .globl	relu_f
+.visible .entry relu_f(
+	.param .u64 relu_f_param_0,
+	.param .u64 relu_f_param_1,
+	.param .u32 relu_f_param_2,
+	.param .u32 relu_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [relu_f_param_0];
+	ld.param.u64 	%rd2, [relu_f_param_1];
+	ld.param.u32 	%r2, [relu_f_param_2];
+	ld.param.u32 	%r3, [relu_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB11_2;
+	bra.uni 	BB11_1;
+
+BB11_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	mov.f64 	%fd2, 0d0000000000000000;
+	max.f64 	%fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f2, %fd3;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f32 	[%rd7], %f2;
+
+BB11_2:
 	ret;
 }
 
-	// .globl	relu_backward
-.visible .entry relu_backward(
-	.param .u64 relu_backward_param_0,
-	.param .u64 relu_backward_param_1,
-	.param .u64 relu_backward_param_2,
-	.param .u32 relu_backward_param_3,
-	.param .u32 relu_backward_param_4
+	// .globl	relu_backward_d
+.visible .entry relu_backward_d(
+	.param .u64 relu_backward_d_param_0,
+	.param .u64 relu_backward_d_param_1,
+	.param .u64 relu_backward_d_param_2,
+	.param .u32 relu_backward_d_param_3,
+	.param .u32 relu_backward_d_param_4
 )
 {
 	.reg .pred 	%p<5>;
@@ -343,11 +724,11 @@ BB4_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd2, [relu_backward_param_0];
-	ld.param.u64 	%rd3, [relu_backward_param_1];
-	ld.param.u64 	%rd4, [relu_backward_param_2];
-	ld.param.u32 	%r2, [relu_backward_param_3];
-	ld.param.u32 	%r3, [relu_backward_param_4];
+	ld.param.u64 	%rd2, [relu_backward_d_param_0];
+	ld.param.u64 	%rd3, [relu_backward_d_param_1];
+	ld.param.u64 	%rd4, [relu_backward_d_param_2];
+	ld.param.u32 	%r2, [relu_backward_d_param_3];
+	ld.param.u32 	%r3, [relu_backward_d_param_4];
 	mov.u32 	%r4, %ntid.x;
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %tid.x;
@@ -356,10 +737,10 @@ BB4_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB5_4;
-	bra.uni 	BB5_1;
+	@!%p3 bra 	BB12_4;
+	bra.uni 	BB12_1;
 
-BB5_1:
+BB12_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	cvt.s64.s32	%rd1, %r1;
 	mul.wide.s32 	%rd6, %r1, 8;
@@ -367,42 +748,98 @@ BB5_1:
 	ld.global.f64 	%fd4, [%rd7];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB5_3;
+	@%p4 bra 	BB12_3;
 
 	cvta.to.global.u64 	%rd8, %rd3;
 	shl.b64 	%rd9, %rd1, 3;
 	add.s64 	%rd10, %rd8, %rd9;
 	ld.global.f64 	%fd5, [%rd10];
 
-BB5_3:
+BB12_3:
 	cvta.to.global.u64 	%rd11, %rd4;
 	shl.b64 	%rd12, %rd1, 3;
 	add.s64 	%rd13, %rd11, %rd12;
 	st.global.f64 	[%rd13], %fd5;
 
-BB5_4:
+BB12_4:
 	ret;
 }
 
-	// .globl	inplace_add
-.visible .entry inplace_add(
-	.param .u64 inplace_add_param_0,
-	.param .u64 inplace_add_param_1,
-	.param .u32 inplace_add_param_2,
-	.param .u32 inplace_add_param_3
+	// .globl	relu_backward_f
+.visible .entry relu_backward_f(
+	.param .u64 relu_backward_f_param_0,
+	.param .u64 relu_backward_f_param_1,
+	.param .u64 relu_backward_f_param_2,
+	.param .u32 relu_backward_f_param_3,
+	.param .u32 relu_backward_f_param_4
 )
 {
-	.reg .pred 	%p<4>;
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
 	.reg .b32 	%r<8>;
-	.reg .f64 	%fd<4>;
-	.reg .b64 	%rd<8>;
+	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd1, [inplace_add_param_0];
-	ld.param.u64 	%rd2, [inplace_add_param_1];
-	ld.param.u32 	%r2, [inplace_add_param_2];
-	ld.param.u32 	%r3, [inplace_add_param_3];
-	mov.u32 	%r4, %ctaid.x;
+	ld.param.u64 	%rd2, [relu_backward_f_param_0];
+	ld.param.u64 	%rd3, [relu_backward_f_param_1];
+	ld.param.u64 	%rd4, [relu_backward_f_param_2];
+	ld.param.u32 	%r2, [relu_backward_f_param_3];
+	ld.param.u32 	%r3, [relu_backward_f_param_4];
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r5, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB13_4;
+	bra.uni 	BB13_1;
+
+BB13_1:
+	cvta.to.global.u64 	%rd5, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd6, %r1, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f4, [%rd7];
+	mov.f32 	%f5, 0f00000000;
+	setp.leu.f32	%p4, %f4, 0f00000000;
+	@%p4 bra 	BB13_3;
+
+	cvta.to.global.u64 	%rd8, %rd3;
+	shl.b64 	%rd9, %rd1, 2;
+	add.s64 	%rd10, %rd8, %rd9;
+	ld.global.f32 	%f5, [%rd10];
+
+BB13_3:
+	cvta.to.global.u64 	%rd11, %rd4;
+	shl.b64 	%rd12, %rd1, 2;
+	add.s64 	%rd13, %rd11, %rd12;
+	st.global.f32 	[%rd13], %f5;
+
+BB13_4:
+	ret;
+}
+
+	// .globl	inplace_add_d
+.visible .entry inplace_add_d(
+	.param .u64 inplace_add_d_param_0,
+	.param .u64 inplace_add_d_param_1,
+	.param .u32 inplace_add_d_param_2,
+	.param .u32 inplace_add_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_d_param_0];
+	ld.param.u64 	%rd2, [inplace_add_d_param_1];
+	ld.param.u32 	%r2, [inplace_add_d_param_2];
+	ld.param.u32 	%r3, [inplace_add_d_param_3];
+	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
 	mad.lo.s32 	%r1, %r5, %r4, %r6;
@@ -410,10 +847,10 @@ BB5_4:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB6_2;
-	bra.uni 	BB6_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB6_1:
+BB14_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -424,18 +861,62 @@ BB6_1:
 	add.f64 	%fd3, %fd2, %fd1;
 	st.global.f64 	[%rd7], %fd3;
 
-BB6_2:
+BB14_2:
+	ret;
+}
+
+	// .globl	inplace_add_f
+.visible .entry inplace_add_f(
+	.param .u64 inplace_add_f_param_0,
+	.param .u64 inplace_add_f_param_1,
+	.param .u32 inplace_add_f_param_2,
+	.param .u32 inplace_add_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<8>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_f_param_0];
+	ld.param.u64 	%rd2, [inplace_add_f_param_1];
+	ld.param.u32 	%r2, [inplace_add_f_param_2];
+	ld.param.u32 	%r3, [inplace_add_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB15_2;
+	bra.uni 	BB15_1;
+
+BB15_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	ld.global.f32 	%f1, [%rd7];
+	ld.global.f32 	%f2, [%rd5];
+	add.f32 	%f3, %f2, %f1;
+	st.global.f32 	[%rd7], %f3;
+
+BB15_2:
 	ret;
 }
 
-	// .globl	bias_add
-.visible .entry bias_add(
-	.param .u64 bias_add_param_0,
-	.param .u64 bias_add_param_1,
-	.param .u64 bias_add_param_2,
-	.param .u32 bias_add_param_3,
-	.param .u32 bias_add_param_4,
-	.param .u32 bias_add_param_5
+	// .globl	bias_add_d
+.visible .entry bias_add_d(
+	.param .u64 bias_add_d_param_0,
+	.param .u64 bias_add_d_param_1,
+	.param .u64 bias_add_d_param_2,
+	.param .u32 bias_add_d_param_3,
+	.param .u32 bias_add_d_param_4,
+	.param .u32 bias_add_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -444,12 +925,12 @@ BB6_2:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_add_param_0];
-	ld.param.u64 	%rd2, [bias_add_param_1];
-	ld.param.u64 	%rd3, [bias_add_param_2];
-	ld.param.u32 	%r4, [bias_add_param_3];
-	ld.param.u32 	%r2, [bias_add_param_4];
-	ld.param.u32 	%r3, [bias_add_param_5];
+	ld.param.u64 	%rd1, [bias_add_d_param_0];
+	ld.param.u64 	%rd2, [bias_add_d_param_1];
+	ld.param.u64 	%rd3, [bias_add_d_param_2];
+	ld.param.u32 	%r4, [bias_add_d_param_3];
+	ld.param.u32 	%r2, [bias_add_d_param_4];
+	ld.param.u32 	%r3, [bias_add_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -458,10 +939,10 @@ BB6_2:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB7_2;
-	bra.uni 	BB7_1;
+	@!%p3 bra 	BB16_2;
+	bra.uni 	BB16_1;
 
-BB7_1:
+BB16_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -477,20 +958,73 @@ BB7_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB7_2:
+BB16_2:
+	ret;
+}
+
+	// .globl	bias_add_f
+.visible .entry bias_add_f(
+	.param .u64 bias_add_f_param_0,
+	.param .u64 bias_add_f_param_1,
+	.param .u64 bias_add_f_param_2,
+	.param .u32 bias_add_f_param_3,
+	.param .u32 bias_add_f_param_4,
+	.param .u32 bias_add_f_param_5
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
+
+
+	ld.param.u64 	%rd1, [bias_add_f_param_0];
+	ld.param.u64 	%rd2, [bias_add_f_param_1];
+	ld.param.u64 	%rd3, [bias_add_f_param_2];
+	ld.param.u32 	%r4, [bias_add_f_param_3];
+	ld.param.u32 	%r2, [bias_add_f_param_4];
+	ld.param.u32 	%r3, [bias_add_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB17_2;
+	bra.uni 	BB17_1;
+
+BB17_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	add.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
+
+BB17_2:
 	ret;
 }
 
-	// .globl	daxpy_matrix_vector
-.visible .entry daxpy_matrix_vector(
-	.param .u64 daxpy_matrix_vector_param_0,
-	.param .u64 daxpy_matrix_vector_param_1,
-	.param .f64 daxpy_matrix_vector_param_2,
-	.param .u64 daxpy_matrix_vector_param_3,
-	.param .u32 daxpy_matrix_vector_param_4,
-	.param .u32 daxpy_matrix_vector_param_5,
-	.param .u32 daxpy_matrix_vector_param_6,
-	.param .u32 daxpy_matrix_vector_param_7
+	// .globl	daxpy_matrix_vector_d
+.visible .entry daxpy_matrix_vector_d(
+	.param .u64 daxpy_matrix_vector_d_param_0,
+	.param .u64 daxpy_matrix_vector_d_param_1,
+	.param .f64 daxpy_matrix_vector_d_param_2,
+	.param .u64 daxpy_matrix_vector_d_param_3,
+	.param .u32 daxpy_matrix_vector_d_param_4,
+	.param .u32 daxpy_matrix_vector_d_param_5,
+	.param .u32 daxpy_matrix_vector_d_param_6,
+	.param .u32 daxpy_matrix_vector_d_param_7
 )
 {
 	.reg .pred 	%p<5>;
@@ -499,13 +1033,13 @@ BB7_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd3, [daxpy_matrix_vector_param_0];
-	ld.param.u64 	%rd5, [daxpy_matrix_vector_param_1];
-	ld.param.f64 	%fd2, [daxpy_matrix_vector_param_2];
-	ld.param.u64 	%rd4, [daxpy_matrix_vector_param_3];
-	ld.param.u32 	%r5, [daxpy_matrix_vector_param_4];
-	ld.param.u32 	%r3, [daxpy_matrix_vector_param_5];
-	ld.param.u32 	%r4, [daxpy_matrix_vector_param_6];
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_d_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_d_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_d_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_d_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_d_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_d_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_d_param_6];
 	cvta.to.global.u64 	%rd1, %rd5;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %ctaid.x;
@@ -516,10 +1050,10 @@ BB7_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB8_4;
-	bra.uni 	BB8_1;
+	@!%p3 bra 	BB18_4;
+	bra.uni 	BB18_1;
 
-BB8_1:
+BB18_1:
 	cvta.to.global.u64 	%rd6, %rd4;
 	mad.lo.s32 	%r10, %r1, %r3, %r2;
 	cvta.to.global.u64 	%rd7, %rd3;
@@ -528,36 +1062,111 @@ BB8_1:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd2, %rd6, %rd8;
 	setp.eq.s32	%p4, %r4, 1;
-	@%p4 bra 	BB8_3;
-	bra.uni 	BB8_2;
+	@%p4 bra 	BB18_3;
+	bra.uni 	BB18_2;
 
-BB8_3:
+BB18_3:
 	mul.wide.s32 	%rd12, %r2, 8;
 	add.s64 	%rd13, %rd1, %rd12;
 	ld.global.f64 	%fd5, [%rd13];
 	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd6;
-	bra.uni 	BB8_4;
+	bra.uni 	BB18_4;
 
-BB8_2:
+BB18_2:
 	mul.wide.s32 	%rd10, %r1, 8;
 	add.s64 	%rd11, %rd1, %rd10;
 	ld.global.f64 	%fd3, [%rd11];
 	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd4;
 
-BB8_4:
+BB18_4:
+	ret;
+}
+
+	// .globl	daxpy_matrix_vector_f
+.visible .entry daxpy_matrix_vector_f(
+	.param .u64 daxpy_matrix_vector_f_param_0,
+	.param .u64 daxpy_matrix_vector_f_param_1,
+	.param .f64 daxpy_matrix_vector_f_param_2,
+	.param .u64 daxpy_matrix_vector_f_param_3,
+	.param .u32 daxpy_matrix_vector_f_param_4,
+	.param .u32 daxpy_matrix_vector_f_param_5,
+	.param .u32 daxpy_matrix_vector_f_param_6,
+	.param .u32 daxpy_matrix_vector_f_param_7
+)
+{
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<7>;
+	.reg .b64 	%rd<14>;
+
+
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_f_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_f_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_f_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_f_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_f_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_f_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_f_param_6];
+	cvta.to.global.u64 	%rd1, %rd5;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r9, %r6, %r7, %r8;
+	div.s32 	%r1, %r9, %r3;
+	rem.s32 	%r2, %r9, %r3;
+	setp.lt.s32	%p1, %r1, %r5;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB19_4;
+	bra.uni 	BB19_1;
+
+BB19_1:
+	cvta.to.global.u64 	%rd6, %rd4;
+	mad.lo.s32 	%r10, %r1, %r3, %r2;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	cvt.f64.f32	%fd1, %f1;
+	add.s64 	%rd2, %rd6, %rd8;
+	setp.eq.s32	%p4, %r4, 1;
+	@%p4 bra 	BB19_3;
+	bra.uni 	BB19_2;
+
+BB19_3:
+	mul.wide.s32 	%rd12, %r2, 4;
+	add.s64 	%rd13, %rd1, %rd12;
+	ld.global.f32 	%f4, [%rd13];
+	cvt.f64.f32	%fd5, %f4;
+	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
+	cvt.rn.f32.f64	%f5, %fd6;
+	st.global.f32 	[%rd2], %f5;
+	bra.uni 	BB19_4;
+
+BB19_2:
+	mul.wide.s32 	%rd10, %r1, 4;
+	add.s64 	%rd11, %rd1, %rd10;
+	ld.global.f32 	%f2, [%rd11];
+	cvt.f64.f32	%fd3, %f2;
+	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f3, %fd4;
+	st.global.f32 	[%rd2], %f3;
+
+BB19_4:
 	ret;
 }
 
-	// .globl	bias_multiply
-.visible .entry bias_multiply(
-	.param .u64 bias_multiply_param_0,
-	.param .u64 bias_multiply_param_1,
-	.param .u64 bias_multiply_param_2,
-	.param .u32 bias_multiply_param_3,
-	.param .u32 bias_multiply_param_4,
-	.param .u32 bias_multiply_param_5
+	// .globl	bias_multiply_d
+.visible .entry bias_multiply_d(
+	.param .u64 bias_multiply_d_param_0,
+	.param .u64 bias_multiply_d_param_1,
+	.param .u64 bias_multiply_d_param_2,
+	.param .u32 bias_multiply_d_param_3,
+	.param .u32 bias_multiply_d_param_4,
+	.param .u32 bias_multiply_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -566,12 +1175,12 @@ BB8_4:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_multiply_param_0];
-	ld.param.u64 	%rd2, [bias_multiply_param_1];
-	ld.param.u64 	%rd3, [bias_multiply_param_2];
-	ld.param.u32 	%r4, [bias_multiply_param_3];
-	ld.param.u32 	%r2, [bias_multiply_param_4];
-	ld.param.u32 	%r3, [bias_multiply_param_5];
+	ld.param.u64 	%rd1, [bias_multiply_d_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_d_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_d_param_2];
+	ld.param.u32 	%r4, [bias_multiply_d_param_3];
+	ld.param.u32 	%r2, [bias_multiply_d_param_4];
+	ld.param.u32 	%r3, [bias_multiply_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -580,10 +1189,10 @@ BB8_4:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB9_2;
-	bra.uni 	BB9_1;
+	@!%p3 bra 	BB20_2;
+	bra.uni 	BB20_1;
 
-BB9_1:
+BB20_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -599,110 +1208,89 @@ BB9_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB9_2:
+BB20_2:
 	ret;
 }
 
-	// .globl	compare_and_set
-.visible .entry compare_and_set(
-	.param .u64 compare_and_set_param_0,
-	.param .u64 compare_and_set_param_1,
-	.param .u32 compare_and_set_param_2,
-	.param .u32 compare_and_set_param_3,
-	.param .f64 compare_and_set_param_4,
-	.param .f64 compare_and_set_param_5,
-	.param .f64 compare_and_set_param_6,
-	.param .f64 compare_and_set_param_7,
-	.param .f64 compare_and_set_param_8
+	// .globl	bias_multiply_f
+.visible .entry bias_multiply_f(
+	.param .u64 bias_multiply_f_param_0,
+	.param .u64 bias_multiply_f_param_1,
+	.param .u64 bias_multiply_f_param_2,
+	.param .u32 bias_multiply_f_param_3,
+	.param .u32 bias_multiply_f_param_4,
+	.param .u32 bias_multiply_f_param_5
 )
 {
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<10>;
-	.reg .f64 	%fd<9>;
-	.reg .b64 	%rd<8>;
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd2, [compare_and_set_param_0];
-	ld.param.u64 	%rd3, [compare_and_set_param_1];
-	ld.param.u32 	%r2, [compare_and_set_param_2];
-	ld.param.u32 	%r3, [compare_and_set_param_3];
-	ld.param.f64 	%fd2, [compare_and_set_param_4];
-	ld.param.f64 	%fd3, [compare_and_set_param_5];
-	ld.param.f64 	%fd4, [compare_and_set_param_6];
-	ld.param.f64 	%fd5, [compare_and_set_param_7];
-	ld.param.f64 	%fd6, [compare_and_set_param_8];
-	mov.u32 	%r4, %ctaid.x;
-	mov.u32 	%r5, %ntid.x;
-	mov.u32 	%r6, %tid.x;
-	mad.lo.s32 	%r7, %r5, %r4, %r6;
-	div.s32 	%r8, %r7, %r3;
-	rem.s32 	%r9, %r7, %r3;
-	mad.lo.s32 	%r1, %r8, %r3, %r9;
-	setp.lt.s32	%p1, %r8, %r2;
-	setp.gt.s32	%p2, %r3, -1;
+	ld.param.u64 	%rd1, [bias_multiply_f_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_f_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_f_param_2];
+	ld.param.u32 	%r4, [bias_multiply_f_param_3];
+	ld.param.u32 	%r2, [bias_multiply_f_param_4];
+	ld.param.u32 	%r3, [bias_multiply_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB10_6;
-	bra.uni 	BB10_1;
+	@!%p3 bra 	BB21_2;
+	bra.uni 	BB21_1;
 
-BB10_1:
-	cvta.to.global.u64 	%rd4, %rd2;
-	mul.wide.s32 	%rd5, %r1, 8;
+BB21_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
 	add.s64 	%rd6, %rd4, %rd5;
-	ld.global.f64 	%fd1, [%rd6];
-	sub.f64 	%fd7, %fd1, %fd2;
-	abs.f64 	%fd8, %fd7;
-	setp.lt.f64	%p4, %fd8, %fd3;
-	cvta.to.global.u64 	%rd7, %rd3;
-	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB10_5;
-	bra.uni 	BB10_2;
-
-BB10_5:
-	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB10_6;
-
-BB10_2:
-	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB10_4;
-	bra.uni 	BB10_3;
-
-BB10_4:
-	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB10_6;
-
-BB10_3:
-	st.global.f64 	[%rd1], %fd6;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	mul.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
 
-BB10_6:
+BB21_2:
 	ret;
 }
 
-	// .globl	matrix_matrix_cellwise_op
-.visible .entry matrix_matrix_cellwise_op(
-	.param .u64 matrix_matrix_cellwise_op_param_0,
-	.param .u64 matrix_matrix_cellwise_op_param_1,
-	.param .u64 matrix_matrix_cellwise_op_param_2,
-	.param .u32 matrix_matrix_cellwise_op_param_3,
-	.param .u32 matrix_matrix_cellwise_op_param_4,
-	.param .u32 matrix_matrix_cellwise_op_param_5,
-	.param .u32 matrix_matrix_cellwise_op_param_6,
-	.param .u32 matrix_matrix_cellwise_op_param_7
+	// .globl	matrix_matrix_cellwise_op_d
+.visible .entry matrix_matrix_cellwise_op_d(
+	.param .u64 matrix_matrix_cellwise_op_d_param_0,
+	.param .u64 matrix_matrix_cellwise_op_d_param_1,
+	.param .u64 matrix_matrix_cellwise_op_d_param_2,
+	.param .u32 matrix_matrix_cellwise_op_d_param_3,
+	.param .u32 matrix_matrix_cellwise_op_d_param_4,
+	.param .u32 matrix_matrix_cellwise_op_d_param_5,
+	.param .u32 matrix_matrix_cellwise_op_d_param_6,
+	.param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-	.reg .pred 	%p<77>;
-	.reg .b32 	%r<65>;
-	.reg .f64 	%fd<55>;
+	.reg .pred 	%p<73>;
+	.reg .b32 	%r<66>;
+	.reg .f64 	%fd<56>;
 	.reg .b64 	%rd<19>;
 
 
-	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
-	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
-	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
-	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
-	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
-	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
-	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
-	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_d_param_0];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_d_param_1];
+	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_d_param_2];
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_d_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_d_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_d_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_d_param_7];
 	mov.u32 	%r15, %ntid.x;
 	mov.u32 	%r16, %ctaid.x;
 	mov.u32 	%r17, %tid.x;
@@ -712,93 +1300,93 @@ BB10_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB11_73;
-	bra.uni 	BB11_1;
+	@!%p4 bra 	BB22_77;
+	bra.uni 	BB22_1;
 
-BB11_1:
+BB22_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r63, %r1;
-	@%p5 bra 	BB11_5;
+	mov.u32 	%r64, %r1;
+	@%p5 bra 	BB22_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r64, %r3;
-	@%p6 bra 	BB11_4;
+	mov.u32 	%r65, %r3;
+	@%p6 bra 	BB22_4;
 
-	mov.u32 	%r64, %r2;
+	mov.u32 	%r65, %r2;
 
-BB11_4:
-	mov.u32 	%r58, %r64;
-	mov.u32 	%r4, %r58;
-	mov.u32 	%r63, %r4;
+BB22_4:
+	mov.u32 	%r59, %r65;
+	mov.u32 	%r4, %r59;
+	mov.u32 	%r64, %r4;
 
-BB11_5:
-	mov.u32 	%r5, %r63;
+BB22_5:
+	mov.u32 	%r5, %r64;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r61, %r1;
-	@%p7 bra 	BB11_9;
+	mov.u32 	%r62, %r1;
+	@%p7 bra 	BB22_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r62, %r3;
-	@%p8 bra 	BB11_8;
+	mov.u32 	%r63, %r3;
+	@%p8 bra 	BB22_8;
 
-	mov.u32 	%r62, %r2;
+	mov.u32 	%r63, %r2;
 
-BB11_8:
-	mov.u32 	%r61, %r62;
+BB22_8:
+	mov.u32 	%r62, %r63;
 
-BB11_9:
+BB22_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r61, 8;
+	mul.wide.s32 	%rd9, %r62, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
-	@%p9 bra 	BB11_26;
+	@%p9 bra 	BB22_26;
 
 	setp.gt.s32	%p23, %r13, 3;
-	@%p23 bra 	BB11_18;
+	@%p23 bra 	BB22_18;
 
 	setp.gt.s32	%p30, %r13, 1;
-	@%p30 bra 	BB11_15;
+	@%p30 bra 	BB22_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB11_71;
-	bra.uni 	BB11_13;
+	@%p33 bra 	BB22_75;
+	bra.uni 	BB22_13;
 
-BB11_71:
-	add.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_75:
+	add.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_26:
+BB22_26:
 	setp.gt.s32	%p10, %r13, 13;
-	@%p10 bra 	BB11_35;
+	@%p10 bra 	BB22_35;
 
 	setp.gt.s32	%p17, %r13, 10;
-	@%p17 bra 	BB11_31;
+	@%p17 bra 	BB22_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB11_53;
-	bra.uni 	BB11_29;
+	@%p21 bra 	BB22_55;
+	bra.uni 	BB22_29;
 
-BB11_53:
-	setp.eq.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB11_72;
+BB22_55:
+	setp.eq.f64	%p48, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB22_76;
 
-BB11_18:
+BB22_18:
 	setp.gt.s32	%p24, %r13, 5;
-	@%p24 bra 	BB11_22;
+	@%p24 bra 	BB22_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB11_56;
-	bra.uni 	BB11_20;
+	@%p28 bra 	BB22_58;
+	bra.uni 	BB22_20;
 
-BB11_56:
+BB22_58:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -811,7 +1399,7 @@ BB11_56:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p55, %rd1, -9223372036854775808;
+	setp.eq.s64	%p53, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -828,472 +1416,966 @@ BB11_56:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd53, [retval0+0];
+	ld.param.f64	%fd54, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p56, %r8, 0;
-	and.pred  	%p1, %p56, %p55;
-	@!%p1 bra 	BB11_58;
-	bra.uni 	BB11_57;
+	setp.lt.s32	%p54, %r8, 0;
+	and.pred  	%p1, %p54, %p53;
+	@!%p1 bra 	BB22_60;
+	bra.uni 	BB22_59;
 
-BB11_57:
+BB22_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd53;
+	mov.b64 	{%temp, %r33}, %fd54;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd53;
+	mov.b64 	{%r35, %temp}, %fd54;
 	}
-	mov.b64 	%fd53, {%r35, %r34};
+	mov.b64 	%fd54, {%r35, %r34};
 
-BB11_58:
-	mov.f64 	%fd52, %fd53;
-	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
-	@%p57 bra 	BB11_61;
-	bra.uni 	BB11_59;
+BB22_60:
+	mov.f64 	%fd53, %fd54;
+	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
+	@%p55 bra 	BB22_63;
+	bra.uni 	BB22_61;
 
-BB11_61:
-	selp.b32	%r36, %r8, 0, %p55;
+BB22_63:
+	selp.b32	%r36, %r8, 0, %p53;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p61, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p61;
+	setp.lt.s32	%p59, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p59;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd52, {%r39, %r38};
-	bra.uni 	BB11_62;
+	mov.b64 	%fd53, {%r39, %r38};
+	bra.uni 	BB22_64;
 
-BB11_35:
+BB22_35:
 	setp.gt.s32	%p11, %r13, 15;
-	@%p11 bra 	BB11_39;
+	@%p11 bra 	BB22_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB11_50;
-	bra.uni 	BB11_37;
+	@%p15 bra 	BB22_52;
+	bra.uni 	BB22_37;
 
-BB11_50:
+BB22_52:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p47, %r27, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p45, %r27, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB22_76;
 
-BB11_15:
+BB22_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB11_70;
-	bra.uni 	BB11_16;
+	@%p31 bra 	BB22_74;
+	bra.uni 	BB22_16;
 
-BB11_70:
-	mul.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_74:
+	mul.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_31:
+BB22_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB11_52;
+	@%p18 bra 	BB22_54;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB11_51;
-	bra.uni 	BB11_33;
+	@%p19 bra 	BB22_53;
+	bra.uni 	BB22_33;
 
-BB11_51:
-	max.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_53:
+	max.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_22:
+BB22_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB11_55;
+	@%p25 bra 	BB22_57;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB11_54;
-	bra.uni 	BB11_24;
+	@%p26 bra 	BB22_56;
+	bra.uni 	BB22_24;
 
-BB11_54:
-	setp.gt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB11_72;
+BB22_56:
+	setp.gt.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB22_76;
 
-BB11_39:
+BB22_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB11_49;
+	@%p12 bra 	BB22_51;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB11_45;
-	bra.uni 	BB11_41;
+	@%p13 bra 	BB22_46;
+	bra.uni 	BB22_41;
 
-BB11_45:
-	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
-	or.pred  	%p41, %p39, %p40;
-	mov.f64 	%fd54, 0d7FF8000000000000;
-	@%p41 bra 	BB11_72;
+BB22_46:
+	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
+	or.pred  	%p40, %p38, %p39;
+	mov.f64 	%fd55, 0d7FF8000000000000;
+	@%p40 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd39, %fd54;
-	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
-	@%p42 bra 	BB11_72;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd39, %fd55;
+	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
+	@%p41 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r22, %temp}, %fd54;
+	mov.b64 	{%temp, %r22}, %fd55;
 	}
+	and.b32  	%r23, %r22, 2147483647;
+	setp.ne.s32	%p42, %r23, 2146435072;
+	@%p42 bra 	BB22_50;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd54;
+	mov.b64 	{%r24, %temp}, %fd55;
 	}
-	and.b32  	%r24, %r23, 2147483647;
-	setp.ne.s32	%p43, %r24, 2146435072;
-	setp.ne.s32	%p44, %r22, 0;
-	or.pred  	%p45, %p43, %p44;
-	@!%p45 bra 	BB11_72;
-	bra.uni 	BB11_48;
-
-BB11_48:
-	cvt.rmi.f64.f64	%fd40, %fd54;
+	setp.eq.s32	%p43, %r24, 0;
+	@%p43 bra 	BB22_76;
+
+BB22_50:
+	cvt.rmi.f64.f64	%fd40, %fd55;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd54, %fd1, %fd41;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd1, %fd41;
+	bra.uni 	BB22_76;
 
-BB11_13:
+BB22_13:
 	setp.eq.s32	%p34, %r13, 1;
-	@%p34 bra 	BB11_14;
-	bra.uni 	BB11_72;
+	@%p34 bra 	BB22_14;
+	bra.uni 	BB22_76;
 
-BB11_14:
-	sub.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_14:
+	sub.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_29:
+BB22_29:
 	setp.eq.s32	%p22, %r13, 10;
-	@%p22 bra 	BB11_30;
-	bra.uni 	BB11_72;
+	@%p22 bra 	BB22_30;
+	bra.uni 	BB22_76;
 
-BB11_30:
-	setp.neu.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB11_72;
+BB22_30:
+	setp.neu.f64	%p47, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB22_76;
 
-BB11_20:
+BB22_20:
 	setp.eq.s32	%p29, %r13, 5;
-	@%p29 bra 	BB11_21;
-	bra.uni 	BB11_72;
+	@%p29 bra 	BB22_21;
+	bra.uni 	BB22_76;
 
-BB11_21:
-	setp.lt.f64	%p54, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-	bra.uni 	BB11_72;
+BB22_21:
+	setp.lt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB22_76;
 
-BB11_37:
+BB22_37:
 	setp.eq.s32	%p16, %r13, 15;
-	@%p16 bra 	BB11_38;
-	bra.uni 	BB11_72;
+	@%p16 bra 	BB22_38;
+	bra.uni 	BB22_76;
 
-BB11_38:
+BB22_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd54, %fd44, %fd43;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd44, %fd43;
+	bra.uni 	BB22_76;
 
-BB11_16:
+BB22_16:
 	setp.eq.s32	%p32, %r13, 3;
-	@%p32 bra 	BB11_17;
-	bra.uni 	BB11_72;
+	@%p32 bra 	BB22_17;
+	bra.uni 	BB22_76;
 
-BB11_17:
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_17:
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_52:
-	min.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_54:
+	min.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_33:
+BB22_33:
 	setp.eq.s32	%p20, %r13, 13;
-	@%p20 bra 	BB11_34;
-	bra.uni 	BB11_72;
+	@%p20 bra 	BB22_34;
+	bra.uni 	BB22_76;
 
-BB11_34:
+BB22_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p48, %r30, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p46, %r30, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
+	bra.uni 	BB22_76;
 
-BB11_55:
-	setp.le.f64	%p53, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-	bra.uni 	BB11_72;
+BB22_57:
+	setp.gtu.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p51;
+	bra.uni 	BB22_76;
 
-BB11_24:
+BB22_24:
 	setp.eq.s32	%p27, %r13, 8;
-	@%p27 bra 	BB11_25;
-	bra.uni 	BB11_72;
+	@%p27 bra 	BB22_25;
+	bra.uni 	BB22_76;
 
-BB11_25:
-	setp.ge.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB11_72;
+BB22_25:
+	setp.ltu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p49;
+	bra.uni 	BB22_76;
 
-BB11_49:
-	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
+BB22_51:
+	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
-	bra.uni 	BB11_72;
+	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
+	bra.uni 	BB22_76;
 
-BB11_41:
+BB22_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB11_72;
+	@%p14 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd37, %fd54;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd37, %fd55;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB11_72;
+	@%p35 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd54;
+	mov.b64 	{%temp, %r19}, %fd55;
 	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p36, %r20, 2146435072;
+	@%p36 bra 	BB22_45;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd54;
+	mov.b64 	{%r21, %temp}, %fd55;
 	}
-	and.b32  	%r21, %r20, 2147483647;
-	setp.ne.s32	%p36, %r21, 2146435072;
-	setp.ne.s32	%p37, %r19, 0;
-	or.pred  	%p38, %p36, %p37;
-	@!%p38 bra 	BB11_72;
-	bra.uni 	BB11_44;
+	setp.eq.s32	%p37, %r21, 0;
+	@%p37 bra 	BB22_76;
 
-BB11_44:
-	cvt.rmi.f64.f64	%fd54, %fd54;
-	bra.uni 	BB11_72;
+BB22_45:
+	cvt.rmi.f64.f64	%fd55, %fd55;
+	bra.uni 	BB22_76;
 
-BB11_59:
-	setp.gt.s32	%p58, %r8, -1;
-	@%p58 bra 	BB11_62;
+BB22_61:
+	setp.gt.s32	%p56, %r8, -1;
+	@%p56 bra 	BB22_64;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p59, %fd45, %fd2;
-	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
+	setp.neu.f64	%p57, %fd45, %fd2;
+	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
 
-BB11_62:
-	mov.f64 	%fd25, %fd52;
+BB22_64:
+	mov.f64 	%fd25, %fd53;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p62, %r41, 2146435072;
-	mov.f64 	%fd51, %fd25;
-	@%p62 bra 	BB11_69;
+	setp.ne.s32	%p60, %r41, 2146435072;
+	mov.f64 	%fd52, %fd25;
+	@%p60 bra 	BB22_73;
 
-	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd51, %fd26;
-	@%p63 bra 	BB11_69;
+	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
+	mov.f64 	%fd52, %fd26;
+	@%p61 bra 	BB22_73;
 
 	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
-	mov.f64 	%fd50, %fd26;
-	mov.f64 	%fd51, %fd50;
-	@%p64 bra 	BB11_69;
+	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd51, %fd26;
+	mov.f64 	%fd52, %fd51;
+	@%p62 bra 	BB22_73;
+
+	and.b32  	%r42, %r9, 2147483647;
+	setp.ne.s32	%p63, %r42, 2146435072;
+	@%p63 bra 	BB22_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r42, %temp}, %fd2;
+	mov.b64 	{%r43, %temp}, %fd2;
 	}
-	and.b32  	%r43, %r9, 2147483647;
-	setp.eq.s32	%p65, %r43, 2146435072;
-	setp.eq.s32	%p66, %r42, 0;
-	and.pred  	%p67, %p65, %p66;
-	@%p67 bra 	BB11_68;
-	bra.uni 	BB11_66;
-
-BB11_68:
-	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
-	selp.b32	%r51, 2146435072, 0, %p71;
-	xor.b32  	%r52, %r51, 2146435072;
-	setp.lt.s32	%p72, %r9, 0;
-	selp.b32	%r53, %r52, %r51, %p72;
-	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
-	selp.b32	%r54, 1072693248, %r53, %p73;
-	mov.u32 	%r55, 0;
-	mov.b64 	%fd51, {%r55, %r54};
-	bra.uni 	BB11_69;
-
-BB11_66:
+	setp.eq.s32	%p64, %r43, 0;
+	@%p64 bra 	BB22_72;
+
+BB22_69:
+	and.b32  	%r44, %r8, 2147483647;
+	setp.ne.s32	%p65, %r44, 2146435072;
+	mov.f64 	%fd49, %fd25;
+	mov.f64 	%fd52, %fd49;
+	@%p65 bra 	BB22_73;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r44, %temp}, %fd1;
+	mov.b64 	{%r45, %temp}, %fd1;
 	}
-	and.b32  	%r45, %r8, 2147483647;
-	setp.eq.s32	%p68, %r45, 2146435072;
-	setp.eq.s32	%p69, %r44, 0;
-	and.pred  	%p70, %p68, %p69;
-	mov.f64 	%fd51, %fd25;
-	@!%p70 bra 	BB11_69;
-	bra.uni 	BB11_67;
-
-BB11_67:
+	setp.ne.s32	%p66, %r45, 0;
+	mov.f64 	%fd52, %fd25;
+	@%p66 bra 	BB22_73;
+
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	selp.b32	%r48, -1048576, 2146435072, %p1;
-	add.s32 	%r49, %r48, %r47;
-	mov.u32 	%r50, 0;
-	mov.b64 	%fd51, {%r50, %r49};
-
-BB11_69:
-	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
-	or.pred  	%p76, %p75, %p74;
-	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
-
-BB11_72:
+	add.s32 	%r48, %r47, 2146435072;
+	or.b32  	%r49, %r48, -2147483648;
+	selp.b32	%r50, %r49, %r48, %p1;
+	mov.u32 	%r51, 0;
+	mov.b64 	%fd52, {%r51, %r50};
+	bra.uni 	BB22_73;
+
+BB22_72:
+	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
+	selp.b32	%r52, 2146435072, 0, %p67;
+	xor.b32  	%r53, %r52, 2146435072;
+	setp.lt.s32	%p68, %r9, 0;
+	selp.b32	%r54, %r53, %r52, %p68;
+	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
+	selp.b32	%r55, 1072693248, %r54, %p69;
+	mov.u32 	%r56, 0;
+	mov.b64 	%fd52, {%r56, %r55};
+
+BB22_73:
+	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+
+BB22_76:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd54;
+	st.global.f64 	[%rd18], %fd55;
 	bar.sync 	0;
 
-BB11_73:
+BB22_77:
 	ret;
 }
 
-	// .globl	matrix_scalar_op
-.visible .entry matrix_scalar_op(
-	.param .u64 matrix_scalar_op_param_0,
-	.param .f64 matrix_scalar_op_param_1,
-	.param .u64 matrix_scalar_op_param_2,
-	.param .u32 matrix_scalar_op_param_3,
-	.param .u32 matrix_scalar_op_param_4,
-	.param .u32 matrix_scalar_op_param_5
+	// .globl	matrix_matrix_cellwise_op_f
+.visible .entry matrix_matrix_cellwise_op_f(
+	.param .u64 matrix_matrix_cellwise_op_f_param_0,
+	.param .u64 matrix_matrix_cellwise_op_f_param_1,
+	.param .u64 matrix_matrix_cellwise_op_f_param_2,
+	.param .u32 matrix_matrix_cellwise_op_f_param_3,
+	.param .u32 matrix_matrix_cellwise_op_f_param_4,
+	.param .u32 matrix_matrix_cellwise_op_f_param_5,
+	.param .u32 matrix_matrix_cellwise_op_f_param_6,
+	.param .u32 matrix_matrix_cellwise_op_f_param_7
 )
 {
-	.reg .pred 	%p<141>;
-	.reg .b32 	%r<86>;
-	.reg .f64 	%fd<107>;
-	.reg .b64 	%rd<20>;
+	.reg .pred 	%p<76>;
+	.reg .f32 	%f<134>;
+	.reg .b32 	%r<51>;
+	.reg .b64 	%rd<17>;
 
 
-	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
-	ld.param.f64 	%fd68, [matrix_scalar_op_param_1];
-	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
-	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
-	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
-	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
-	mov.u32 	%r9, %ntid.x;
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %tid.x;
-	mad.lo.s32 	%r1, %r9, %r10, %r11;
-	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB12_130;
+	ld.param.u64 	%rd1, [matrix_matrix_cellwise_op_f_param_0];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_f_param_1];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_f_param_2];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_f_param_3];
+	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_4];
+	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_f_param_5];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_f_param_6];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_f_param_7];
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %ctaid.x;
+	mov.u32 	%r15, %tid.x;
+	mad.lo.s32 	%r16, %r13, %r14, %r15;
+	div.s32 	%r1, %r16, %r8;
+	rem.s32 	%r2, %r16, %r8;
+	setp.lt.s32	%p2, %r1, %r12;
+	setp.gt.s32	%p3, %r8, -1;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB23_71;
+	bra.uni 	BB23_1;
 
-	cvta.to.global.u64 	%rd6, %rd5;
-	cvta.to.global.u64 	%rd7, %rd4;
-	mul.wide.s32 	%rd8, %r1, 8;
-	add.s64 	%rd9, %rd7, %rd8;
-	ld.global.f64 	%fd1, [%rd9];
-	add.s64 	%rd1, %rd6, %rd8;
-	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB12_66;
+BB23_1:
+	mad.lo.s32 	%r3, %r1, %r8, %r2;
+	setp.eq.s32	%p5, %r9, 1;
+	mov.u32 	%r49, %r1;
+	@%p5 bra 	BB23_5;
 
-	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p5, %r6, 8;
-	@%p5 bra 	BB12_19;
+	setp.ne.s32	%p6, %r9, 2;
+	mov.u32 	%r50, %r3;
+	@%p6 bra 	BB23_4;
 
-	setp.gt.s32	%p19, %r6, 3;
-	@%p19 bra 	BB12_11;
+	mov.u32 	%r50, %r2;
 
-	setp.gt.s32	%p26, %r6, 1;
-	@%p26 bra 	BB12_8;
+BB23_4:
+	mov.u32 	%r44, %r50;
+	mov.u32 	%r4, %r44;
+	mov.u32 	%r49, %r4;
 
-	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB12_64;
-	bra.uni 	BB12_6;
+BB23_5:
+	mov.u32 	%r5, %r49;
+	setp.eq.s32	%p7, %r10, 1;
+	mov.u32 	%r47, %r1;
+	@%p7 bra 	BB23_9;
 
-BB12_64:
-	add.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+	setp.ne.s32	%p8, %r10, 2;
+	mov.u32 	%r48, %r3;
+	@%p8 bra 	BB23_8;
 
-BB12_66:
-	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p73, %r6, 8;
-	@%p73 bra 	BB12_83;
+	mov.u32 	%r48, %r2;
 
-	setp.gt.s32	%p87, %r6, 3;
-	@%p87 bra 	BB12_75;
+BB23_8:
+	mov.u32 	%r47, %r48;
 
-	setp.gt.s32	%p94, %r6, 1;
-	@%p94 bra 	BB12_72;
+BB23_9:
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvta.to.global.u64 	%rd5, %rd1;
+	mul.wide.s32 	%rd6, %r5, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f1, [%rd7];
+	mul.wide.s32 	%rd8, %r47, 4;
+	add.s64 	%rd9, %rd4, %rd8;
+	ld.global.f32 	%f2, [%rd9];
+	mov.f32 	%f133, 0f7F7FFFFF;
+	setp.gt.s32	%p9, %r11, 8;
+	@%p9 bra 	BB23_26;
 
-	setp.eq.s32	%p97, %r6, 0;
-	@%p97 bra 	BB12_128;
-	bra.uni 	BB12_70;
+	setp.gt.s32	%p23, %r11, 3;
+	@%p23 bra 	BB23_18;
 
-BB12_128:
-	add.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
+	setp.gt.s32	%p30, %r11, 1;
+	@%p30 bra 	BB23_15;
 
-BB12_19:
-	setp.gt.s32	%p6, %r6, 13;
-	@%p6 bra 	BB12_28;
+	setp.eq.s32	%p33, %r11, 0;
+	@%p33 bra 	BB23_69;
+	bra.uni 	BB23_13;
 
-	setp.gt.s32	%p13, %r6, 10;
-	@%p13 bra 	BB12_24;
+BB23_69:
+	add.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB12_46;
-	bra.uni 	BB12_22;
+BB23_26:
+	setp.gt.s32	%p10, %r11, 13;
+	@%p10 bra 	BB23_35;
 
-BB12_46:
-	setp.eq.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB12_65;
+	setp.gt.s32	%p17, %r11, 10;
+	@%p17 bra 	BB23_31;
 
-BB12_83:
-	setp.gt.s32	%p74, %r6, 13;
-	@%p74 bra 	BB12_92;
+	setp.eq.s32	%p21, %r11, 9;
+	@%p21 bra 	BB23_51;
+	bra.uni 	BB23_29;
+
+BB23_51:
+	setp.eq.f32	%p44, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p44;
+	bra.uni 	BB23_70;
+
+BB23_18:
+	setp.gt.s32	%p24, %r11, 5;
+	@%p24 bra 	BB23_22;
+
+	setp.eq.s32	%p28, %r11, 4;
+	@%p28 bra 	BB23_54;
+	bra.uni 	BB23_20;
+
+BB23_54:
+	mul.f32 	%f53, %f2, 0f3F000000;
+	cvt.rzi.f32.f32	%f54, %f53;
+	fma.rn.f32 	%f55, %f54, 0fC0000000, %f2;
+	abs.f32 	%f19, %f55;
+	abs.f32 	%f20, %f1;
+	setp.lt.f32	%p49, %f20, 0f00800000;
+	mul.f32 	%f56, %f20, 0f4B800000;
+	selp.f32	%f57, 0fC3170000, 0fC2FE0000, %p49;
+	selp.f32	%f58, %f56, %f20, %p49;
+	mov.b32 	 %r23, %f58;
+	and.b32  	%r24, %r23, 8388607;
+	or.b32  	%r25, %r24, 1065353216;
+	mov.b32 	 %f59, %r25;
+	shr.u32 	%r26, %r23, 23;
+	cvt.rn.f32.u32	%f60, %r26;
+	add.f32 	%f61, %f57, %f60;
+	setp.gt.f32	%p50, %f59, 0f3FB504F3;
+	mul.f32 	%f62, %f59, 0f3F000000;
+	add.f32 	%f63, %f61, 0f3F800000;
+	selp.f32	%f64, %f62, %f59, %p50;
+	selp.f32	%f65, %f63, %f61, %p50;
+	add.f32 	%f66, %f64, 0fBF800000;
+	add.f32 	%f50, %f64, 0f3F800000;
+	// inline asm
+	rcp.approx.ftz.f32 %f49,%f50;
+	// inline asm
+	add.f32 	%f67, %f66, %f66;
+	mul.f32 	%f68, %f49, %f67;
+	mul.f32 	%f69, %f68, %f68;
+	mov.f32 	%f70, 0f3C4CAF63;
+	mov.f32 	%f71, 0f3B18F0FE;
+	fma.rn.f32 	%f72, %f71, %f69, %f70;
+	mov.f32 	%f73, 0f3DAAAABD;
+	fma.rn.f32 	%f74, %f72, %f69, %f73;
+	mul.rn.f32 	%f75, %f74, %f69;
+	mul.rn.f32 	%f76, %f75, %f68;
+	sub.f32 	%f77, %f66, %f68;
+	neg.f32 	%f78, %f68;
+	add.f32 	%f79, %f77, %f77;
+	fma.rn.f32 	%f80, %f78, %f66, %f79;
+	mul.rn.f32 	%f81, %f49, %f80;
+	add.f32 	%f82, %f76, %f68;
+	sub.f32 	%f83, %f68, %f82;
+	add.f32 	%f84, %f76, %f83;
+	add.f32 	%f85, %f81, %f84;
+	add.f32 	%f86, %f82, %f85;
+	sub.f32 	%f87, %f82, %f86;
+	add.f32 	%f88, %f85, %f87;
+	mov.f32 	%f89, 0f3F317200;
+	mul.rn.f32 	%f90, %f65, %f89;
+	mov.f32 	%f91, 0f35BFBE8E;
+	mul.rn.f32 	%f92, %f65, %f91;
+	add.f32 	%f93, %f90, %f86;
+	sub.f32 	%f94, %f90, %f93;
+	add.f32 	%f95, %f86, %f94;
+	add.f32 	%f96, %f88, %f95;
+	add.f32 	%f97, %f92, %f96;
+	add.f32 	%f98, %f93, %f97;
+	sub.f32 	%f99, %f93, %f98;
+	add.f32 	%f100, %f97, %f99;
+	abs.f32 	%f21, %f2;
+	setp.gt.f32	%p51, %f21, 0f77F684DF;
+	mul.f32 	%f101, %f2, 0f39000000;
+	selp.f32	%f102, %f101, %f2, %p51;
+	mul.rn.f32 	%f103, %f102, %f98;
+	neg.f32 	%f104, %f103;
+	fma.rn.f32 	%f105, %f102, %f98, %f104;
+	fma.rn.f32 	%f106, %f102, %f100, %f105;
+	mov.f32 	%f107, 0f00000000;
+	fma.rn.f32 	%f108, %f107, %f98, %f106;
+	add.rn.f32 	%f109, %f103, %f108;
+	neg.f32 	%f110, %f109;
+	add.rn.f32 	%f111, %f103, %f110;
+	add.rn.f32 	%f112, %f111, %f108;
+	mov.b32 	 %r27, %f109;
+	setp.eq.s32	%p52, %r27, 1118925336;
+	add.s32 	%r28, %r27, -1;
+	mov.b32 	 %f113, %r28;
+	add.f32 	%f114, %f112, 0f37000000;
+	selp.f32	%f115, %f113, %f109, %p52;
+	selp.f32	%f22, %f114, %f112, %p52;
+	mul.f32 	%f116, %f115, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f117, %f116;
+	mov.f32 	%f118, 0fBF317200;
+	fma.rn.f32 	%f119, %f117, %f118, %f115;
+	mov.f32 	%f120, 0fB5BFBE8E;
+	fma.rn.f32 	%f121, %f117, %f120, %f119;
+	mul.f32 	%f52, %f121, 0f3FB8AA3B;
+	// inline asm
+	ex2.approx.ftz.f32 %f51,%f52;
+	// inline asm
+	add.f32 	%f122, %f117, 0f00000000;
+	ex2.approx.f32 	%f123, %f122;
+	mul.f32 	%f124, %f51, %f123;
+	setp.lt.f32	%p53, %f115, 0fC2D20000;
+	selp.f32	%f125, 0f00000000, %f124, %p53;
+	setp.gt.f32	%p54, %f115, 0f42D20000;
+	selp.f32	%f131, 0f7F800000, %f125, %p54;
+	setp.eq.f32	%p55, %f131, 0f7F800000;
+	@%p55 bra 	BB23_56;
+
+	fma.rn.f32 	%f131, %f131, %f22, %f131;
+
+BB23_56:
+	setp.lt.f32	%p56, %f1, 0f00000000;
+	setp.eq.f32	%p57, %f19, 0f3F800000;
+	and.pred  	%p1, %p56, %p57;
+	mov.b32 	 %r29, %f131;
+	xor.b32  	%r30, %r29, -2147483648;
+	mov.b32 	 %f126, %r30;
+	selp.f32	%f132, %f126, %f131, %p1;
+	setp.eq.f32	%p58, %f1, 0f00000000;
+	@%p58 bra 	BB23_59;
+	bra.uni 	BB23_57;
+
+BB23_59:
+	add.f32 	%f128, %f1, %f1;
+	mov.b32 	 %r31, %f128;
+	selp.b32	%r32, %r31, 0, %p57;
+	or.b32  	%r33, %r32, 2139095040;
+	setp.lt.f32	%p62, %f2, 0f00000000;
+	selp.b32	%r34, %r33, %r32, %p62;
+	mov.b32 	 %f132, %r34;
+	bra.uni 	BB23_60;
 
-	setp.gt.s32	%p81, %r6, 10;
-	@%p81 bra 	BB12_88;
+BB23_35:
+	setp.gt.s32	%p11, %r11, 15;
+	@%p11 bra 	BB23_39;
+
+	setp.eq.s32	%p15, %r11, 14;
+	@%p15 bra 	BB23_48;
+	bra.uni 	BB23_37;
+
+BB23_48:
+	cvt.rni.s64.f32	%rd10, %f1;
+	cvt.rni.s64.f32	%rd11, %f2;
+	cvt.u32.u64	%r17, %rd10;
+	cvt.u32.u64	%r18, %rd11;
+	or.b32  	%r19, %r18, %r17;
+	setp.eq.s32	%p41, %r19, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p41;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p85, %r6, 9;
-	@%p85 bra 	BB12_110;
-	bra.uni 	BB12_86;
+BB23_15:
+	setp.eq.s32	%p31, %r11, 2;
+	@%p31 bra 	BB23_68;
+	bra.uni 	BB23_16;
 
-BB12_110:
-	setp.eq.f64	%p114, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-	bra.uni 	BB12_129;
+BB23_68:
+	mul.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-BB12_11:
-	setp.gt.s32	%p20, %r6, 5;
-	@%p20 bra 	BB12_15;
+BB23_31:
+	setp.eq.s32	%p18, %r11, 11;
+	@%p18 bra 	BB23_50;
 
-	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB12_49;
-	bra.uni 	BB12_13;
+	setp.eq.s32	%p19, %r11, 12;
+	@%p19 bra 	BB23_49;
+	bra.uni 	BB23_33;
 
-BB12_49:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r2}, %fd68;
-	}
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r3}, %fd1;
-	}
-	bfe.u32 	%r24, %r3, 20, 11;
+BB23_49:
+	max.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_22:
+	setp.eq.s32	%p25, %r11, 6;
+	@%p25 bra 	BB23_53;
+
+	setp.eq.s32	%p26, %r11, 7;
+	@%p26 bra 	BB23_52;
+	bra.uni 	BB23_24;
+
+BB23_52:
+	setp.gt.f32	%p46, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p46;
+	bra.uni 	BB23_70;
+
+BB23_39:
+	setp.eq.s32	%p12, %r11, 16;
+	@%p12 bra 	BB23_47;
+
+	setp.eq.s32	%p13, %r11, 17;
+	@%p13 bra 	BB23_44;
+	bra.uni 	BB23_41;
+
+BB23_44:
+	setp.eq.f32	%p36, %f2, 0f00000000;
+	setp.eq.f32	%p37, %f2, 0f80000000;
+	or.pred  	%p38, %p36, %p37;
+	mov.f32 	%f133, 0f7FC00000;
+	@%p38 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f43, %f133;
+	setp.geu.f32	%p39, %f43, 0f7F800000;
+	@%p39 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f44, %f133;
+	mul.f32 	%f45, %f2, %f44;
+	sub.f32 	%f133, %f1, %f45;
+	bra.uni 	BB23_70;
+
+BB23_13:
+	setp.eq.s32	%p34, %r11, 1;
+	@%p34 bra 	BB23_14;
+	bra.uni 	BB23_70;
+
+BB23_14:
+	sub.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_29:
+	setp.eq.s32	%p22, %r11, 10;
+	@%p22 bra 	BB23_30;
+	bra.uni 	BB23_70;
+
+BB23_30:
+	setp.neu.f32	%p43, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p43;
+	bra.uni 	BB23_70;
+
+BB23_20:
+	setp.eq.s32	%p29, %r11, 5;
+	@%p29 bra 	BB23_21;
+	bra.uni 	BB23_70;
+
+BB23_21:
+	setp.lt.f32	%p48, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p48;
+	bra.uni 	BB23_70;
+
+BB23_37:
+	setp.eq.s32	%p16, %r11, 15;
+	@%p16 bra 	BB23_38;
+	bra.uni 	BB23_70;
+
+BB23_38:
+	mul.f32 	%f47, %f1, %f2;
+	mov.f32 	%f48, 0f3F800000;
+	sub.f32 	%f133, %f48, %f47;
+	bra.uni 	BB23_70;
+
+BB23_16:
+	setp.eq.s32	%p32, %r11, 3;
+	@%p32 bra 	BB23_17;
+	bra.uni 	BB23_70;
+
+BB23_17:
+	div.rn.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_50:
+	min.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_33:
+	setp.eq.s32	%p20, %r11, 13;
+	@%p20 bra 	BB23_34;
+	bra.uni 	BB23_70;
+
+BB23_34:
+	cvt.rni.s64.f32	%rd12, %f1;
+	cvt.rni.s64.f32	%rd13, %f2;
+	cvt.u32.u64	%r20, %rd12;
+	cvt.u32.u64	%r21, %rd13;
+	and.b32  	%r22, %r21, %r20;
+	setp.eq.s32	%p42, %r22, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p42;
+	bra.uni 	BB23_70;
+
+BB23_53:
+	setp.gtu.f32	%p47, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p47;
+	bra.uni 	BB23_70;
+
+BB23_24:
+	setp.eq.s32	%p27, %r11, 8;
+	@%p27 bra 	BB23_25;
+	bra.uni 	BB23_70;
+
+BB23_25:
+	setp.ltu.f32	%p45, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p45;
+	bra.uni 	BB23_70;
+
+BB23_47:
+	setp.neu.f32	%p40, %f1, 0f00000000;
+	sub.f32 	%f46, %f1, %f2;
+	selp.f32	%f133, %f46, 0f00000000, %p40;
+	bra.uni 	BB23_70;
+
+BB23_41:
+	setp.ne.s32	%p14, %r11, 18;
+	@%p14 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f41, %f133;
+	setp.geu.f32	%p35, %f41, 0f7F800000;
+	@%p35 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f133, %f133;
+	bra.uni 	BB23_70;
+
+BB23_57:
+	setp.geu.f32	%p59, %f1, 0f00000000;
+	@%p59 bra 	BB23_60;
+
+	cvt.rzi.f32.f32	%f127, %f2;
+	setp.neu.f32	%p60, %f127, %f2;
+	selp.f32	%f132, 0f7FFFFFFF, %f132, %p60;
+
+BB23_60:
+	add.f32 	%f129, %f20, %f21;
+	mov.b32 	 %r35, %f129;
+	setp.lt.s32	%p63, %r35, 2139095040;
+	@%p63 bra 	BB23_67;
+
+	setp.gtu.f32	%p64, %f20, 0f7F800000;
+	setp.gtu.f32	%p65, %f21, 0f7F800000;
+	or.pred  	%p66, %p64, %p65;
+	@%p66 bra 	BB23_66;
+	bra.uni 	BB23_62;
+
+BB23_66:
+	add.f32 	%f132, %f1, %f2;
+	bra.uni 	BB23_67;
+
+BB23_62:
+	setp.eq.f32	%p67, %f21, 0f7F800000;
+	@%p67 bra 	BB23_65;
+	bra.uni 	BB23_63;
+
+BB23_65:
+	setp.gt.f32	%p70, %f20, 0f3F800000;
+	selp.b32	%r39, 2139095040, 0, %p70;
+	xor.b32  	%r40, %r39, 2139095040;
+	setp.lt.f32	%p71, %f2, 0f00000000;
+	selp.b32	%r41, %r40, %r39, %p71;
+	mov.b32 	 %f130, %r41;
+	setp.eq.f32	%p72, %f1, 0fBF800000;
+	selp.f32	%f132, 0f3F800000, %f130, %p72;
+	bra.uni 	BB23_67;
+
+BB23_63:
+	setp.neu.f32	%p68, %f20, 0f7F800000;
+	@%p68 bra 	BB23_67;
+
+	setp.ltu.f32	%p69, %f2, 0f00000000;
+	selp.b32	%r36, 0, 2139095040, %p69;
+	or.b32  	%r37, %r36, -2147483648;
+	selp.b32	%r38, %r37, %r36, %p1;
+	mov.b32 	 %f132, %r38;
+
+BB23_67:
+	setp.eq.f32	%p73, %f2, 0f00000000;
+	setp.eq.f32	%p74, %f1, 0f3F800000;
+	or.pred  	%p75, %p74, %p73;
+	selp.f32	%f133, 0f3F800000, %f132, %p75;
+
+BB23_70:
+	cvta.to.global.u64 	%rd14, %rd3;
+	mul.wide.s32 	%rd15, %r3, 4;
+	add.s64 	%rd16, %rd14, %rd15;
+	st.global.f32 	[%rd16], %f133;
+	bar.sync 	0;
+
+BB23_71:
+	ret;
+}
+
+	// .globl	matrix_scalar_op_d
+.visible .entry matrix_scalar_op_d(
+	.param .u64 matrix_scalar_op_d_param_0,
+	.param .f64 matrix_scalar_op_d_param_1,
+	.param .u64 matrix_scalar_op_d_param_2,
+	.param .u32 matrix_scalar_op_d_param_3,
+	.param .u32 matrix_scalar_op_d_param_4,
+	.param .u32 matrix_scalar_op_d_param_5
+)
+{
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<88>;
+	.reg .f64 	%fd<109>;
+	.reg .b64 	%rd<20>;
+
+
+	ld.param.u64 	%rd4, [matrix_scalar_op_d_param_0];
+	ld.param.f64 	%fd68, [matrix_scalar_op_d_param_1];
+	ld.param.u64 	%rd5, [matrix_scalar_op_d_param_2];
+	ld.param.u32 	%r8, [matrix_scalar_op_d_param_3];
+	ld.param.u32 	%r6, [matrix_scalar_op_d_param_4];
+	ld.param.u32 	%r7, [matrix_scalar_op_d_param_5];
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %ctaid.x;
+	mov.u32 	%r11, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r10, %r11;
+	setp.ge.s32	%p3, %r1, %r8;
+	@%p3 bra 	BB24_138;
+
+	cvta.to.global.u64 	%rd6, %rd5;
+	cvta.to.global.u64 	%rd7, %rd4;
+	mul.wide.s32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f64 	%fd1, [%rd9];
+	add.s64 	%rd1, %rd6, %rd8;
+	setp.eq.s32	%p4, %r7, 0;
+	@%p4 bra 	BB24_70;
+
+	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p5, %r6, 8;
+	@%p5 bra 	BB24_19;
+
+	setp.gt.s32	%p19, %r6, 3;
+	@%p19 bra 	BB24_11;
+
+	setp.gt.s32	%p26, %r6, 1;
+	@%p26 bra 	BB24_8;
+
+	setp.eq.s32	%p29, %r6, 0;
+	@%p29 bra 	BB24_68;
+	bra.uni 	BB24_6;
+
+BB24_68:
+	add.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
+
+BB24_70:
+	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p69, %r6, 8;
+	@%p69 bra 	BB24_87;
+
+	setp.gt.s32	%p83, %r6, 3;
+	@%p83 bra 	BB24_79;
+
+	setp.gt.s32	%p90, %r6, 1;
+	@%p90 bra 	BB24_76;
+
+	setp.eq.s32	%p93, %r6, 0;
+	@%p93 bra 	BB24_136;
+	bra.uni 	BB24_74;
+
+BB24_136:
+	add.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_19:
+	setp.gt.s32	%p6, %r6, 13;
+	@%p6 bra 	BB24_28;
+
+	setp.gt.s32	%p13, %r6, 10;
+	@%p13 bra 	BB24_24;
+
+	setp.eq.s32	%p17, %r6, 9;
+	@%p17 bra 	BB24_48;
+	bra.uni 	BB24_22;
+
+BB24_48:
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB24_69;
+
+BB24_87:
+	setp.gt.s32	%p70, %r6, 13;
+	@%p70 bra 	BB24_96;
+
+	setp.gt.s32	%p77, %r6, 10;
+	@%p77 bra 	BB24_92;
+
+	setp.eq.s32	%p81, %r6, 9;
+	@%p81 bra 	BB24_116;
+	bra.uni 	BB24_90;
+
+BB24_116:
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB24_137;
+
+BB24_11:
+	setp.gt.s32	%p20, %r6, 5;
+	@%p20 bra 	BB24_15;
+
+	setp.eq.s32	%p24, %r6, 4;
+	@%p24 bra 	BB24_51;
+	bra.uni 	BB24_13;
+
+BB24_51:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd68;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd1;
+	}
+	bfe.u32 	%r24, %r3, 20, 11;
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p51, %rd2, -9223372036854775808;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1310,69 +2392,69 @@ BB12_49:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd97, [retval0+0];
+	ld.param.f64	%fd98, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p52, %r2, 0;
-	and.pred  	%p1, %p52, %p51;
-	@!%p1 bra 	BB12_51;
-	bra.uni 	BB12_50;
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;
+	@!%p1 bra 	BB24_53;
+	bra.uni 	BB24_52;
 
-BB12_50:
+BB24_52:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd97;
+	mov.b64 	{%temp, %r26}, %fd98;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd97;
+	mov.b64 	{%r28, %temp}, %fd98;
 	}
-	mov.b64 	%fd97, {%r28, %r27};
+	mov.b64 	%fd98, {%r28, %r27};
 
-BB12_51:
-	mov.f64 	%fd96, %fd97;
-	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
-	@%p53 bra 	BB12_54;
-	bra.uni 	BB12_52;
+BB24_53:
+	mov.f64 	%fd97, %fd98;
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB24_56;
+	bra.uni 	BB24_54;
 
-BB12_54:
-	selp.b32	%r29, %r2, 0, %p51;
+BB24_56:
+	selp.b32	%r29, %r2, 0, %p49;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p57, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p57;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p55;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd96, {%r32, %r31};
-	bra.uni 	BB12_55;
+	mov.b64 	%fd97, {%r32, %r31};
+	bra.uni 	BB24_57;
 
-BB12_28:
+BB24_28:
 	setp.gt.s32	%p7, %r6, 15;
-	@%p7 bra 	BB12_32;
+	@%p7 bra 	BB24_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB12_43;
-	bra.uni 	BB12_30;
+	@%p11 bra 	BB24_45;
+	bra.uni 	BB24_30;
 
-BB12_43:
+BB24_45:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p43, %r20, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p41, %r20, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB24_69;
 
-BB12_75:
-	setp.gt.s32	%p88, %r6, 5;
-	@%p88 bra 	BB12_79;
+BB24_79:
+	setp.gt.s32	%p84, %r6, 5;
+	@%p84 bra 	BB24_83;
 
-	setp.eq.s32	%p92, %r6, 4;
-	@%p92 bra 	BB12_113;
-	bra.uni 	BB12_77;
+	setp.eq.s32	%p88, %r6, 4;
+	@%p88 bra 	BB24_119;
+	bra.uni 	BB24_81;
 
-BB12_113:
+BB24_119:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1381,11 +2463,11 @@ BB12_113:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r61, %r5, 20, 11;
-	add.s32 	%r62, %r61, -1012;
+	bfe.u32 	%r62, %r5, 20, 11;
+	add.s32 	%r63, %r62, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r62;
-	setp.eq.s64	%p119, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r63;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1402,621 +2484,1482 @@ BB12_113:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd105, [retval0+0];
+	ld.param.f64	%fd107, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p120, %r4, 0;
-	and.pred  	%p2, %p120, %p119;
-	@!%p2 bra 	BB12_115;
-	bra.uni 	BB12_114;
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;
+	@!%p2 bra 	BB24_121;
+	bra.uni 	BB24_120;
 
-BB12_114:
+BB24_120:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r63}, %fd105;
+	mov.b64 	{%temp, %r64}, %fd107;
 	}
-	xor.b32  	%r64, %r63, -2147483648;
+	xor.b32  	%r65, %r64, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r65, %temp}, %fd105;
+	mov.b64 	{%r66, %temp}, %fd107;
 	}
-	mov.b64 	%fd105, {%r65, %r64};
-
-BB12_115:
-	mov.f64 	%fd104, %fd105;
-	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
-	@%p121 bra 	BB12_118;
-	bra.uni 	BB12_116;
-
-BB12_118:
-	selp.b32	%r66, %r4, 0, %p119;
-	or.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p125, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p125;
-	mov.u32 	%r69, 0;
-	mov.b64 	%fd104, {%r69, %r68};
-	bra.uni 	BB12_119;
-
-BB12_92:
-	setp.gt.s32	%p75, %r6, 15;
-	@%p75 bra 	BB12_96;
-
-	setp.eq.s32	%p79, %r6, 14;
-	@%p79 bra 	BB12_107;
-	bra.uni 	BB12_94;
-
-BB12_107:
+	mov.b64 	%fd107, {%r66, %r65};
+
+BB24_121:
+	mov.f64 	%fd106, %fd107;
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB24_124;
+	bra.uni 	BB24_122;
+
+BB24_124:
+	selp.b32	%r67, %r4, 0, %p113;
+	or.b32  	%r68, %r67, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r69, %r68, %r67, %p119;
+	mov.u32 	%r70, 0;
+	mov.b64 	%fd106, {%r70, %r69};
+	bra.uni 	BB24_125;
+
+BB24_96:
+	setp.gt.s32	%p71, %r6, 15;
+	@%p71 bra 	BB24_100;
+
+	setp.eq.s32	%p75, %r6, 14;
+	@%p75 bra 	BB24_113;
+	bra.uni 	BB24_98;
+
+BB24_113:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r55, %rd15;
-	cvt.u32.u64	%r56, %rd16;
-	or.b32  	%r57, %r56, %r55;
-	setp.eq.s32	%p111, %r57, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-	bra.uni 	BB12_129;
-
-BB12_8:
+	cvt.u32.u64	%r56, %rd15;
+	cvt.u32.u64	%r57, %rd16;
+	or.b32  	%r58, %r57, %r56;
+	setp.eq.s32	%p105, %r58, 0;
+	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB24_137;
+
+BB24_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB12_63;
-	bra.uni 	BB12_9;
+	@%p27 bra 	BB24_67;
+	bra.uni 	BB24_9;
 
-BB12_63:
-	mul.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+BB24_67:
+	mul.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
 
-BB12_24:
+BB24_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB12_45;
+	@%p14 bra 	BB24_47;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB12_44;
-	bra.uni 	BB12_26;
+	@%p15 bra 	BB24_46;
+	bra.uni 	BB24_26;
 
-BB12_44:
-	max.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_46:
+	max.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_15:
+BB24_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB12_48;
+	@%p21 bra 	BB24_50;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB12_47;
-	bra.uni 	BB12_17;
+	@%p22 bra 	BB24_49;
+	bra.uni 	BB24_17;
 
-BB12_47:
-	setp.lt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB12_65;
+BB24_49:
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB24_69;
 
-BB12_32:
+BB24_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB12_42;
+	@%p8 bra 	BB24_44;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB12_38;
-	bra.uni 	BB12_34;
+	@%p9 bra 	BB24_39;
+	bra.uni 	BB24_34;
 
-BB12_38:
-	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
-	or.pred  	%p37, %p35, %p36;
-	mov.f64 	%fd98, 0d7FF8000000000000;
-	@%p37 bra 	BB12_65;
+BB24_39:
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd99, 0d7FF8000000000000;
+	@%p36 bra 	BB24_69;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd72, %fd98;
-	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
-	@%p38 bra 	BB12_65;
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	abs.f64 	%fd72, %fd99;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
+	@%p37 bra 	BB24_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r15, %temp}, %fd98;
+	mov.b64 	{%temp, %r15}, %fd99;
 	}
+	and.b32  	%r16, %r15, 2147483647;
+	setp.ne.s32	%p38, %r16, 2146435072;
+	@%p38 bra 	BB24_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r16}, %fd98;
+	mov.b64 	{%r17, %temp}, %fd99;
 	}
-	and.b32  	%r17, %r16, 2147483647;
-	setp.ne.s32	%p39, %r17, 2146435072;
-	setp.ne.s32	%p40, %r15, 0;
-	or.pred  	%p41, %p39, %p40;
-	@!%p41 bra 	BB12_65;
-	bra.uni 	BB12_41;
-
-BB12_41:
-	cvt.rmi.f64.f64	%fd73, %fd98;
+	setp.eq.s32	%p39, %r17, 0;
+	@%p39 bra 	BB24_69;
+
+BB24_43:
+	cvt.rmi.f64.f64	%fd73, %fd99;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd98, %fd68, %fd74;
-	bra.uni 	BB12_65;
-
-BB12_72:
-	setp.eq.s32	%p95, %r6, 2;
-	@%p95 bra 	BB12_127;
-	bra.uni 	BB12_73;
-
-BB12_127:
-	mul.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_88:
-	setp.eq.s32	%p82, %r6, 11;
-	@%p82 bra 	BB12_109;
-
-	setp.eq.s32	%p83, %r6, 12;
-	@%p83 bra 	BB12_108;
-	bra.uni 	BB12_90;
-
-BB12_108:
-	max.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_79:
-	setp.eq.s32	%p89, %r6, 6;
-	@%p89 bra 	BB12_112;
-
-	setp.eq.s32	%p90, %r6, 7;
-	@%p90 bra 	BB12_111;
-	bra.uni 	BB12_81;
-
-BB12_111:
-	setp.gt.f64	%p116, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-	bra.uni 	BB12_129;
-
-BB12_96:
-	setp.eq.s32	%p76, %r6, 16;
-	@%p76 bra 	BB12_106;
-
-	setp.eq.s32	%p77, %r6, 17;
-	@%p77 bra 	BB12_102;
-	bra.uni 	BB12_98;
-
-BB12_102:
-	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
-	or.pred  	%p105, %p103, %p104;
-	mov.f64 	%fd106, 0d7FF8000000000000;
-	@%p105 bra 	BB12_129;
-
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd83, %fd106;
-	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
-	@%p106 bra 	BB12_129;
+	sub.f64 	%fd99, %fd68, %fd74;
+	bra.uni 	BB24_69;
+
+BB24_76:
+	setp.eq.s32	%p91, %r6, 2;
+	@%p91 bra 	BB24_135;
+	bra.uni 	BB24_77;
+
+BB24_135:
+	mul.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_92:
+	setp.eq.s32	%p78, %r6, 11;
+	@%p78 bra 	BB24_115;
+
+	setp.eq.s32	%p79, %r6, 12;
+	@%p79 bra 	BB24_114;
+	bra.uni 	BB24_94;
+
+BB24_114:
+	max.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_83:
+	setp.eq.s32	%p85, %r6, 6;
+	@%p85 bra 	BB24_118;
+
+	setp.eq.s32	%p86, %r6, 7;
+	@%p86 bra 	BB24_117;
+	bra.uni 	BB24_85;
+
+BB24_117:
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB24_137;
+
+BB24_100:
+	setp.eq.s32	%p72, %r6, 16;
+	@%p72 bra 	BB24_112;
+
+	setp.eq.s32	%p73, %r6, 17;
+	@%p73 bra 	BB24_107;
+	bra.uni 	BB24_102;
+
+BB24_107:
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd108, 0d7FF8000000000000;
+	@%p100 bra 	BB24_137;
+
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	abs.f64 	%fd83, %fd108;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
+	@%p101 bra 	BB24_137;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd106;
+	mov.b64 	{%temp, %r53}, %fd108;
 	}
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p102, %r54, 2146435072;
+	@%p102 bra 	BB24_111;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd106;
+	mov.b64 	{%r55, %temp}, %fd108;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p107, %r54, 2146435072;
-	setp.ne.s32	%p108, %r52, 0;
-	or.pred  	%p109, %p107, %p108;
-	@!%p109 bra 	BB12_129;
-	bra.uni 	BB12_105;
-
-BB12_105:
-	cvt.rmi.f64.f64	%fd84, %fd106;
+	setp.eq.s32	%p103, %r55, 0;
+	@%p103 bra 	BB24_137;
+
+BB24_111:
+	cvt.rmi.f64.f64	%fd84, %fd108;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd106, %fd1, %fd85;
-	bra.uni 	BB12_129;
+	sub.f64 	%fd108, %fd1, %fd85;
+	bra.uni 	BB24_137;
 
-BB12_6:
+BB24_6:
 	setp.eq.s32	%p30, %r6, 1;
-	@%p30 bra 	BB12_7;
-	bra.uni 	BB12_65;
+	@%p30 bra 	BB24_7;
+	bra.uni 	BB24_69;
 
-BB12_7:
-	sub.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_7:
+	sub.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_22:
+BB24_22:
 	setp.eq.s32	%p18, %r6, 10;
-	@%p18 bra 	BB12_23;
-	bra.uni 	BB12_65;
+	@%p18 bra 	BB24_23;
+	bra.uni 	BB24_69;
 
-BB12_23:
-	setp.neu.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB12_65;
+BB24_23:
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB24_69;
 
-BB12_13:
+BB24_13:
 	setp.eq.s32	%p25, %r6, 5;
-	@%p25 bra 	BB12_14;
-	bra.uni 	BB12_65;
+	@%p25 bra 	BB24_14;
+	bra.uni 	BB24_69;
 
-BB12_14:
-	setp.gt.f64	%p50, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB12_65;
+BB24_14:
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB24_69;
 
-BB12_30:
+BB24_30:
 	setp.eq.s32	%p12, %r6, 15;
-	@%p12 bra 	BB12_31;
-	bra.uni 	BB12_65;
+	@%p12 bra 	BB24_31;
+	bra.uni 	BB24_69;
 
-BB12_31:
+BB24_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd98, %fd77, %fd76;
-	bra.uni 	BB12_65;
+	sub.f64 	%fd99, %fd77, %fd76;
+	bra.uni 	BB24_69;
 
-BB12_9:
+BB24_9:
 	setp.eq.s32	%p28, %r6, 3;
-	@%p28 bra 	BB12_10;
-	bra.uni 	BB12_65;
+	@%p28 bra 	BB24_10;
+	bra.uni 	BB24_69;
 
-BB12_10:
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_10:
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_45:
-	min.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_47:
+	min.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_26:
+BB24_26:
 	setp.eq.s32	%p16, %r6, 13;
-	@%p16 bra 	BB12_27;
-	bra.uni 	BB12_65;
+	@%p16 bra 	BB24_27;
+	bra.uni 	BB24_69;
 
-BB12_27:
+BB24_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p44, %r23, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p42, %r23, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB24_69;
 
-BB12_48:

<TRUNCATED>