You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/13 23:04:18 UTC

[2/3] systemml git commit: [SYSTEMML-445] Improved performance of GPU right indexing

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index caa3fc7..54b53b9 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
@@ -35,11 +35,12 @@
 	.param .u32 slice_sparse_dense_param_4,
 	.param .u32 slice_sparse_dense_param_5,
 	.param .u32 slice_sparse_dense_param_6,
-	.param .u32 slice_sparse_dense_param_7
+	.param .u32 slice_sparse_dense_param_7,
+	.param .u32 slice_sparse_dense_param_8
 )
 {
 	.reg .pred 	%p<7>;
-	.reg .b32 	%r<26>;
+	.reg .b32 	%r<24>;
 	.reg .f64 	%fd<2>;
 	.reg .b64 	%rd<23>;
 
@@ -48,38 +49,36 @@
 	ld.param.u64 	%rd10, [slice_sparse_dense_param_1];
 	ld.param.u64 	%rd11, [slice_sparse_dense_param_2];
 	ld.param.u64 	%rd12, [slice_sparse_dense_param_3];
-	ld.param.u32 	%r14, [slice_sparse_dense_param_4];
-	ld.param.u32 	%r15, [slice_sparse_dense_param_5];
+	ld.param.u32 	%r15, [slice_sparse_dense_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_param_5];
 	ld.param.u32 	%r12, [slice_sparse_dense_param_6];
 	ld.param.u32 	%r13, [slice_sparse_dense_param_7];
-	mov.u32 	%r16, %ntid.x;
-	mov.u32 	%r17, %ctaid.x;
-	mov.u32 	%r18, %tid.x;
-	mad.lo.s32 	%r1, %r16, %r17, %r18;
-	add.s32 	%r2, %r1, %r14;
-	setp.gt.s32	%p1, %r2, %r15;
+	ld.param.u32 	%r14, [slice_sparse_dense_param_8];
+	mov.u32 	%r17, %ntid.x;
+	mov.u32 	%r18, %ctaid.x;
+	mov.u32 	%r19, %tid.x;
+	mad.lo.s32 	%r1, %r17, %r18, %r19;
+	add.s32 	%r2, %r1, %r15;
+	setp.gt.s32	%p1, %r2, %r16;
 	@%p1 bra 	BB0_6;
 
 	cvta.to.global.u64 	%rd13, %rd10;
 	mul.wide.s32 	%rd14, %r2, 4;
 	add.s64 	%rd1, %rd13, %rd14;
-	ld.global.u32 	%r25, [%rd1];
-	ld.global.u32 	%r24, [%rd1+4];
-	setp.ge.s32	%p2, %r25, %r24;
+	ld.global.u32 	%r23, [%rd1];
+	ld.global.u32 	%r22, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r22;
 	@%p2 bra 	BB0_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
-	cvta.to.global.u64 	%rd15, %rd11;
-	mov.u32 	%r19, 1;
-	sub.s32 	%r20, %r19, %r12;
-	add.s32 	%r21, %r20, %r13;
-	mul.lo.s32 	%r22, %r1, %r21;
-	sub.s32 	%r5, %r22, %r12;
-	cvta.to.global.u64 	%rd16, %rd9;
-	mul.wide.s32 	%rd17, %r25, 8;
-	add.s64 	%rd22, %rd16, %rd17;
-	mul.wide.s32 	%rd18, %r25, 4;
-	add.s64 	%rd21, %rd15, %rd18;
+	cvta.to.global.u64 	%rd15, %rd9;
+	cvta.to.global.u64 	%rd16, %rd11;
+	mul.lo.s32 	%r20, %r1, %r14;
+	sub.s32 	%r5, %r20, %r12;
+	mul.wide.s32 	%rd17, %r23, 8;
+	add.s64 	%rd22, %rd15, %rd17;
+	mul.wide.s32 	%rd18, %r23, 4;
+	add.s64 	%rd21, %rd16, %rd18;
 
 BB0_3:
 	ld.global.u32 	%r8, [%rd21];
@@ -89,23 +88,87 @@ BB0_3:
 	@%p5 bra 	BB0_5;
 
 	ld.global.f64 	%fd1, [%rd22];
-	add.s32 	%r23, %r5, %r8;
-	mul.wide.s32 	%rd19, %r23, 8;
+	add.s32 	%r21, %r5, %r8;
+	mul.wide.s32 	%rd19, %r21, 8;
 	add.s64 	%rd20, %rd2, %rd19;
 	st.global.f64 	[%rd20], %fd1;
-	ld.global.u32 	%r24, [%rd1+4];
+	ld.global.u32 	%r22, [%rd1+4];
 
 BB0_5:
 	add.s64 	%rd22, %rd22, 8;
 	add.s64 	%rd21, %rd21, 4;
-	add.s32 	%r25, %r25, 1;
-	setp.lt.s32	%p6, %r25, %r24;
+	add.s32 	%r23, %r23, 1;
+	setp.lt.s32	%p6, %r23, %r22;
 	@%p6 bra 	BB0_3;
 
 BB0_6:
 	ret;
 }
 
+	// .globl	slice_dense_dense
+.visible .entry slice_dense_dense(
+	.param .u64 slice_dense_dense_param_0,
+	.param .u64 slice_dense_dense_param_1,
+	.param .u32 slice_dense_dense_param_2,
+	.param .u32 slice_dense_dense_param_3,
+	.param .u32 slice_dense_dense_param_4,
+	.param .u32 slice_dense_dense_param_5,
+	.param .u32 slice_dense_dense_param_6,
+	.param .u32 slice_dense_dense_param_7
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<22>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<15>;
+
+
+	ld.param.u64 	%rd7, [slice_dense_dense_param_0];
+	ld.param.u64 	%rd8, [slice_dense_dense_param_1];
+	ld.param.u32 	%r9, [slice_dense_dense_param_2];
+	ld.param.u32 	%r13, [slice_dense_dense_param_3];
+	ld.param.u32 	%r10, [slice_dense_dense_param_4];
+	ld.param.u32 	%r11, [slice_dense_dense_param_6];
+	ld.param.u32 	%r12, [slice_dense_dense_param_7];
+	mov.u32 	%r1, %ntid.x;
+	mov.u32 	%r2, %ctaid.x;
+	mov.u32 	%r3, %tid.x;
+	mad.lo.s32 	%r4, %r1, %r2, %r3;
+	add.s32 	%r14, %r4, %r9;
+	setp.gt.s32	%p1, %r14, %r13;
+	@%p1 bra 	BB1_4;
+
+	mul.lo.s32 	%r21, %r4, %r12;
+	setp.lt.s32	%p2, %r12, 1;
+	@%p2 bra 	BB1_4;
+
+	cvta.to.global.u64 	%rd9, %rd8;
+	cvta.to.global.u64 	%rd10, %rd7;
+	add.s32 	%r6, %r21, %r12;
+	mul.lo.s32 	%r15, %r1, %r2;
+	add.s32 	%r16, %r3, %r15;
+	mul.lo.s32 	%r17, %r12, %r16;
+	mul.wide.s32 	%rd11, %r17, 8;
+	add.s64 	%rd14, %rd9, %rd11;
+	add.s32 	%r18, %r3, %r9;
+	add.s32 	%r19, %r18, %r15;
+	mad.lo.s32 	%r20, %r11, %r19, %r10;
+	mul.wide.s32 	%rd12, %r20, 8;
+	add.s64 	%rd13, %rd10, %rd12;
+
+BB1_3:
+	ld.global.f64 	%fd1, [%rd13];
+	st.global.f64 	[%rd14], %fd1;
+	add.s64 	%rd14, %rd14, 8;
+	add.s64 	%rd13, %rd13, 8;
+	add.s32 	%r21, %r21, 1;
+	setp.lt.s32	%p3, %r21, %r6;
+	@%p3 bra 	BB1_3;
+
+BB1_4:
+	ret;
+}
+
 	// .globl	copy_u2l_dense
 .visible .entry copy_u2l_dense(
 	.param .u64 copy_u2l_dense_param_0,
@@ -132,10 +195,10 @@ BB0_6:
 	setp.gt.s32	%p1, %r9, %r8;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB1_2;
-	bra.uni 	BB1_1;
+	@!%p3 bra 	BB2_2;
+	bra.uni 	BB2_1;
 
-BB1_1:
+BB2_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
@@ -144,7 +207,7 @@ BB1_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB1_2:
+BB2_2:
 	ret;
 }
 
@@ -174,10 +237,10 @@ BB1_2:
 	setp.lt.s32	%p1, %r2, %r4;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB2_2;
-	bra.uni 	BB2_1;
+	@!%p3 bra 	BB3_2;
+	bra.uni 	BB3_1;
 
-BB2_1:
+BB3_1:
 	rem.s32 	%r8, %r1, %r3;
 	cvta.to.global.u64 	%rd3, %rd1;
 	mad.lo.s32 	%r9, %r2, %r3, %r8;
@@ -190,7 +253,7 @@ BB2_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB2_2:
+BB3_2:
 	ret;
 }
 
@@ -222,10 +285,10 @@ BB2_2:
 	setp.lt.s32	%p1, %r2, %r4;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB3_4;
-	bra.uni 	BB3_1;
+	@!%p3 bra 	BB4_4;
+	bra.uni 	BB4_1;
 
-BB3_1:
+BB4_1:
 	rem.s32 	%r8, %r1, %r3;
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r9, %r2, %r3, %r8;
@@ -235,20 +298,20 @@ BB3_1:
 	ld.global.f64 	%fd4, [%rd7];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB3_3;
+	@%p4 bra 	BB4_3;
 
 	cvta.to.global.u64 	%rd8, %rd3;
 	shl.b64 	%rd9, %rd1, 3;
 	add.s64 	%rd10, %rd8, %rd9;
 	ld.global.f64 	%fd5, [%rd10];
 
-BB3_3:
+BB4_3:
 	cvta.to.global.u64 	%rd11, %rd4;
 	shl.b64 	%rd12, %rd1, 3;
 	add.s64 	%rd13, %rd11, %rd12;
 	st.global.f64 	[%rd13], %fd5;
 
-BB3_4:
+BB4_4:
 	ret;
 }
 
@@ -278,10 +341,10 @@ BB3_4:
 	setp.lt.s32	%p1, %r2, %r4;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB4_2;
-	bra.uni 	BB4_1;
+	@!%p3 bra 	BB5_2;
+	bra.uni 	BB5_1;
 
-BB4_1:
+BB5_1:
 	rem.s32 	%r8, %r1, %r3;
 	cvta.to.global.u64 	%rd3, %rd1;
 	mad.lo.s32 	%r9, %r2, %r3, %r8;
@@ -294,7 +357,7 @@ BB4_1:
 	add.f64 	%fd3, %fd2, %fd1;
 	st.global.f64 	[%rd7], %fd3;
 
-BB4_2:
+BB5_2:
 	ret;
 }
 
@@ -328,10 +391,10 @@ BB4_2:
 	setp.lt.s32	%p1, %r2, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB5_2;
-	bra.uni 	BB5_1;
+	@!%p3 bra 	BB6_2;
+	bra.uni 	BB6_1;
 
-BB5_1:
+BB6_1:
 	rem.s32 	%r9, %r1, %r3;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r10, %r2, %r3, %r9;
@@ -348,7 +411,7 @@ BB5_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB5_2:
+BB6_2:
 	ret;
 }
 
@@ -387,10 +450,10 @@ BB5_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB6_4;
-	bra.uni 	BB6_1;
+	@!%p3 bra 	BB7_4;
+	bra.uni 	BB7_1;
 
-BB6_1:
+BB7_1:
 	cvta.to.global.u64 	%rd6, %rd4;
 	mad.lo.s32 	%r10, %r1, %r3, %r2;
 	cvta.to.global.u64 	%rd7, %rd3;
@@ -399,25 +462,25 @@ BB6_1:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd2, %rd6, %rd8;
 	setp.eq.s32	%p4, %r4, 1;
-	@%p4 bra 	BB6_3;
-	bra.uni 	BB6_2;
+	@%p4 bra 	BB7_3;
+	bra.uni 	BB7_2;
 
-BB6_3:
+BB7_3:
 	mul.wide.s32 	%rd12, %r2, 8;
 	add.s64 	%rd13, %rd1, %rd12;
 	ld.global.f64 	%fd5, [%rd13];
 	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd6;
-	bra.uni 	BB6_4;
+	bra.uni 	BB7_4;
 
-BB6_2:
+BB7_2:
 	mul.wide.s32 	%rd10, %r1, 8;
 	add.s64 	%rd11, %rd1, %rd10;
 	ld.global.f64 	%fd3, [%rd11];
 	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd4;
 
-BB6_4:
+BB7_4:
 	ret;
 }
 
@@ -451,10 +514,10 @@ BB6_4:
 	setp.lt.s32	%p1, %r2, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB7_2;
-	bra.uni 	BB7_1;
+	@!%p3 bra 	BB8_2;
+	bra.uni 	BB8_1;
 
-BB7_1:
+BB8_1:
 	rem.s32 	%r9, %r1, %r3;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r10, %r2, %r3, %r9;
@@ -471,7 +534,7 @@ BB7_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB7_2:
+BB8_2:
 	ret;
 }
 
@@ -513,10 +576,10 @@ BB7_2:
 	setp.lt.s32	%p1, %r8, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB8_6;
-	bra.uni 	BB8_1;
+	@!%p3 bra 	BB9_6;
+	bra.uni 	BB9_1;
 
-BB8_1:
+BB9_1:
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd4, %rd5;
@@ -526,26 +589,26 @@ BB8_1:
 	setp.lt.f64	%p4, %fd8, %fd3;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB8_5;
-	bra.uni 	BB8_2;
+	@%p4 bra 	BB9_5;
+	bra.uni 	BB9_2;
 
-BB8_5:
+BB9_5:
 	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB8_6;
+	bra.uni 	BB9_6;
 
-BB8_2:
+BB9_2:
 	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB8_4;
-	bra.uni 	BB8_3;
+	@%p5 bra 	BB9_4;
+	bra.uni 	BB9_3;
 
-BB8_4:
+BB9_4:
 	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB8_6;
+	bra.uni 	BB9_6;
 
-BB8_3:
+BB9_3:
 	st.global.f64 	[%rd1], %fd6;
 
-BB8_6:
+BB9_6:
 	ret;
 }
 
@@ -561,9 +624,9 @@ BB8_6:
 	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-	.reg .pred 	%p<73>;
-	.reg .b32 	%r<66>;
-	.reg .f64 	%fd<56>;
+	.reg .pred 	%p<77>;
+	.reg .b32 	%r<65>;
+	.reg .f64 	%fd<55>;
 	.reg .b64 	%rd<19>;
 
 
@@ -584,93 +647,93 @@ BB8_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB9_77;
-	bra.uni 	BB9_1;
+	@!%p4 bra 	BB10_73;
+	bra.uni 	BB10_1;
 
-BB9_1:
+BB10_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r64, %r1;
-	@%p5 bra 	BB9_5;
+	mov.u32 	%r63, %r1;
+	@%p5 bra 	BB10_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r65, %r3;
-	@%p6 bra 	BB9_4;
+	mov.u32 	%r64, %r3;
+	@%p6 bra 	BB10_4;
 
-	mov.u32 	%r65, %r2;
+	mov.u32 	%r64, %r2;
 
-BB9_4:
-	mov.u32 	%r59, %r65;
-	mov.u32 	%r4, %r59;
-	mov.u32 	%r64, %r4;
+BB10_4:
+	mov.u32 	%r58, %r64;
+	mov.u32 	%r4, %r58;
+	mov.u32 	%r63, %r4;
 
-BB9_5:
-	mov.u32 	%r5, %r64;
+BB10_5:
+	mov.u32 	%r5, %r63;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r62, %r1;
-	@%p7 bra 	BB9_9;
+	mov.u32 	%r61, %r1;
+	@%p7 bra 	BB10_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r63, %r3;
-	@%p8 bra 	BB9_8;
+	mov.u32 	%r62, %r3;
+	@%p8 bra 	BB10_8;
 
-	mov.u32 	%r63, %r2;
+	mov.u32 	%r62, %r2;
 
-BB9_8:
-	mov.u32 	%r62, %r63;
+BB10_8:
+	mov.u32 	%r61, %r62;
 
-BB9_9:
+BB10_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r62, 8;
+	mul.wide.s32 	%rd9, %r61, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
-	@%p9 bra 	BB9_26;
+	@%p9 bra 	BB10_26;
 
 	setp.gt.s32	%p23, %r13, 3;
-	@%p23 bra 	BB9_18;
+	@%p23 bra 	BB10_18;
 
 	setp.gt.s32	%p30, %r13, 1;
-	@%p30 bra 	BB9_15;
+	@%p30 bra 	BB10_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB9_75;
-	bra.uni 	BB9_13;
+	@%p33 bra 	BB10_71;
+	bra.uni 	BB10_13;
 
-BB9_75:
-	add.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_71:
+	add.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_26:
+BB10_26:
 	setp.gt.s32	%p10, %r13, 13;
-	@%p10 bra 	BB9_35;
+	@%p10 bra 	BB10_35;
 
 	setp.gt.s32	%p17, %r13, 10;
-	@%p17 bra 	BB9_31;
+	@%p17 bra 	BB10_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB9_55;
-	bra.uni 	BB9_29;
+	@%p21 bra 	BB10_53;
+	bra.uni 	BB10_29;
 
-BB9_55:
-	setp.eq.f64	%p48, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB9_76;
+BB10_53:
+	setp.eq.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB10_72;
 
-BB9_18:
+BB10_18:
 	setp.gt.s32	%p24, %r13, 5;
-	@%p24 bra 	BB9_22;
+	@%p24 bra 	BB10_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB9_58;
-	bra.uni 	BB9_20;
+	@%p28 bra 	BB10_56;
+	bra.uni 	BB10_20;
 
-BB9_58:
+BB10_56:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -683,7 +746,7 @@ BB9_58:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p53, %rd1, -9223372036854775808;
+	setp.eq.s64	%p55, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -700,342 +763,340 @@ BB9_58:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd54, [retval0+0];
+	ld.param.f64	%fd53, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p54, %r8, 0;
-	and.pred  	%p1, %p54, %p53;
-	@!%p1 bra 	BB9_60;
-	bra.uni 	BB9_59;
+	setp.lt.s32	%p56, %r8, 0;
+	and.pred  	%p1, %p56, %p55;
+	@!%p1 bra 	BB10_58;
+	bra.uni 	BB10_57;
 
-BB9_59:
+BB10_57:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd54;
+	mov.b64 	{%temp, %r33}, %fd53;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd54;
+	mov.b64 	{%r35, %temp}, %fd53;
 	}
-	mov.b64 	%fd54, {%r35, %r34};
+	mov.b64 	%fd53, {%r35, %r34};
 
-BB9_60:
-	mov.f64 	%fd53, %fd54;
-	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
-	@%p55 bra 	BB9_63;
-	bra.uni 	BB9_61;
+BB10_58:
+	mov.f64 	%fd52, %fd53;
+	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
+	@%p57 bra 	BB10_61;
+	bra.uni 	BB10_59;
 
-BB9_63:
-	selp.b32	%r36, %r8, 0, %p53;
+BB10_61:
+	selp.b32	%r36, %r8, 0, %p55;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p59, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p59;
+	setp.lt.s32	%p61, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p61;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd53, {%r39, %r38};
-	bra.uni 	BB9_64;
+	mov.b64 	%fd52, {%r39, %r38};
+	bra.uni 	BB10_62;
 
-BB9_35:
+BB10_35:
 	setp.gt.s32	%p11, %r13, 15;
-	@%p11 bra 	BB9_39;
+	@%p11 bra 	BB10_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB9_52;
-	bra.uni 	BB9_37;
+	@%p15 bra 	BB10_50;
+	bra.uni 	BB10_37;
 
-BB9_52:
+BB10_50:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p45, %r27, 0;
-	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
-	bra.uni 	BB9_76;
+	setp.eq.s32	%p47, %r27, 0;
+	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
+	bra.uni 	BB10_72;
 
-BB9_15:
+BB10_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB9_74;
-	bra.uni 	BB9_16;
+	@%p31 bra 	BB10_70;
+	bra.uni 	BB10_16;
 
-BB9_74:
-	mul.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_70:
+	mul.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_31:
+BB10_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB9_54;
+	@%p18 bra 	BB10_52;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB9_53;
-	bra.uni 	BB9_33;
+	@%p19 bra 	BB10_51;
+	bra.uni 	BB10_33;
 
-BB9_53:
-	max.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_51:
+	max.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_22:
+BB10_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB9_57;
+	@%p25 bra 	BB10_55;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB9_56;
-	bra.uni 	BB9_24;
+	@%p26 bra 	BB10_54;
+	bra.uni 	BB10_24;
 
-BB9_56:
-	setp.gt.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB9_76;
+BB10_54:
+	setp.gt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB10_72;
 
-BB9_39:
+BB10_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB9_51;
+	@%p12 bra 	BB10_49;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB9_46;
-	bra.uni 	BB9_41;
+	@%p13 bra 	BB10_45;
+	bra.uni 	BB10_41;
 
-BB9_46:
-	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
-	or.pred  	%p40, %p38, %p39;
-	mov.f64 	%fd55, 0d7FF8000000000000;
-	@%p40 bra 	BB9_76;
+BB10_45:
+	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
+	or.pred  	%p41, %p39, %p40;
+	mov.f64 	%fd54, 0d7FF8000000000000;
+	@%p41 bra 	BB10_72;
 
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	abs.f64 	%fd39, %fd55;
-	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
-	@%p41 bra 	BB9_76;
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	abs.f64 	%fd39, %fd54;
+	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
+	@%p42 bra 	BB10_72;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r22}, %fd55;
+	mov.b64 	{%r22, %temp}, %fd54;
 	}
-	and.b32  	%r23, %r22, 2147483647;
-	setp.ne.s32	%p42, %r23, 2146435072;
-	@%p42 bra 	BB9_50;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r24, %temp}, %fd55;
+	mov.b64 	{%temp, %r23}, %fd54;
 	}
-	setp.eq.s32	%p43, %r24, 0;
-	@%p43 bra 	BB9_76;
+	and.b32  	%r24, %r23, 2147483647;
+	setp.ne.s32	%p43, %r24, 2146435072;
+	setp.ne.s32	%p44, %r22, 0;
+	or.pred  	%p45, %p43, %p44;
+	@!%p45 bra 	BB10_72;
+	bra.uni 	BB10_48;
 
-BB9_50:
-	cvt.rmi.f64.f64	%fd40, %fd55;
+BB10_48:
+	cvt.rmi.f64.f64	%fd40, %fd54;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd55, %fd1, %fd41;
-	bra.uni 	BB9_76;
+	sub.f64 	%fd54, %fd1, %fd41;
+	bra.uni 	BB10_72;
 
-BB9_13:
+BB10_13:
 	setp.eq.s32	%p34, %r13, 1;
-	@%p34 bra 	BB9_14;
-	bra.uni 	BB9_76;
+	@%p34 bra 	BB10_14;
+	bra.uni 	BB10_72;
 
-BB9_14:
-	sub.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_14:
+	sub.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_29:
+BB10_29:
 	setp.eq.s32	%p22, %r13, 10;
-	@%p22 bra 	BB9_30;
-	bra.uni 	BB9_76;
+	@%p22 bra 	BB10_30;
+	bra.uni 	BB10_72;
 
-BB9_30:
-	setp.neu.f64	%p47, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB9_76;
+BB10_30:
+	setp.neu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
+	bra.uni 	BB10_72;
 
-BB9_20:
+BB10_20:
 	setp.eq.s32	%p29, %r13, 5;
-	@%p29 bra 	BB9_21;
-	bra.uni 	BB9_76;
+	@%p29 bra 	BB10_21;
+	bra.uni 	BB10_72;
 
-BB9_21:
-	setp.lt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB9_76;
+BB10_21:
+	setp.lt.f64	%p54, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
+	bra.uni 	BB10_72;
 
-BB9_37:
+BB10_37:
 	setp.eq.s32	%p16, %r13, 15;
-	@%p16 bra 	BB9_38;
-	bra.uni 	BB9_76;
+	@%p16 bra 	BB10_38;
+	bra.uni 	BB10_72;
 
-BB9_38:
+BB10_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd55, %fd44, %fd43;
-	bra.uni 	BB9_76;
+	sub.f64 	%fd54, %fd44, %fd43;
+	bra.uni 	BB10_72;
 
-BB9_16:
+BB10_16:
 	setp.eq.s32	%p32, %r13, 3;
-	@%p32 bra 	BB9_17;
-	bra.uni 	BB9_76;
+	@%p32 bra 	BB10_17;
+	bra.uni 	BB10_72;
 
-BB9_17:
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_17:
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_54:
-	min.f64 	%fd55, %fd1, %fd2;
-	bra.uni 	BB9_76;
+BB10_52:
+	min.f64 	%fd54, %fd1, %fd2;
+	bra.uni 	BB10_72;
 
-BB9_33:
+BB10_33:
 	setp.eq.s32	%p20, %r13, 13;
-	@%p20 bra 	BB9_34;
-	bra.uni 	BB9_76;
+	@%p20 bra 	BB10_34;
+	bra.uni 	BB10_72;
 
-BB9_34:
+BB10_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p46, %r30, 0;
-	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
-	bra.uni 	BB9_76;
+	setp.eq.s32	%p48, %r30, 0;
+	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
+	bra.uni 	BB10_72;
 
-BB9_57:
-	setp.le.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB9_76;
+BB10_55:
+	setp.le.f64	%p53, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
+	bra.uni 	BB10_72;
 
-BB9_24:
+BB10_24:
 	setp.eq.s32	%p27, %r13, 8;
-	@%p27 bra 	BB9_25;
-	bra.uni 	BB9_76;
+	@%p27 bra 	BB10_25;
+	bra.uni 	BB10_72;
 
-BB9_25:
-	setp.ge.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB9_76;
+BB10_25:
+	setp.ge.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
+	bra.uni 	BB10_72;
 
-BB9_51:
-	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
+BB10_49:
+	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
-	bra.uni 	BB9_76;
+	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
+	bra.uni 	BB10_72;
 
-BB9_41:
+BB10_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB9_76;
+	@%p14 bra 	BB10_72;
 
-	div.rn.f64 	%fd55, %fd1, %fd2;
-	abs.f64 	%fd37, %fd55;
+	div.rn.f64 	%fd54, %fd1, %fd2;
+	abs.f64 	%fd37, %fd54;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB9_76;
+	@%p35 bra 	BB10_72;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r19}, %fd55;
+	mov.b64 	{%r19, %temp}, %fd54;
 	}
-	and.b32  	%r20, %r19, 2147483647;
-	setp.ne.s32	%p36, %r20, 2146435072;
-	@%p36 bra 	BB9_45;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r21, %temp}, %fd55;
+	mov.b64 	{%temp, %r20}, %fd54;
 	}
-	setp.eq.s32	%p37, %r21, 0;
-	@%p37 bra 	BB9_76;
+	and.b32  	%r21, %r20, 2147483647;
+	setp.ne.s32	%p36, %r21, 2146435072;
+	setp.ne.s32	%p37, %r19, 0;
+	or.pred  	%p38, %p36, %p37;
+	@!%p38 bra 	BB10_72;
+	bra.uni 	BB10_44;
 
-BB9_45:
-	cvt.rmi.f64.f64	%fd55, %fd55;
-	bra.uni 	BB9_76;
+BB10_44:
+	cvt.rmi.f64.f64	%fd54, %fd54;
+	bra.uni 	BB10_72;
 
-BB9_61:
-	setp.gt.s32	%p56, %r8, -1;
-	@%p56 bra 	BB9_64;
+BB10_59:
+	setp.gt.s32	%p58, %r8, -1;
+	@%p58 bra 	BB10_62;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p57, %fd45, %fd2;
-	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
+	setp.neu.f64	%p59, %fd45, %fd2;
+	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
 
-BB9_64:
-	mov.f64 	%fd25, %fd53;
+BB10_62:
+	mov.f64 	%fd25, %fd52;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p60, %r41, 2146435072;
-	mov.f64 	%fd52, %fd25;
-	@%p60 bra 	BB9_73;
+	setp.ne.s32	%p62, %r41, 2146435072;
+	mov.f64 	%fd51, %fd25;
+	@%p62 bra 	BB10_69;
 
-	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd52, %fd26;
-	@%p61 bra 	BB9_73;
-
-	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
 	mov.f64 	%fd51, %fd26;
-	mov.f64 	%fd52, %fd51;
-	@%p62 bra 	BB9_73;
+	@%p63 bra 	BB10_69;
 
-	and.b32  	%r42, %r9, 2147483647;
-	setp.ne.s32	%p63, %r42, 2146435072;
-	@%p63 bra 	BB9_69;
+	abs.f64 	%fd46, %fd2;
+	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd50, %fd26;
+	mov.f64 	%fd51, %fd50;
+	@%p64 bra 	BB10_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r43, %temp}, %fd2;
+	mov.b64 	{%r42, %temp}, %fd2;
 	}
-	setp.eq.s32	%p64, %r43, 0;
-	@%p64 bra 	BB9_72;
+	and.b32  	%r43, %r9, 2147483647;
+	setp.eq.s32	%p65, %r43, 2146435072;
+	setp.eq.s32	%p66, %r42, 0;
+	and.pred  	%p67, %p65, %p66;
+	@%p67 bra 	BB10_68;
+	bra.uni 	BB10_66;
 
-BB9_69:
-	and.b32  	%r44, %r8, 2147483647;
-	setp.ne.s32	%p65, %r44, 2146435072;
-	mov.f64 	%fd49, %fd25;
-	mov.f64 	%fd52, %fd49;
-	@%p65 bra 	BB9_73;
+BB10_68:
+	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
+	selp.b32	%r51, 2146435072, 0, %p71;
+	xor.b32  	%r52, %r51, 2146435072;
+	setp.lt.s32	%p72, %r9, 0;
+	selp.b32	%r53, %r52, %r51, %p72;
+	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
+	selp.b32	%r54, 1072693248, %r53, %p73;
+	mov.u32 	%r55, 0;
+	mov.b64 	%fd51, {%r55, %r54};
+	bra.uni 	BB10_69;
 
+BB10_66:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r45, %temp}, %fd1;
+	mov.b64 	{%r44, %temp}, %fd1;
 	}
-	setp.ne.s32	%p66, %r45, 0;
-	mov.f64 	%fd52, %fd25;
-	@%p66 bra 	BB9_73;
+	and.b32  	%r45, %r8, 2147483647;
+	setp.eq.s32	%p68, %r45, 2146435072;
+	setp.eq.s32	%p69, %r44, 0;
+	and.pred  	%p70, %p68, %p69;
+	mov.f64 	%fd51, %fd25;
+	@!%p70 bra 	BB10_69;
+	bra.uni 	BB10_67;
 
+BB10_67:
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	add.s32 	%r48, %r47, 2146435072;
-	or.b32  	%r49, %r48, -2147483648;
-	selp.b32	%r50, %r49, %r48, %p1;
-	mov.u32 	%r51, 0;
-	mov.b64 	%fd52, {%r51, %r50};
-	bra.uni 	BB9_73;
-
-BB9_72:
-	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
-	selp.b32	%r52, 2146435072, 0, %p67;
-	xor.b32  	%r53, %r52, 2146435072;
-	setp.lt.s32	%p68, %r9, 0;
-	selp.b32	%r54, %r53, %r52, %p68;
-	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
-	selp.b32	%r55, 1072693248, %r54, %p69;
-	mov.u32 	%r56, 0;
-	mov.b64 	%fd52, {%r56, %r55};
-
-BB9_73:
-	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
-	or.pred  	%p72, %p71, %p70;
-	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+	selp.b32	%r48, -1048576, 2146435072, %p1;
+	add.s32 	%r49, %r48, %r47;
+	mov.u32 	%r50, 0;
+	mov.b64 	%fd51, {%r50, %r49};
 
-BB9_76:
+BB10_69:
+	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
+	or.pred  	%p76, %p75, %p74;
+	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
+
+BB10_72:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd55;
+	st.global.f64 	[%rd18], %fd54;
 	bar.sync 	0;
 
-BB9_77:
+BB10_73:
 	ret;
 }
 
@@ -1049,9 +1110,9 @@ BB9_77:
 	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<133>;
-	.reg .b32 	%r<88>;
-	.reg .f64 	%fd<109>;
+	.reg .pred 	%p<141>;
+	.reg .b32 	%r<86>;
+	.reg .f64 	%fd<107>;
 	.reg .b64 	%rd<20>;
 
 
@@ -1066,7 +1127,7 @@ BB9_77:
 	mov.u32 	%r11, %tid.x;
 	mad.lo.s32 	%r1, %r9, %r10, %r11;
 	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB10_138;
+	@%p3 bra 	BB11_130;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -1075,86 +1136,86 @@ BB9_77:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB10_70;
+	@%p4 bra 	BB11_66;
 
-	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p5, %r6, 8;
-	@%p5 bra 	BB10_19;
+	@%p5 bra 	BB11_19;
 
 	setp.gt.s32	%p19, %r6, 3;
-	@%p19 bra 	BB10_11;
+	@%p19 bra 	BB11_11;
 
 	setp.gt.s32	%p26, %r6, 1;
-	@%p26 bra 	BB10_8;
+	@%p26 bra 	BB11_8;
 
 	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB10_68;
-	bra.uni 	BB10_6;
+	@%p29 bra 	BB11_64;
+	bra.uni 	BB11_6;
 
-BB10_68:
-	add.f64 	%fd99, %fd1, %fd68;
-	bra.uni 	BB10_69;
+BB11_64:
+	add.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB11_65;
 
-BB10_70:
-	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p69, %r6, 8;
-	@%p69 bra 	BB10_87;
+BB11_66:
+	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p73, %r6, 8;
+	@%p73 bra 	BB11_83;
 
-	setp.gt.s32	%p83, %r6, 3;
-	@%p83 bra 	BB10_79;
+	setp.gt.s32	%p87, %r6, 3;
+	@%p87 bra 	BB11_75;
 
-	setp.gt.s32	%p90, %r6, 1;
-	@%p90 bra 	BB10_76;
+	setp.gt.s32	%p94, %r6, 1;
+	@%p94 bra 	BB11_72;
 
-	setp.eq.s32	%p93, %r6, 0;
-	@%p93 bra 	BB10_136;
-	bra.uni 	BB10_74;
+	setp.eq.s32	%p97, %r6, 0;
+	@%p97 bra 	BB11_128;
+	bra.uni 	BB11_70;
 
-BB10_136:
-	add.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
+BB11_128:
+	add.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
 
-BB10_19:
+BB11_19:
 	setp.gt.s32	%p6, %r6, 13;
-	@%p6 bra 	BB10_28;
+	@%p6 bra 	BB11_28;
 
 	setp.gt.s32	%p13, %r6, 10;
-	@%p13 bra 	BB10_24;
+	@%p13 bra 	BB11_24;
 
 	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB10_48;
-	bra.uni 	BB10_22;
+	@%p17 bra 	BB11_46;
+	bra.uni 	BB11_22;
 
-BB10_48:
-	setp.eq.f64	%p44, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
-	bra.uni 	BB10_69;
+BB11_46:
+	setp.eq.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB11_65;
 
-BB10_87:
-	setp.gt.s32	%p70, %r6, 13;
-	@%p70 bra 	BB10_96;
+BB11_83:
+	setp.gt.s32	%p74, %r6, 13;
+	@%p74 bra 	BB11_92;
 
-	setp.gt.s32	%p77, %r6, 10;
-	@%p77 bra 	BB10_92;
+	setp.gt.s32	%p81, %r6, 10;
+	@%p81 bra 	BB11_88;
 
-	setp.eq.s32	%p81, %r6, 9;
-	@%p81 bra 	BB10_116;
-	bra.uni 	BB10_90;
+	setp.eq.s32	%p85, %r6, 9;
+	@%p85 bra 	BB11_110;
+	bra.uni 	BB11_86;
 
-BB10_116:
-	setp.eq.f64	%p108, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
-	bra.uni 	BB10_137;
+BB11_110:
+	setp.eq.f64	%p114, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
+	bra.uni 	BB11_129;
 
-BB10_11:
+BB11_11:
 	setp.gt.s32	%p20, %r6, 5;
-	@%p20 bra 	BB10_15;
+	@%p20 bra 	BB11_15;
 
 	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB10_51;
-	bra.uni 	BB10_13;
+	@%p24 bra 	BB11_49;
+	bra.uni 	BB11_13;
 
-BB10_51:
+BB11_49:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd68;
@@ -1167,7 +1228,7 @@ BB10_51:
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p49, %rd2, -9223372036854775808;
+	setp.eq.s64	%p51, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1184,69 +1245,69 @@ BB10_51:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd98, [retval0+0];
+	ld.param.f64	%fd97, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p50, %r2, 0;
-	and.pred  	%p1, %p50, %p49;
-	@!%p1 bra 	BB10_53;
-	bra.uni 	BB10_52;
+	setp.lt.s32	%p52, %r2, 0;
+	and.pred  	%p1, %p52, %p51;
+	@!%p1 bra 	BB11_51;
+	bra.uni 	BB11_50;
 
-BB10_52:
+BB11_50:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd98;
+	mov.b64 	{%temp, %r26}, %fd97;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd98;
+	mov.b64 	{%r28, %temp}, %fd97;
 	}
-	mov.b64 	%fd98, {%r28, %r27};
+	mov.b64 	%fd97, {%r28, %r27};
 
-BB10_53:
-	mov.f64 	%fd97, %fd98;
-	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
-	@%p51 bra 	BB10_56;
-	bra.uni 	BB10_54;
+BB11_51:
+	mov.f64 	%fd96, %fd97;
+	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
+	@%p53 bra 	BB11_54;
+	bra.uni 	BB11_52;
 
-BB10_56:
-	selp.b32	%r29, %r2, 0, %p49;
+BB11_54:
+	selp.b32	%r29, %r2, 0, %p51;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p55, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p55;
+	setp.lt.s32	%p57, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p57;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd97, {%r32, %r31};
-	bra.uni 	BB10_57;
+	mov.b64 	%fd96, {%r32, %r31};
+	bra.uni 	BB11_55;
 
-BB10_28:
+BB11_28:
 	setp.gt.s32	%p7, %r6, 15;
-	@%p7 bra 	BB10_32;
+	@%p7 bra 	BB11_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB10_45;
-	bra.uni 	BB10_30;
+	@%p11 bra 	BB11_43;
+	bra.uni 	BB11_30;
 
-BB10_45:
+BB11_43:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p41, %r20, 0;
-	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
-	bra.uni 	BB10_69;
+	setp.eq.s32	%p43, %r20, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
+	bra.uni 	BB11_65;
 
-BB10_79:
-	setp.gt.s32	%p84, %r6, 5;
-	@%p84 bra 	BB10_83;
+BB11_75:
+	setp.gt.s32	%p88, %r6, 5;
+	@%p88 bra 	BB11_79;
 
-	setp.eq.s32	%p88, %r6, 4;
-	@%p88 bra 	BB10_119;
-	bra.uni 	BB10_81;
+	setp.eq.s32	%p92, %r6, 4;
+	@%p92 bra 	BB11_113;
+	bra.uni 	BB11_77;
 
-BB10_119:
+BB11_113:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1255,11 +1316,11 @@ BB10_119:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r62, %r5, 20, 11;
-	add.s32 	%r63, %r62, -1012;
+	bfe.u32 	%r61, %r5, 20, 11;
+	add.s32 	%r62, %r61, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r63;
-	setp.eq.s64	%p113, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r62;
+	setp.eq.s64	%p119, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1276,616 +1337,612 @@ BB10_119:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd107, [retval0+0];
+	ld.param.f64	%fd105, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p114, %r4, 0;
-	and.pred  	%p2, %p114, %p113;
-	@!%p2 bra 	BB10_121;
-	bra.uni 	BB10_120;
+	setp.lt.s32	%p120, %r4, 0;
+	and.pred  	%p2, %p120, %p119;
+	@!%p2 bra 	BB11_115;
+	bra.uni 	BB11_114;
 
-BB10_120:
+BB11_114:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r64}, %fd107;
+	mov.b64 	{%temp, %r63}, %fd105;
 	}
-	xor.b32  	%r65, %r64, -2147483648;
+	xor.b32  	%r64, %r63, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r66, %temp}, %fd107;
+	mov.b64 	{%r65, %temp}, %fd105;
 	}
-	mov.b64 	%fd107, {%r66, %r65};
-
-BB10_121:
-	mov.f64 	%fd106, %fd107;
-	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
-	@%p115 bra 	BB10_124;
-	bra.uni 	BB10_122;
-
-BB10_124:
-	selp.b32	%r67, %r4, 0, %p113;
-	or.b32  	%r68, %r67, 2146435072;
-	setp.lt.s32	%p119, %r5, 0;
-	selp.b32	%r69, %r68, %r67, %p119;
-	mov.u32 	%r70, 0;
-	mov.b64 	%fd106, {%r70, %r69};
-	bra.uni 	BB10_125;
-
-BB10_96:
-	setp.gt.s32	%p71, %r6, 15;
-	@%p71 bra 	BB10_100;
-
-	setp.eq.s32	%p75, %r6, 14;
-	@%p75 bra 	BB10_113;
-	bra.uni 	BB10_98;
-
-BB10_113:
+	mov.b64 	%fd105, {%r65, %r64};
+
+BB11_115:
+	mov.f64 	%fd104, %fd105;
+	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
+	@%p121 bra 	BB11_118;
+	bra.uni 	BB11_116;
+
+BB11_118:
+	selp.b32	%r66, %r4, 0, %p119;
+	or.b32  	%r67, %r66, 2146435072;
+	setp.lt.s32	%p125, %r5, 0;
+	selp.b32	%r68, %r67, %r66, %p125;
+	mov.u32 	%r69, 0;
+	mov.b64 	%fd104, {%r69, %r68};
+	bra.uni 	BB11_119;
+
+BB11_92:
+	setp.gt.s32	%p75, %r6, 15;
+	@%p75 bra 	BB11_96;
+
+	setp.eq.s32	%p79, %r6, 14;
+	@%p79 bra 	BB11_107;
+	bra.uni 	BB11_94;
+
+BB11_107:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r56, %rd15;
-	cvt.u32.u64	%r57, %rd16;
-	or.b32  	%r58, %r57, %r56;
-	setp.eq.s32	%p105, %r58, 0;
-	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
-	bra.uni 	BB10_137;
-
-BB10_8:
+	cvt.u32.u64	%r55, %rd15;
+	cvt.u32.u64	%r56, %rd16;
+	or.b32  	%r57, %r56, %r55;
+	setp.eq.s32	%p111, %r57, 0;
+	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
+	bra.uni 	BB11_129;
+
+BB11_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB10_67;
-	bra.uni 	BB10_9;
+	@%p27 bra 	BB11_63;
+	bra.uni 	BB11_9;
 
-BB10_67:
-	mul.f64 	%fd99, %fd1, %fd68;
-	bra.uni 	BB10_69;
+BB11_63:
+	mul.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB11_65;
 
-BB10_24:
+BB11_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB10_47;
+	@%p14 bra 	BB11_45;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB10_46;
-	bra.uni 	BB10_26;
+	@%p15 bra 	BB11_44;
+	bra.uni 	BB11_26;
 
-BB10_46:
-	max.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB10_69;
+BB11_44:
+	max.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB11_65;
 
-BB10_15:
+BB11_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB10_50;
+	@%p21 bra 	BB11_48;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB10_49;
-	bra.uni 	BB10_17;
+	@%p22 bra 	BB11_47;
+	bra.uni 	BB11_17;
 
-BB10_49:
-	setp.lt.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB10_69;
+BB11_47:
+	setp.lt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB11_65;
 
-BB10_32:
+BB11_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB10_44;
+	@%p8 bra 	BB11_42;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB10_39;
-	bra.uni 	BB10_34;
+	@%p9 bra 	BB11_38;
+	bra.uni 	BB11_34;
 
-BB10_39:
-	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
-	or.pred  	%p36, %p34, %p35;
-	mov.f64 	%fd99, 0d7FF8000000000000;
-	@%p36 bra 	BB10_69;
+BB11_38:
+	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
+	or.pred  	%p37, %p35, %p36;
+	mov.f64 	%fd98, 0d7FF8000000000000;
+	@%p37 bra 	BB11_65;
 
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	abs.f64 	%fd72, %fd99;
-	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
-	@%p37 bra 	BB10_69;
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	abs.f64 	%fd72, %fd98;
+	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
+	@%p38 bra 	BB11_65;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r15}, %fd99;
+	mov.b64 	{%r15, %temp}, %fd98;
 	}
-	and.b32  	%r16, %r15, 2147483647;
-	setp.ne.s32	%p38, %r16, 2146435072;
-	@%p38 bra 	BB10_43;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r17, %temp}, %fd99;
+	mov.b64 	{%temp, %r16}, %fd98;
 	}
-	setp.eq.s32	%p39, %r17, 0;
-	@%p39 bra 	BB10_69;
-
-BB10_43:
-	cvt.rmi.f64.f64	%fd73, %fd99;
+	and.b32  	%r17, %r16, 2147483647;
+	setp.ne.s32	%p39, %r17, 2146435072;
+	setp.ne.s32	%p40, %r15, 0;
+	or.pred  	%p41, %p39, %p40;
+	@!%p41 bra 	BB11_65;
+	bra.uni 	BB11_41;
+
+BB11_41:
+	cvt.rmi.f64.f64	%fd73, %fd98;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd99, %fd68, %fd74;
-	bra.uni 	BB10_69;
-
-BB10_76:
-	setp.eq.s32	%p91, %r6, 2;
-	@%p91 bra 	BB10_135;
-	bra.uni 	BB10_77;
-
-BB10_135:
-	mul.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
-
-BB10_92:
-	setp.eq.s32	%p78, %r6, 11;
-	@%p78 bra 	BB10_115;
-
-	setp.eq.s32	%p79, %r6, 12;
-	@%p79 bra 	BB10_114;
-	bra.uni 	BB10_94;
-
-BB10_114:
-	max.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
-
-BB10_83:
-	setp.eq.s32	%p85, %r6, 6;
-	@%p85 bra 	BB10_118;
-
-	setp.eq.s32	%p86, %r6, 7;
-	@%p86 bra 	BB10_117;
-	bra.uni 	BB10_85;
-
-BB10_117:
-	setp.gt.f64	%p110, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
-	bra.uni 	BB10_137;
-
-BB10_100:
-	setp.eq.s32	%p72, %r6, 16;
-	@%p72 bra 	BB10_112;
-
-	setp.eq.s32	%p73, %r6, 17;
-	@%p73 bra 	BB10_107;
-	bra.uni 	BB10_102;
-
-BB10_107:
-	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
-	or.pred  	%p100, %p98, %p99;
-	mov.f64 	%fd108, 0d7FF8000000000000;
-	@%p100 bra 	BB10_137;
-
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	abs.f64 	%fd83, %fd108;
-	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
-	@%p101 bra 	BB10_137;
+	sub.f64 	%fd98, %fd68, %fd74;
+	bra.uni 	BB11_65;
+
+BB11_72:
+	setp.eq.s32	%p95, %r6, 2;
+	@%p95 bra 	BB11_127;
+	bra.uni 	BB11_73;
+
+BB11_127:
+	mul.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
+
+BB11_88:
+	setp.eq.s32	%p82, %r6, 11;
+	@%p82 bra 	BB11_109;
+
+	setp.eq.s32	%p83, %r6, 12;
+	@%p83 bra 	BB11_108;
+	bra.uni 	BB11_90;
+
+BB11_108:
+	max.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
+
+BB11_79:
+	setp.eq.s32	%p89, %r6, 6;
+	@%p89 bra 	BB11_112;
+
+	setp.eq.s32	%p90, %r6, 7;
+	@%p90 bra 	BB11_111;
+	bra.uni 	BB11_81;
+
+BB11_111:
+	setp.gt.f64	%p116, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
+	bra.uni 	BB11_129;
+
+BB11_96:
+	setp.eq.s32	%p76, %r6, 16;
+	@%p76 bra 	BB11_106;
+
+	setp.eq.s32	%p77, %r6, 17;
+	@%p77 bra 	BB11_102;
+	bra.uni 	BB11_98;
+
+BB11_102:
+	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
+	or.pred  	%p105, %p103, %p104;
+	mov.f64 	%fd106, 0d7FF8000000000000;
+	@%p105 bra 	BB11_129;
+
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	abs.f64 	%fd83, %fd106;
+	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
+	@%p106 bra 	BB11_129;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd108;
+	mov.b64 	{%r52, %temp}, %fd106;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p102, %r54, 2146435072;
-	@%p102 bra 	BB10_111;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r55, %temp}, %fd108;
+	mov.b64 	{%temp, %r53}, %fd106;
 	}
-	setp.eq.s32	%p103, %r55, 0;
-	@%p103 bra 	BB10_137;
-
-BB10_111:
-	cvt.rmi.f64.f64	%fd84, %fd108;
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p107, %r54, 2146435072;
+	setp.ne.s32	%p108, %r52, 0;
+	or.pred  	%p109, %p107, %p108;
+	@!%p109 bra 	BB11_129;
+	bra.uni 	BB11_105;
+
+BB11_105:
+	cvt.rmi.f64.f64	%fd84, %fd106;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd108, %fd1, %fd85;
-	bra.uni 	BB10_137;
+	sub.f64 	%fd106, %fd1, %fd85;
+	bra.uni 	BB11_129;
 
-BB10_6:
+BB11_6:
 	setp.eq.s32	%p30, %r6, 1;
-	@%p30 bra 	BB10_7;
-	bra.uni 	BB10_69;
+	@%p30 bra 	BB11_7;
+	bra.uni 	BB11_65;
 
-BB10_7:
-	sub.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB10_69;
+BB11_7:
+	sub.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB11_65;
 
-BB10_22:
+BB11_22:
 	setp.eq.s32	%p18, %r6, 10;
-	@%p18 bra 	BB10_23;
-	bra.uni 	BB10_69;
+	@%p18 bra 	BB11_23;
+	bra.uni 	BB11_65;
 
-BB10_23:
-	setp.neu.f64	%p43, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
-	bra.uni 	BB10_69;
+BB11_23:
+	setp.neu.f64	%p45, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
+	bra.uni 	BB11_65;
 
-BB10_13:
+BB11_13:
 	setp.eq.s32	%p25, %r6, 5;
-	@%p25 bra 	BB10_14;
-	bra.uni 	BB10_69;
+	@%p25 bra 	BB11_14;
+	bra.uni 	BB11_65;
 
-BB10_14:
-	setp.gt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB10_69;
+BB11_14:
+	setp.gt.f64	%p50, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB11_65;
 
-BB10_30:
+BB11_30:
 	setp.eq.s32	%p12, %r6, 15;
-	@%p12 bra 	BB10_31;
-	bra.uni 	BB10_69;
+	@%p12 bra 	BB11_31;
+	bra.uni 	BB11_65;
 
-BB10_31:
+BB11_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd99, %fd77, %fd76;
-	bra.uni 	BB10_69;
+	sub.f64 	%fd98, %fd77, %fd76;
+	bra.uni 	BB11_65;
 
-BB10_9:
+BB11_9:
 	setp.eq.s32	%p28, %r6, 3;
-	@%p28 bra 	BB10_10;
-	bra.uni 	BB10_69;
+	@%p28 bra 	BB11_10;
+	bra.uni 	BB11_65;
 
-BB10_10:
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB10_69;
+BB11_10:
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB11_65;
 
-BB10_47:
-	min.f64 	%fd99, %fd68, %fd1;
-	bra.uni 	BB10_69;
+BB11_45:
+	min.f64 	%fd98, %fd68, %fd1;
+	bra.uni 	BB11_65;
 
-BB10_26:
+BB11_26:
 	setp.eq.s32	%p16, %r6, 13;
-	@%p16 bra 	BB10_27;
-	bra.uni 	BB10_69;
+	@%p16 bra 	BB11_27;
+	bra.uni 	BB11_65;
 
-BB10_27:
+BB11_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p42, %r23, 0;
-	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
-	bra.uni 	BB10_69;
+	setp.eq.s32	%p44, %r23, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
+	bra.uni 	BB11_65;
 
-BB10_50:
-	setp.ge.f64	%p47, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
-	bra.uni 	BB10_69;
+BB11_48:
+	setp.ge.f64	%p49, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
+	bra.uni 	BB11_65;
 
-BB10_17:
+BB11_17:
 	setp.eq.s32	%p23, %r6, 8;
-	@%p23 bra 	BB10_18;
-	bra.uni 	BB10_69;
+	@%p23 bra 	BB11_18;
+	bra.uni 	BB11_65;
 
-BB10_18:
-	setp.le.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB10_69;
+BB11_18:
+	setp.le.f64	%p47, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB11_65;
 
-BB10_44:
-	setp.neu.f64	%p40, %fd68, 0d0000000000000000;
+BB11_42:
+	setp.neu.f64	%p42, %fd68, 0d0000000000000000;
 	sub.f64 	%fd75, %fd68, %fd1;
-	selp.f64	%fd99, %fd75, 0d0000000000000000, %p40;
-	bra.uni 	BB10_69;
+	selp.f64	%fd98, %fd75, 0d0000000000000000, %p42;
+	bra.uni 	BB11_65;
 
-BB10_34:
+BB11_34:
 	setp.ne.s32	%p10, %r6, 18;
-	@%p10 bra 	BB10_69;
+	@%p10 bra 	BB11_65;
 
-	div.rn.f64 	%fd99, %fd68, %fd1;
-	abs.f64 	%fd70, %fd99;
+	div.rn.f64 	%fd98, %fd68, %fd1;
+	abs.f64 	%fd70, %fd98;
 	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
-	@%p31 bra 	BB10_69;
+	@%p31 bra 	BB11_65;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r12}, %fd99;
+	mov.b64 	{%r12, %temp}, %fd98;
 	}
-	and.b32  	%r13, %r12, 2147483647;
-	setp.ne.s32	%p32, %r13, 2146435072;
-	@%p32 bra 	BB10_38;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r14, %temp}, %fd99;
+	mov.b64 	{%temp, %r13}, %fd98;
 	}
-	setp.eq.s32	%p33, %r14, 0;
-	@%p33 bra 	BB10_69;
-
-BB10_38:
-	cvt.rmi.f64.f64	%fd99, %fd99;
-	bra.uni 	BB10_69;
-
-BB10_74:
-	setp.eq.s32	%p94, %r6, 1;
-	@%p94 bra 	BB10_75;
-	bra.uni 	BB10_137;
-
-BB10_75:
-	sub.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
-
-BB10_90:
-	setp.eq.s32	%p82, %r6, 10;
-	@%p82 bra 	BB10_91;
-	bra.uni 	BB10_137;
-
-BB10_91:
-	setp.neu.f64	%p107, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
-	bra.uni 	BB10_137;
-
-BB10_81:
-	setp.eq.s32	%p89, %r6, 5;
-	@%p89 bra 	BB10_82;
-	bra.uni 	BB10_137;
-
-BB10_82:
-	setp.lt.f64	%p112, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
-	bra.uni 	BB10_137;
-
-BB10_98:
-	setp.eq.s32	%p76, %r6, 15;
-	@%p76 bra 	BB10_99;
-	bra.uni 	BB10_137;
-
-BB10_99:
+	and.b32  	%r14, %r13, 2147483647;
+	setp.ne.s32	%p32, %r14, 2146435072;
+	setp.ne.s32	%p33, %r12, 0;
+	or.pred  	%p34, %p32, %p33;
+	@!%p34 bra 	BB11_65;
+	bra.uni 	BB11_37;
+
+BB11_37:
+	cvt.rmi.f64.f64	%fd98, %fd98;
+	bra.uni 	BB11_65;
+
+BB11_70:
+	setp.eq.s32	%p98, %r6, 1;
+	@%p98 bra 	BB11_71;
+	bra.uni 	BB11_129;
+
+BB11_71:
+	sub.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
+
+BB11_86:
+	setp.eq.s32	%p86, %r6, 10;
+	@%p86 bra 	BB11_87;
+	bra.uni 	BB11_129;
+
+BB11_87:
+	setp.neu.f64	%p113, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
+	bra.uni 	BB11_129;
+
+BB11_77:
+	setp.eq.s32	%p93, %r6, 5;
+	@%p93 bra 	BB11_78;
+	bra.uni 	BB11_129;
+
+BB11_78:
+	setp.lt.f64	%p118, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
+	bra.uni 	BB11_129;
+
+BB11_94:
+	setp.eq.s32	%p80, %r6, 15;
+	@%p80 bra 	BB11_95;
+	bra.uni 	BB11_129;
+
+BB11_95:
 	mul.f64 	%fd87, %fd1, %fd68;
 	mov.f64 	%fd88, 0d3FF0000000000000;
-	sub.f64 	%fd108, %fd88, %fd87;
-	bra.uni 	BB10_137;
+	sub.f64 	%fd106, %fd88, %fd87;
+	bra.uni 	BB11_129;
 
-BB10_77:
-	setp.eq.s32	%p92, %r6, 3;
-	@%p92 bra 	BB10_78;
-	bra.uni 	BB10_137;
+BB11_73:
+	setp.eq.s32	%p96, %r6, 3;
+	@%p96 bra 	BB11_74;
+	bra.uni 	BB11_129;
 
-BB10_78:
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
+BB11_74:
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
 
-BB10_115:
-	min.f64 	%fd108, %fd1, %fd68;
-	bra.uni 	BB10_137;
+BB11_109:
+	min.f64 	%fd106, %fd1, %fd68;
+	bra.uni 	BB11_129;
 
-BB10_94:
-	setp.eq.s32	%p80, %r6, 13;
-	@%p80 bra 	BB10_95;
-	bra.uni 	BB10_137;
+BB11_90:
+	setp.eq.s32	%p84, %r6, 13;
+	@%p84 bra 	BB11_91;
+	bra.uni 	BB11_129;
 
-BB10_95:
+BB11_91:
 	cvt.rni.s64.f64	%rd17, %fd1;
 	cvt.rni.s64.f64	%rd18, %fd68;
-	cvt.u32.u64	%r59, %rd17;
-	cvt.u32.u64	%r60, %rd18;
-	and.b32  	%r61, %r60, %r59;
-	setp.eq.s32	%p106, %r61, 0;
-	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
-	bra.uni 	BB10_137;
-
-BB10_118:
-	setp.le.f64	%p111, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
-	bra.uni 	BB10_137;
-
-BB10_85:
-	setp.eq.s32	%p87, %r6, 8;
-	@%p87 bra 	BB10_86;
-	bra.uni 	BB10_137;
-
-BB10_86:
-	setp.ge.f64	%p109, %fd1, %fd68;
-	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
-	bra.uni 	BB10_137;
-
-BB10_112:
-	setp.neu.f64	%p104, %fd1, 0d0000000000000000;
+	cvt.u32.u64	%r58, %rd17;
+	cvt.u32.u64	%r59, %rd18;
+	and.b32  	%r60, %r59, %r58;
+	setp.eq.s32	%p112, %r60, 0;
+	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
+	bra.uni 	BB11_129;
+
+BB11_112:
+	setp.le.f64	%p117, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
+	bra.uni 	BB11_129;
+
+BB11_81:
+	setp.eq.s32	%p91, %r6, 8;
+	@%p91 bra 	BB11_82;
+	bra.uni 	BB11_129;
+
+BB11_82:
+	setp.ge.f64	%p115, %fd1, %fd68;
+	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
+	bra.uni 	BB11_129;
+
+BB11_106:
+	setp.neu.f64	%p110, %fd1, 0d0000000000000000;
 	sub.f64 	%fd86, %fd1, %fd68;
-	selp.f64	%fd108, %fd86, 0d0000000000000000, %p104;
-	bra.uni 	BB10_137;
+	selp.f64	%fd106, %fd86, 0d0000000000000000, %p110;
+	bra.uni 	BB11_129;
 
-BB10_102:
-	setp.ne.s32	%p74, %r6, 18;
-	@%p74 bra 	BB10_137;
+BB11_98:
+	setp.ne.s32	%p78, %r6, 18;
+	@%p78 bra 	BB11_129;
 
-	div.rn.f64 	%fd108, %fd1, %fd68;
-	abs.f64 	%fd81, %fd108;
-	setp.gtu.f64	%p95, %fd81, 0d7FF0000000000000;
-	@%p95 bra 	BB10_137;
+	div.rn.f64 	%fd106, %fd1, %fd68;
+	abs.f64 	%fd81, %fd106;
+	setp.gtu.f64	%p99, %fd81, 0d7FF0000000000000;
+	@%p99 bra 	BB11_129;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r50}, %fd108;
+	mov.b64 	{%r49, %temp}, %fd106;
 	}
-	and.b32  	%r51, %r50, 2147483647;
-	setp.ne.s32	%p96, %r51, 2146435072;
-	@%p96 bra 	BB10_106;
-
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd108;
+	mov.b64 	{%temp, %r50}, %fd106;
 	}
-	setp.eq.s32	%p97, %r52, 0;
-	@%p97 bra 	BB10_137;
+	and.b32  	%r51, %r50, 2147483647;
+	setp.ne.s32	%p100, %r51, 2146435072;
+	setp.ne.s32	%p101, %r49, 0;
+	or.pred  	%p102, %p100, %p101;
+	@!%p102 bra 	BB11_129;
+	bra.uni 	BB11_101;
 
-BB10_106:
-	cvt.rmi.f64.f64	%fd108, %fd108;
-	bra.uni 	BB10_137;
+BB11_101:
+	cvt.rmi.f64.f64	%fd106, %fd106;
+	bra.uni 	BB11_129;
 
-BB10_54:
-	setp.gt.s32	%p52, %r2, -1;
-	@%p52 bra 	BB10_57;
+BB11_52:
+	setp.gt.s32	%p54, %r2, -1;
+	@%p54 bra 	BB11_55;
 
 	cvt.rzi.f64.f64	%fd78, %fd1;
-	setp.neu.f64	%p53, %fd78, %fd1;
-	selp.f64	%fd97, 0dFFF8000000000000, %fd97, %p53;
+	setp.neu.f64	%p55, %fd78, %fd1;
+	selp.f64	%fd96, 0dFFF8000000000000, %fd96, %p55;
 
-BB10_57:
-	mov.f64 	%fd24, %fd97;
+BB11_55:
+	mov.f64 	%fd24, %fd96;
 	add.f64 	%fd25, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r33}, %fd25;
 	}
 	and.b32  	%r34, %r33, 2146435072;
-	setp.ne.s32	%p56, %r34, 2146435072;
-	mov.f64 	%fd96, %fd24;
-	@%p56 bra 	BB10_66;
-
-	setp.gtu.f64	%p57, %fd18, 0d7FF0000000000000;
-	mov.f64 	%fd96, %fd25;
-	@%p57 bra 	BB10_66;
+	setp.ne.s32	%p58, %r34, 2146435072;
+	mov.f64 	%fd95, %fd24;
+	@%p58 bra 	BB11_62;
 
-	abs.f64 	%fd79, %fd1;
-	setp.gtu.f64	%p58, %fd79, 0d7FF0000000000000;
+	setp.gtu.f64	%p59, %fd18, 0d7FF0000000000000;
 	mov.f64 	%fd95, %fd25;
-	mov.f64 	%fd96, %fd95;
-	@%p58 bra 	BB10_66;
-
-	and.b32  	%r35, %r3, 2147483647;
-	setp.ne.s32	%p59, %r35, 2146435072;
-	@%p59 bra 	BB10_62;
-
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r36, %temp}, %fd1;
-	}
-	setp.eq.s32	%p60, %r36, 0;
-	@%p60 bra 	BB10_65;
+	@%p59 bra 	BB11_62;
 
-BB10_62:
-	and.b32  	%r37, %r2, 2147483647;
-	setp.ne.s32	%p61, %r37, 2146435072;
-	mov.f64 	%fd93, %fd24;
-	mov.f64 	%fd96, %fd93;
-	@%p61 bra 	BB10_66;
+	abs.f64 	%fd79, %fd1;
+	setp.gtu.f64	%p60, %fd79, 0d7FF0000000000000;
+	mov.f64 	%fd94, %fd25;
+	mov.f64 	%fd95, %fd94;
+	@%p60 bra 	BB11_62;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r38, %temp}, %fd68;
+	mov.b64 	{%r35, %temp}, %fd1;
 	}
-	setp.ne.s32	%p62, %r38, 0;
-	mov.f64 	%fd96, %fd24;
-	@%p62 bra 	BB10_66;
-
-	shr.s32 	%r39, %r3, 31;
-	and.b32  	%r40, %r39, -2146435072;
-	add.s32 	%r41, %r40, 2146435072;
-	or.b32  	%r42, %r41, -2147483648;
-	selp.b32	%r43, %r42, %r41, %p1;
-	mov.u32 	%r44, 0;
-	mov.b64 	%fd96, {%r44, %r43};
-	bra.uni 	BB10_66;
-
-BB10_122:
-	setp.gt.s32	%p116, %r4, -1;
-	@%p116 bra 	BB10_125;
+	and.b32  	%r36, %r3, 2147483647;
+	setp.eq.s32	%p61, %r36, 2146435072;
+	setp.eq.s32	%p62, %r35, 0;
+	and.pred  	%p63, %p61, %p62;
+	@%p63 bra 	BB11_61;
+	bra.uni 	BB11_59;
+
+BB11_61:
+	setp.gt.f64	%p67, %fd18, 0d3FF0000000000000;
+	selp.b32	%r44, 2146435072, 0, %p67;
+	xor.b32  	%r45, %r44, 2146435072;
+	setp.lt.s32	%p68, %r3, 0;
+	selp.b32	%r46, %r45, %r44, %p68;
+	setp.eq.f64	%p69, %fd68, 0dBFF0000000000000;
+	selp.b32	%r47, 1072693248, %r46, %p69;
+	mov.u32 	%r48, 0;
+	mov.b64 	%fd95, {%r48, %r47};
+	bra.uni 	BB11_62;
+
+BB11_116:
+	setp.gt.s32	%p122, %r4, -1;
+	@%p122 bra 	BB11_119;
 
 	cvt.rzi.f64.f64	%fd89, %fd68;
-	setp.neu.f64	%p117, %fd89, %fd68;
-	selp.f64	%fd106, 0dFFF8000000000000, %fd106, %p117;
+	setp.neu.f64	%p123, %fd89, %fd68;
+	selp.f64	%fd104, 0dFFF8000000000000, %fd104, %p123;
 
-BB10_125:
-	mov.f64 	%fd57, %fd106;
+BB11_119:
+	mov.f64 	%fd57, %fd104;
 	add.f64 	%fd58, %fd1, %fd68;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r71}, %fd58;
+	mov.b64 	{%temp, %r70}, %fd58;
 	}
-	and.b32  	%r72, %r71, 2146435072;
-	setp.ne.s32	%p120, %r72, 2146435072;
-	mov.f64 	%fd105, %fd57;
-	@%p120 bra 	BB10_134;
+	and.b32  	%r71, %r70, 2146435072;
+	setp.ne.s32	%p126, %r71, 2146435072;
+	mov.f64 	%fd103, %fd57;
+	@%p126 bra 	BB11_126;
 
-	setp.gtu.f64	%p121, %fd51, 0d7FF0000000000000;
-	mov.f64 	%fd105, %fd58;
-	@%p121 bra 	BB10_134;
+	setp.gtu.f64	%p127, %fd51, 0d7FF0000000000000;
+	mov.f64 	%fd103, %fd58;
+	@%p127 bra 	BB11_126;
 
 	abs.f64 	%fd90, %fd68;
-	setp.gtu.f64	%p122, %fd90, 0d7FF0000000000000;
-	mov.f64 	%fd104, %fd58;
-	mov.f64 	%fd105, %fd104;
-	@%p122 bra 	BB10_134;
+	setp.gtu.f64	%p128, %fd90, 0d7FF0000000000000;
+	mov.f64 	%fd102, %fd58;
+	mov.f64 	%fd103, %fd102;
+	@%p128 bra 	BB11_126;
 
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r72, %temp}, %fd68;
+	}
 	and.b32  	%r73, %r5, 2147483647;
-	setp.ne.s32	%p123, %r73, 2146435072;
-	@%p123 bra 	BB10_130;
-
+	setp.eq.s32	%p129, %r73, 2146435072;
+	setp.eq.s32	%p130, %r72, 0;
+	and.pred  	%p131, %p129, %p130;
+	@%p131 bra 	BB11_125;
+	bra.uni 	BB11_123;
+
+BB11_125:
+	setp.gt.f64	%p135, %fd51, 0d3FF0000000000000;
+	selp.b32	%r81, 2146435072, 0, %p135;
+	xor.b32  	%r82, %r81, 2146435072;
+	setp.lt.s32	%p136, %r5, 0;
+	selp.b32	%r83, %r82, %r81, %p136;
+	setp.eq.f64	%p137, %fd1, 0dBFF0000000000000;
+	selp.b32	%r84, 1072693248, %r83, %p137;
+	mov.u32 	%r85, 0;
+	mov.b64 	%fd103, {%r85, %r84};
+	bra.uni 	BB11_126;
+
+BB11_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r74, %temp}, %fd68;
+	mov.b64 	{%r37, %temp}, %fd68;
 	}
-	setp.eq.s32	%p124, %r74, 0;
-	@%p124 bra 	BB10_133;
+	and.b32  	%r38, %r2, 2147483647;
+	setp.eq.s32	%p64, %r38, 2146435072;
+	setp.eq.s32	%p65, %r37, 0;
+	and.pred  	%p66, %p64, %p65;
+	mov.f64 	%fd95, %fd24;
+	@!%p66 bra 	BB11_62;
+	bra.uni 	BB11_60;
+
+BB11_60:
+	shr.s32 	%r39, %r3, 31;
+	and.b32  	%r40, %r39, -2146435072;
+	selp.b32	%r41, -1048576, 2146435072, %p1;
+	add.s32 	%r42, %r41, %r40;
+	mov.u32 	%r43, 0;
+	mov.b64 	%fd95, {%r43, %r42};
+
+BB11_62:
+	setp.eq.f64	%p70, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd68, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd98, 0d3FF0000000000000, %fd95, %p72;
 
-BB10_130:
-	and.b32  	%r75, %r4, 2147483647;
-	setp.ne.s32	%p125, %r75, 2146435072;
-	mov.f64 	%fd102, %fd57;
-	mov.f64 	%fd105, %fd102;
-	@%p125 bra 	BB10_134;
+BB11_65:
+	st.global.f64 	[%rd1], %fd98;
+	bra.uni 	BB11_130;
 
+BB11_123:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r76, %temp}, %fd1;
+	mov.b64 	{%r74, %temp}, %fd1;
 	}
-	setp.ne.s32	%p126, %r76, 0;
-	mov.f64 	%fd105, %fd57;
-	@%p126 bra 	BB10_134;
-
-	shr.s32 	%r77, %r5, 31;
-	and.b32  	%r78, %r77, -2146435072;
-	add.s32 	%r79, %r78, 2146435072;
-	or.b32  	%r80, %r79, -2147483648;
-	selp.b32	%r81, %r80, %r79, %p2;
-	mov.u32 	%r82, 0;
-	mov.b64 	%fd105, {%r82, %r81};
-	bra.uni 	BB10_134;
-
-BB10_65:
-	setp.gt.f64	%p63, %fd18, 0d3FF0000000000000;
-	selp.b32	%r45, 2146435072, 0, %p63;
-	xor.b32  	%r46, %r45, 2146435072;
-	setp.lt.s32	%p64, %r3, 0;
-	selp.b32	%r47, %r46, %r45, %p64;
-	setp.eq.f64	%p65, %fd68, 0dBFF0000000000000;
-	selp.b32	%r48, 1072693248, %r47, %p65;
-	mov.u32 	%r49, 0;
-	mov.b64 	%fd96, {%r49, %r48};
-
-BB10_66:
-	setp.eq.f64	%p66, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p67, %fd68, 0d3FF0000000000000;
-	or.pred  	%p68, %p67, %p66;
-	selp.f64	%fd99, 0d3FF0000000000000, %fd96, %p68;
-
-BB10_69:
-	st.global.f64 	[%rd1], %fd99;
-	bra.uni 	BB10_138;
-
-BB10_133:
-	setp.gt.f64	%p127, %fd51, 0d3FF0000000000000;
-	selp.b32	%r83, 2146435072, 0, %p127;
-	xor.b32  	%r84, %r83, 2146435072;
-	setp.lt.s32	%p128, %r5, 0;
-	selp.b32	%r85, %r84, %r83, %p128;
-	setp.eq.f64	%p129, %fd1, 0dBFF0000000000000;
-	selp.b32	%r86, 1072693248, %r85, %p129;
-	mov.u32 	%r87, 0;
-	mov.b64 	%fd105, {%r87, %r86};
-
-BB10_134:
-	setp.eq.f64	%p130, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p131, %fd1, 0d3FF0000000000000;
-	or.pred  	%p132, %p131, %p130;
-	selp.f64	%fd108, 0d3FF0000000000000, %fd105, %p132;
-
-BB10_137:
-	st.global.f64 	[%rd1], %fd108;
-
-BB10_138:
+	and.b32  	%r75, %r4, 2147483647;
+	setp.eq.s32	%p132, %r75, 2146435072;
+	setp.eq.s32	%p133, %r74, 0;
+	and.pred  	%p134, %p132, %p133;
+	mov.f64 	%fd103, %fd57;
+	@!%p134 bra 	BB11_126;
+	bra.uni 	BB11_124;
+
+BB11_124:
+	shr.s32 	%r76, %r5, 31;
+	and.b32  	%r77, %r76, -2146435072;
+	selp.b32	%r78, -1048576, 2146435072, %p2;
+	add.s32 	%r79, %r78, %r77;
+	mov.u32 	%r80, 0;
+	mov.b64 	%fd103, {%r80, %r79};
+
+BB11_126:
+	setp.eq.f64	%p138, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p139, %fd1, 0d3FF0000000000000;
+	or.pred  	%p140, %p139, %p138;
+	selp.f64	%fd106, 0d3FF0000000000000, %fd103, %p140;
+
+BB11_129:
+	st.global.f64 	[%rd1], %fd106;
+
+BB11_130:
 	bar.sync 	0;
 	ret;
 }
@@ -1911,14 +1968,14 @@ BB10_138:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB11_2;
+	@%p1 bra 	BB12_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB11_2:
+BB12_2:
 	ret;
 }
 
@@ -1958,10 +2015,10 @@ BB11_2:
 	setp.lt.s32	%p1, %r1, %r7;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB12_2;
-	bra.uni 	BB12_1;
+	@!%p3 bra 	BB13_2;
+	bra.uni 	BB13_1;
 
-BB12_1:
+BB13_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r13, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r13, 8;
@@ -1972,14 +2029,14 @@ BB12_1:
 	add.s64 	%rd9, %rd1, %rd8;
 	st.global.f64 	[%rd9], %fd1;
 
-BB12_2:
+BB13_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB12_4;
-	bra.uni 	BB12_3;
+	@!%p6 bra 	BB13_4;
+	bra.uni 	BB13_3;
 
-BB12_3:
+BB13_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	mad.lo.s32 	%r15, %r1, %r6, %r2;
 	mul.wide.s32 	%rd11, %r15, 8;
@@ -1991,7 +2048,7 @@ BB12_3:
 	add.s64 	%rd14, %rd1, %rd13;
 	st.global.f64 	[%rd14], %fd2;
 
-BB12_4:
+BB13_4:
 	ret;
 }
 
@@ -2030,10 +2087,10 @@ BB12_4:
 	setp.lt.s32	%p1, %r1, %r3;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB13_2;
-	bra.uni 	BB13_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB13_1:
+BB14_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r12, 8;
@@ -2042,14 +2099,14 @@ BB13_1:
 	add.s64 	%rd8, %rd1, %rd6;
 	st.global.f64 	[%rd8], %fd1;
 
-BB13_2:
+BB14_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB13_4;
-	bra.uni 	BB13_3;
+	@!%p6 bra 	BB14_4;
+	bra.uni 	BB14_3;
 
-BB13_3:
+BB14_3:
 	cvta.to.global.u64 	%rd9, %rd3;
 	mad.lo.s32 	%r13, %r1, %r6, %r2;
 	mul.wide.s32 	%rd10, %r13, 8;
@@ -2061,7 +2118,7 @@ BB13_3:
 	add.s64 	%rd13, %rd1, %rd12;
 	st.global.f64 	[%rd13], %fd2;
 
-BB13_4:
+BB14_4:
 	ret;
 }
 
@@ -2089,9 +2146,9 @@ BB13_4:
 	mov.f64 	%fd76, 0d0000000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB14_4;
+	@%p1 bra 	BB15_4;
 
-BB14_1:
+BB15_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2100,23 +2157,23 @@ BB14_1:
 	add.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB14_3;
+	@%p2 bra 	BB15_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	add.f64 	%fd78, %fd78, %fd31;
 
-BB14_3:
+BB15_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB14_1;
+	@%p3 bra 	BB15_1;
 
-BB14_4:
+BB15_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2124,130 +2181,130 @@ BB14_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB14_8;
+	@%p4 bra 	BB15_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB14_7;
+	@%p5 bra 	BB15_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB14_7:
+BB15_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB14_8:
+BB15_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB14_12;
+	@%p6 bra 	BB15_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB14_11;
+	@%p7 bra 	BB15_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB14_11:
+BB15_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB14_12:
+BB15_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB14_16;
+	@%p8 bra 	BB15_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB14_15;
+	@%p9 bra 	BB15_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB14_15:
+BB15_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB14_16:
+BB15_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB14_20;
+	@%p10 bra 	BB15_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB14_19;
+	@%p11 bra 	BB15_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB14_19:
+BB15_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB14_20:
+BB15_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB14_33;
+	@%p12 bra 	BB15_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB14_23;
+	@%p13 bra 	BB15_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB14_23:
+BB15_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB14_25;
+	@%p14 bra 	BB15_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB14_25:
+BB15_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB14_27;
+	@%p15 bra 	BB15_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB14_27:
+BB15_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB14_29;
+	@%p16 bra 	BB15_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB14_29:
+BB15_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB14_31;
+	@%p17 bra 	BB15_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB14_31:
+BB15_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB14_33;
+	@%p18 bra 	BB15_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	add.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB14_33:
+BB15_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB14_35;
+	@%p19 bra 	BB15_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2255,7 +2312,7 @@ BB14_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB14_35:
+BB15_35:
 	ret;
 }
 
@@ -2279,17 +2336,17 @@ BB14_35:
 	ld.param.u32 	%r4, [reduce_row_sum_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB15_35;
+	@%p1 bra 	BB16_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d0000000000000000;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB15_4;
+	@%p2 bra 	BB16_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB15_3:
+BB16_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2299,9 +2356,9 @@ BB15_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB15_3;
+	@%p3 bra 	BB16_3;
 
-BB15_4:
+BB16_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2311,130 +2368,130 @@ BB15_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB15_8;
+	@%p4 bra 	BB16_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB15_7;
+	@%p5 bra 	BB16_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB15_7:
+BB16_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB15_8:
+BB16_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB15_12;
+	@%p6 bra 	BB16_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB15_11;
+	@%p7 bra 	BB16_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB15_11:
+BB16_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB15_12:
+BB16_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB15_16;
+	@%p8 bra 	BB16_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB15_15;
+	@%p9 bra 	BB16_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB15_15:
+BB16_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB15_16:
+BB16_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB15_20;
+	@%p10 bra 	BB16_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB15_19;
+	@%p11 bra 	BB16_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB15_19:
+BB16_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB15_20:
+BB16_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB15_33;
+	@%p12 bra 	BB16_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB15_23;
+	@%p13 bra 	BB16_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB15_23:
+BB16_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB15_25;
+	@%p14 bra 	BB16_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB15_25:
+BB16_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB15_27;
+	@%p15 bra 	BB16_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB15_27:
+BB16_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB15_29;
+	@%p16 bra 	BB16_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB15_29:
+BB16_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB15_31;
+	@%p17 bra 	BB16_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB15_31:
+BB16_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB15_33;
+	@%p18 bra 	BB16_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB15_33:
+BB16_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB15_35;
+	@%p19 bra 	BB16_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2442,7 +2499,7 @@ BB15_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB15_35:
+BB16_35:
 	ret;
 }
 
@@ -2469,18 +2526,18 @@ BB15_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB16_5;
+	@%p1 bra 	BB17_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB16_4;
+	@%p2 bra 	BB17_4;
 
 	mov.u32 	%r10, %r1;
 
-BB16_3:
+BB17_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2490,15 +2547,15 @@ BB16_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB16_3;
+	@%p3 bra 	BB17_3;
 
-BB16_4:
+BB17_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB16_5:
+BB17_5:
 	ret;
 }
 
@@ -2526,9 +2583,9 @@ BB16_5:
 	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB17_4;
+	@%p1 bra 	BB18_4;
 
-BB17_1:
+BB18_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2537,23 +2594,23 @@ BB17_1:
 	max.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB17_3;
+	@%p2 bra 	BB18_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	max.f64 	%fd78, %fd78, %fd31;
 
-BB17_3:
+BB18_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB17_1;
+	@%p3 bra 	BB18_1;
 
-BB17_4:
+BB18_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2561,130 +2618,130 @@ BB17_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB17_8;
+	@%p4 bra 	BB18_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB17_7;
+	@%p5 bra 	BB18_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB17_7:
+BB18_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB17_8:
+BB18_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB17_12;
+	@%p6 bra 	BB18_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB17_11;
+	@%p7 bra 	BB18_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB17_11:
+BB18_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB17_12:
+BB18_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB17_16;
+	@%p8 bra 	BB18_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB17_15;
+	@%p9 bra 	BB18_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB17_15:
+BB18_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB17_16:
+BB18_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB17_20;
+	@%p10 bra 	BB18_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB17_19;
+	@%p11 bra 	BB18_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB17_19:
+BB18_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB17_20:
+BB18_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB17_33;
+	@%p12 bra 	BB18_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB17_23;
+	@%p13 bra 	BB18_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB17_23:
+BB18_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB17_25;
+	@%p14 bra 	BB18_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB17_25:
+BB18_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB17_27;
+	@%p15 bra 	BB18_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB17_27:
+BB18_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB17_29;
+	@%p16 bra 	BB18_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB17_29:
+BB18_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB17_31;
+	@%p17 bra 	BB18_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB17_31:
+BB18_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB17_33;
+	@%p18 bra 	BB18_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	max.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB17_33:
+BB18_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB17_35;
+	@%p19 bra 	BB18_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2692,7 +2749,7 @@ BB17_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB17_35:
+BB18_35:
 	ret;
 }
 
@@ -2716,17 +2773,17 @@ BB17_35:
 	ld.param.u32 	%r4, [reduce_row_max_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB18_35;
+	@%p1 bra 	BB19_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB18_4;
+	@%p2 bra 	BB19_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB18_3:
+BB19_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2736,9 +2793,9 @@ BB18_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB18_3;
+	@%p3 bra 	BB19_3;
 
-BB18_4:
+BB19_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2748,130 +2805,130 @@ BB18_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB18_8;
+	@%p4 bra 	BB19_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB18_7;
+	@%p5 bra 	BB19_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	max.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB18_7:
+BB19_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB18_8:
+BB19_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB18_12;
+	@%p6 bra 	BB19_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB18_11;
+	@%p7 bra 	BB19_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	max.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB18_11:
+BB19_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB18_12:
+BB19_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB18_16;
+	@%p8 bra 	BB19_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB18_15;
+	@%p9 bra 	BB19_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	max.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB18_15:
+BB19_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB18_16:
+BB19_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB18_20;
+	@%p10 bra 	BB19_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB18_19;
+	@%p11 bra 	BB19_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	max.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB18_19:
+BB19_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB18_20:
+BB19_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB18_33;
+	@%p12 bra 	BB19_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB18_23;
+	@%p13 bra 	BB19_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	max.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB18_23:
+BB19_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB18_25;
+	@%p14 bra 	BB19_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	max.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB18_25:
+BB19_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB18_27;
+	@%p15 bra 	BB19_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	max.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB18_27:
+BB19_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB18_29;
+	@%p16 bra 	BB19_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	max.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB18_29:
+BB19_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB18_31;
+	@%p17 bra 	BB19_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	max.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB18_31:
+BB19_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB18_33;
+	@%p18 bra 	BB19_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	max.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB18_33:
+BB19_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB18_35;
+	@%p19 bra 	BB19_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2879,7 +2936,7 @@ BB18_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB18_35:
+BB19_35:
 	ret;
 }
 
@@ -2906,18 +2963,18 @@ BB18_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB19_5;
+	@%p1 bra 	BB20_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB19_4;
+	@%p2 bra 	BB20_4;
 
 	mov.u32 	%r10, %r1;
 
-BB19_3:
+BB20_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2927,15 +2984,15 @@ BB19_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB19_3;
+	@%p3 bra 	BB20_3;
 
-BB19_4:
+BB20_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB19_5:
+BB20_5:
 	ret;
 }
 
@@ -2963,9 +3020,9 @@ BB19_5:
 	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB20_4;
+	@%p1 bra 	BB21_4;
 
-BB20_1:
+BB21_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2974,23 +3031,23 @@ BB20_1:
 	min.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB20_3;
+	@%p2 bra 	BB21_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	min.f64 	%fd78, %fd78, %fd31;
 
-BB20_3:
+BB21_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB20_1;
+	@%p3 bra 	BB21_1;
 
-BB20_4:
+BB21_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2998,130 +3055,130 @@ BB20_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB20_8;
+	@%p4 bra 	BB21_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB20_7;
+	@%p5 bra 	BB21_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	min.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB20_7:
+BB21_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB20_8:
+BB21_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB20_12;
+	@%p6 bra 	BB21_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB20_11;
+	@%p7 bra 	BB21_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	min.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB20_11:
+BB21_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB20_12:
+BB21_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB20_16;
+	@%p8 bra 	BB21_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB20_15;
+	@%p9 bra 	BB21_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	min.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB20_15:
+BB21_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB20_16:
+BB21_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB20_20;
+	@%p10 bra 	BB21_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB20_19;
+	@%p11 bra 	BB21_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	min.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB20_19:
+BB21_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB20_20:
+BB21_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB20_33;
+	@%p12 bra 	BB21_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB20_23;
+	@%p13 bra 	BB21_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	min.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB20_23:
+BB21_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB20_25;
+	@%p14 bra 	BB21_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	min.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB20_25:
+BB21_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB20_27;
+	@%p15 bra 	BB21_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	min.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB20_27:
+BB21_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB20_29;
+	@%p16 bra 	BB21_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	min.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB20_29:
+BB21_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB20_31;
+	@%p17 bra 	BB21_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	min.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB20_31:
+BB21_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB20_33;
+	@%p18 bra 	BB21_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	min.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB20_33:
+BB21_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB20_35;
+	@%p19 bra 	BB21_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3129,7 +3186,7 @@ BB20_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB20_35:
+BB21_35:
 	ret;
 }
 
@@ -3153,17 +3210,17 @@ BB20_35:
 	ld.param.u32 	%r4, [reduce_row_min_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB21_35;
+	@%p1 bra 	BB22_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB21_4;
+	@%p2 bra 	BB22_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB21_3:
+BB22_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3173,9 +3230,9 @@ BB21_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB21_3;
+	@%p3 bra 	BB22_3;
 
-BB21_4:
+BB22_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3185,130 +3242,130 @@ BB21_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB21_8;
+	@%p4 bra 	BB22_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB21_7;
+	@%p5 bra 	BB22_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	min.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB21_7:
+BB22_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB21_8:
+BB22_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB21_12;
+	@%p6 bra 	BB22_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB21_11;
+	@%p7 bra 	BB22_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	min.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB21_11:
+BB22_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB21_12:
+BB22_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB21_16;
+	@%p8 bra 	BB22_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB21_15;
+	@%p9 bra 	BB22_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	min.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB21_15:
+BB22_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB21_16:
+BB22_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB21_20;
+	@%p10 bra 	BB22_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB21_19;
+	@%p11 bra 	BB22_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	min.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB21_19:
+BB22_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB21_20:
+BB22_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB21_33;
+	@%p12 bra 	BB22_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB21_23;
+	@%p13 bra 	BB22_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	min.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB21_23:
+BB22_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB21_25;
+	@%p14 bra 	BB22_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	min.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB21_25:
+BB22_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB21_27;
+	@%p15 bra 	BB22_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	min.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB21_27:
+BB22_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB21_29;
+	@%p16 bra 	BB22_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	min.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB21_29:
+BB22_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB21_31;
+	@%p17 bra 	BB22_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	min.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB21_31:
+BB22_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB21_33;
+	@%p18 bra 	BB22_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	min.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB21_33:
+BB22_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB21_35;
+	@%p19 bra 	BB22_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -3316,7 +3373,7 @@ BB21_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB21_35:
+BB22_35:
 	ret;
 }
 
@@ -3343,18 +3400,18 @@ BB21_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB22_5;
+	@%p1 bra 	BB23_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB22_4;
+	@%p2 bra 	BB23_4;
 
 	mov.u32 	%r10, %r1;
 
-BB22_3:
+BB23_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3364,15 +3421,15 @@ BB22_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB22_3;
+	@%p3 bra 	BB23_3;
 
-BB22_4:
+BB23_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB22_5:
+BB23_5:
 	ret;
 }
 
@@ -3400,9 +3457,9 @@ BB22_5:
 	mov.f64 	%fd76, 0d3FF0000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB23_4;
+	@%p1 bra 	BB24_4;
 
-BB23_1:
+BB24_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3411,23 +3468,23 @@ BB23_1:
 	mul.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB23_3;
+	@%p2 bra 	BB24_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	mul.f64 	%fd78, %fd78, %fd31;
 
-BB23_3:
+BB24_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB23_1;
+	@%p3 bra 	BB24_1;
 
-BB23_4:
+BB24_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3435,130 +3492,130 @@ BB23_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB23_8;
+	@%p4 bra 	BB24_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB23_7;
+	@%p5 bra 	BB24_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	mul.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB23_7:
+BB24_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB23_8:
+BB24_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB23_12;
+	@%p6 bra 	BB24_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB23_11;
+	@%p7 bra 	BB24_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	mul.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB23_11:
+BB24_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB23_12:
+BB24_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB23_16;
+	@%p8 bra 	BB24_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB23_15;
+	@%p9 bra 	BB24_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	mul.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB23_15:
+BB24_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB23_16:
+BB24_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB23_20;
+	@%p10 bra 	BB24_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB23_19;
+	@%p11 bra 	BB24_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	mul.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB23_19:
+BB24_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB23_20:
+BB24_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB23_33;
+	@%p12 bra 	BB24_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB23_23;
+	@%p13 bra 	BB24_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	mul.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB23_23:
+BB24_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB23_25;
+	@%p14 bra 	BB24_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	mul.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB23_25:
+BB24_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB23_27;
+	@%p15 bra 	BB24_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	mul.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB23_27:
+BB24_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB23_29;
+	@%p16 bra 	BB24_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	mul.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB23_29:
+BB24_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB23_31;
+	@%p17 bra 	BB24_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	mul.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB23_31:
+BB24_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB23_33;
+	@%p18 bra 	BB24_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	mul.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB23_33:
+BB24_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB23_35;
+	@%p19 bra 	BB24_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.t

<TRUNCATED>