You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/02/08 19:15:32 UTC

[2/3] incubator-systemml git commit: [SYSTEMML-1039] Added variance, row/col variance

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index dfff5dd..93f3879 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-	// .globl	getBoolean
+	// .globl	_Z6reduceI5SumOpEvPdS1_jT_d
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
 	.param .b64 __internal_accurate_pow_param_0,
@@ -19,307 +19,6 @@
 ;
 .extern .shared .align 8 .b8 sdata[];
 
-.visible .func  (.param .b64 func_retval0) getBoolean(
-	.param .b32 getBoolean_param_0
-)
-{
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<2>;
-	.reg .f64 	%fd<2>;
-
-
-	ld.param.u32 	%r1, [getBoolean_param_0];
-	setp.eq.s32	%p1, %r1, 0;
-	selp.f64	%fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
-	st.param.f64	[func_retval0+0], %fd1;
-	ret;
-}
-
-	// .globl	binaryOp
-.visible .func  (.param .b64 func_retval0) binaryOp(
-	.param .b64 binaryOp_param_0,
-	.param .b64 binaryOp_param_1,
-	.param .b32 binaryOp_param_2
-)
-{
-	.reg .pred 	%p<41>;
-	.reg .b32 	%r<30>;
-	.reg .f64 	%fd<40>;
-	.reg .b64 	%rd<3>;
-
-
-	ld.param.f64 	%fd26, [binaryOp_param_0];
-	ld.param.f64 	%fd27, [binaryOp_param_1];
-	ld.param.u32 	%r3, [binaryOp_param_2];
-	setp.eq.s32	%p2, %r3, 0;
-	@%p2 bra 	BB1_40;
-
-	setp.eq.s32	%p3, %r3, 1;
-	@%p3 bra 	BB1_39;
-	bra.uni 	BB1_2;
-
-BB1_39:
-	sub.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_40:
-	add.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_2:
-	setp.eq.s32	%p4, %r3, 2;
-	@%p4 bra 	BB1_38;
-	bra.uni 	BB1_3;
-
-BB1_38:
-	mul.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_3:
-	setp.eq.s32	%p5, %r3, 3;
-	@%p5 bra 	BB1_37;
-	bra.uni 	BB1_4;
-
-BB1_37:
-	div.rn.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_4:
-	setp.eq.s32	%p6, %r3, 4;
-	@%p6 bra 	BB1_21;
-	bra.uni 	BB1_5;
-
-BB1_21:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r1}, %fd26;
-	}
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r2}, %fd27;
-	}
-	bfe.u32 	%r4, %r2, 20, 11;
-	add.s32 	%r5, %r4, -1012;
-	mov.b64 	 %rd2, %fd27;
-	shl.b64 	%rd1, %rd2, %r5;
-	setp.eq.s64	%p21, %rd1, -9223372036854775808;
-	abs.f64 	%fd9, %fd26;
-	// Callseq Start 0
-	{
-	.reg .b32 temp_param_reg;
-	// <end>}
-	.param .b64 param0;
-	st.param.f64	[param0+0], %fd9;
-	.param .b64 param1;
-	st.param.f64	[param1+0], %fd27;
-	.param .b64 retval0;
-	call.uni (retval0), 
-	__internal_accurate_pow, 
-	(
-	param0, 
-	param1
-	);
-	ld.param.f64	%fd38, [retval0+0];
-	
-	//{
-	}// Callseq End 0
-	setp.lt.s32	%p22, %r1, 0;
-	and.pred  	%p1, %p22, %p21;
-	@!%p1 bra 	BB1_23;
-	bra.uni 	BB1_22;
-
-BB1_22:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r6}, %fd38;
-	}
-	xor.b32  	%r7, %r6, -2147483648;
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r8, %temp}, %fd38;
-	}
-	mov.b64 	%fd38, {%r8, %r7};
-
-BB1_23:
-	mov.f64 	%fd37, %fd38;
-	setp.eq.f64	%p23, %fd26, 0d0000000000000000;
-	@%p23 bra 	BB1_26;
-	bra.uni 	BB1_24;
-
-BB1_26:
-	selp.b32	%r9, %r1, 0, %p21;
-	or.b32  	%r10, %r9, 2146435072;
-	setp.lt.s32	%p27, %r2, 0;
-	selp.b32	%r11, %r10, %r9, %p27;
-	mov.u32 	%r12, 0;
-	mov.b64 	%fd37, {%r12, %r11};
-	bra.uni 	BB1_27;
-
-BB1_5:
-	setp.eq.s32	%p7, %r3, 5;
-	@%p7 bra 	BB1_20;
-	bra.uni 	BB1_6;
-
-BB1_20:
-	setp.lt.f64	%p20, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p20;
-	bra.uni 	BB1_41;
-
-BB1_6:
-	setp.eq.s32	%p8, %r3, 6;
-	@%p8 bra 	BB1_19;
-	bra.uni 	BB1_7;
-
-BB1_19:
-	setp.le.f64	%p19, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p19;
-	bra.uni 	BB1_41;
-
-BB1_24:
-	setp.gt.s32	%p24, %r1, -1;
-	@%p24 bra 	BB1_27;
-
-	cvt.rzi.f64.f64	%fd29, %fd27;
-	setp.neu.f64	%p25, %fd29, %fd27;
-	selp.f64	%fd37, 0dFFF8000000000000, %fd37, %p25;
-
-BB1_27:
-	mov.f64 	%fd15, %fd37;
-	add.f64 	%fd16, %fd26, %fd27;
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r13}, %fd16;
-	}
-	and.b32  	%r14, %r13, 2146435072;
-	setp.ne.s32	%p28, %r14, 2146435072;
-	mov.f64 	%fd36, %fd15;
-	@%p28 bra 	BB1_36;
-
-	setp.gtu.f64	%p29, %fd9, 0d7FF0000000000000;
-	mov.f64 	%fd36, %fd16;
-	@%p29 bra 	BB1_36;
-
-	abs.f64 	%fd30, %fd27;
-	setp.gtu.f64	%p30, %fd30, 0d7FF0000000000000;
-	mov.f64 	%fd35, %fd16;
-	mov.f64 	%fd36, %fd35;
-	@%p30 bra 	BB1_36;
-
-	and.b32  	%r15, %r2, 2147483647;
-	setp.ne.s32	%p31, %r15, 2146435072;
-	@%p31 bra 	BB1_32;
-
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r16, %temp}, %fd27;
-	}
-	setp.eq.s32	%p32, %r16, 0;
-	@%p32 bra 	BB1_35;
-
-BB1_32:
-	and.b32  	%r17, %r1, 2147483647;
-	setp.ne.s32	%p33, %r17, 2146435072;
-	mov.f64 	%fd33, %fd15;
-	mov.f64 	%fd36, %fd33;
-	@%p33 bra 	BB1_36;
-
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r18, %temp}, %fd26;
-	}
-	setp.ne.s32	%p34, %r18, 0;
-	mov.f64 	%fd36, %fd15;
-	@%p34 bra 	BB1_36;
-
-	shr.s32 	%r19, %r2, 31;
-	and.b32  	%r20, %r19, -2146435072;
-	add.s32 	%r21, %r20, 2146435072;
-	or.b32  	%r22, %r21, -2147483648;
-	selp.b32	%r23, %r22, %r21, %p1;
-	mov.u32 	%r24, 0;
-	mov.b64 	%fd36, {%r24, %r23};
-	bra.uni 	BB1_36;
-
-BB1_7:
-	setp.eq.s32	%p9, %r3, 7;
-	@%p9 bra 	BB1_18;
-	bra.uni 	BB1_8;
-
-BB1_18:
-	setp.gt.f64	%p18, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p18;
-	bra.uni 	BB1_41;
-
-BB1_8:
-	setp.eq.s32	%p10, %r3, 8;
-	@%p10 bra 	BB1_17;
-	bra.uni 	BB1_9;
-
-BB1_17:
-	setp.ge.f64	%p17, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p17;
-	bra.uni 	BB1_41;
-
-BB1_9:
-	setp.eq.s32	%p11, %r3, 9;
-	@%p11 bra 	BB1_16;
-	bra.uni 	BB1_10;
-
-BB1_16:
-	setp.eq.f64	%p16, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p16;
-	bra.uni 	BB1_41;
-
-BB1_10:
-	setp.eq.s32	%p12, %r3, 10;
-	@%p12 bra 	BB1_15;
-	bra.uni 	BB1_11;
-
-BB1_15:
-	setp.neu.f64	%p15, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p15;
-	bra.uni 	BB1_41;
-
-BB1_35:
-	setp.gt.f64	%p35, %fd9, 0d3FF0000000000000;
-	selp.b32	%r25, 2146435072, 0, %p35;
-	xor.b32  	%r26, %r25, 2146435072;
-	setp.lt.s32	%p36, %r2, 0;
-	selp.b32	%r27, %r26, %r25, %p36;
-	setp.eq.f64	%p37, %fd26, 0dBFF0000000000000;
-	selp.b32	%r28, 1072693248, %r27, %p37;
-	mov.u32 	%r29, 0;
-	mov.b64 	%fd36, {%r29, %r28};
-
-BB1_36:
-	setp.eq.f64	%p38, %fd27, 0d0000000000000000;
-	setp.eq.f64	%p39, %fd26, 0d3FF0000000000000;
-	or.pred  	%p40, %p39, %p38;
-	selp.f64	%fd39, 0d3FF0000000000000, %fd36, %p40;
-
-BB1_41:
-	st.param.f64	[func_retval0+0], %fd39;
-	ret;
-
-BB1_11:
-	setp.eq.s32	%p13, %r3, 11;
-	@%p13 bra 	BB1_14;
-	bra.uni 	BB1_12;
-
-BB1_14:
-	min.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_12:
-	mov.f64 	%fd39, 0dC08F380000000000;
-	setp.ne.s32	%p14, %r3, 12;
-	@%p14 bra 	BB1_41;
-
-	max.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-}
-
-	// .globl	_Z6reduceI5SumOpEvPdS1_jT_d
 .visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
 	.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0,
 	.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1,
@@ -344,11 +43,11 @@ BB1_12:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB2_5;
+	@%p1 bra 	BB0_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB2_2:
+BB0_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -356,23 +55,23 @@ BB2_2:
 	add.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB2_4;
+	@%p2 bra 	BB0_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	add.f64 	%fd78, %fd78, %fd30;
 
-BB2_4:
+BB0_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB2_2;
+	@%p3 bra 	BB0_2;
 
-BB2_5:
+BB0_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -380,137 +79,137 @@ BB2_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB2_9;
+	@%p4 bra 	BB0_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB2_8;
+	@%p5 bra 	BB0_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB2_8:
+BB0_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB2_9:
+BB0_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB2_13;
+	@%p6 bra 	BB0_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB2_12;
+	@%p7 bra 	BB0_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB2_12:
+BB0_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB2_13:
+BB0_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB2_17;
+	@%p8 bra 	BB0_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB2_16;
+	@%p9 bra 	BB0_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB2_16:
+BB0_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB2_17:
+BB0_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB2_21;
+	@%p10 bra 	BB0_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB2_20;
+	@%p11 bra 	BB0_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB2_20:
+BB0_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB2_21:
+BB0_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB2_34;
+	@%p12 bra 	BB0_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB2_24;
+	@%p13 bra 	BB0_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB2_24:
+BB0_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB2_26;
+	@%p14 bra 	BB0_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB2_26:
+BB0_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB2_28;
+	@%p15 bra 	BB0_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB2_28:
+BB0_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB2_30;
+	@%p16 bra 	BB0_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB2_30:
+BB0_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB2_32;
+	@%p17 bra 	BB0_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB2_32:
+BB0_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB2_34;
+	@%p18 bra 	BB0_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	add.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB2_34:
+BB0_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB2_36;
+	@%p19 bra 	BB0_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB2_36:
+BB0_36:
 	ret;
 }
 
@@ -538,14 +237,14 @@ BB2_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB3_34;
+	@%p1 bra 	BB1_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB3_3;
+	@%p2 bra 	BB1_3;
 
-BB3_2:
+BB1_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -554,9 +253,9 @@ BB3_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB3_2;
+	@%p3 bra 	BB1_2;
 
-BB3_3:
+BB1_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -565,121 +264,121 @@ BB3_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB3_7;
+	@%p4 bra 	BB1_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB3_6;
+	@%p5 bra 	BB1_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	add.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_6:
+BB1_6:
 	bar.sync 	0;
 
-BB3_7:
+BB1_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB3_11;
+	@%p6 bra 	BB1_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB3_10;
+	@%p7 bra 	BB1_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	add.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_10:
+BB1_10:
 	bar.sync 	0;
 
-BB3_11:
+BB1_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB3_15;
+	@%p8 bra 	BB1_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB3_14;
+	@%p9 bra 	BB1_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	add.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_14:
+BB1_14:
 	bar.sync 	0;
 
-BB3_15:
+BB1_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB3_19;
+	@%p10 bra 	BB1_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB3_18;
+	@%p11 bra 	BB1_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	add.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_18:
+BB1_18:
 	bar.sync 	0;
 
-BB3_19:
+BB1_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB3_32;
+	@%p12 bra 	BB1_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB3_22;
+	@%p13 bra 	BB1_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	add.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_22:
+BB1_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB3_24;
+	@%p14 bra 	BB1_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	add.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_24:
+BB1_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB3_26;
+	@%p15 bra 	BB1_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	add.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_26:
+BB1_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB3_28;
+	@%p16 bra 	BB1_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	add.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_28:
+BB1_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB3_30;
+	@%p17 bra 	BB1_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	add.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_30:
+BB1_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB3_32;
+	@%p18 bra 	BB1_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	add.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB3_32:
+BB1_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB3_34;
+	@%p19 bra 	BB1_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB3_34:
+BB1_34:
 	ret;
 }
 
@@ -710,15 +409,15 @@ BB3_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB4_5;
+	@%p1 bra 	BB2_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB4_4;
+	@%p2 bra 	BB2_4;
 
 	mov.u32 	%r10, %r1;
 
-BB4_3:
+BB2_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -727,14 +426,14 @@ BB4_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB4_3;
+	@%p3 bra 	BB2_3;
 
-BB4_4:
+BB2_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB4_5:
+BB2_5:
 	ret;
 }
 
@@ -763,11 +462,11 @@ BB4_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB5_5;
+	@%p1 bra 	BB3_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB5_2:
+BB3_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -775,23 +474,23 @@ BB5_2:
 	max.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB5_4;
+	@%p2 bra 	BB3_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	max.f64 	%fd78, %fd78, %fd30;
 
-BB5_4:
+BB3_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB5_2;
+	@%p3 bra 	BB3_2;
 
-BB5_5:
+BB3_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -799,137 +498,137 @@ BB5_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB5_9;
+	@%p4 bra 	BB3_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB5_8;
+	@%p5 bra 	BB3_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB5_8:
+BB3_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB5_9:
+BB3_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB5_13;
+	@%p6 bra 	BB3_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB5_12;
+	@%p7 bra 	BB3_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB5_12:
+BB3_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB5_13:
+BB3_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB5_17;
+	@%p8 bra 	BB3_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB5_16;
+	@%p9 bra 	BB3_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB5_16:
+BB3_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB5_17:
+BB3_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB5_21;
+	@%p10 bra 	BB3_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB5_20;
+	@%p11 bra 	BB3_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB5_20:
+BB3_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB5_21:
+BB3_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB5_34;
+	@%p12 bra 	BB3_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB5_24;
+	@%p13 bra 	BB3_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB5_24:
+BB3_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB5_26;
+	@%p14 bra 	BB3_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB5_26:
+BB3_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB5_28;
+	@%p15 bra 	BB3_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB5_28:
+BB3_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB5_30;
+	@%p16 bra 	BB3_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB5_30:
+BB3_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB5_32;
+	@%p17 bra 	BB3_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB5_32:
+BB3_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB5_34;
+	@%p18 bra 	BB3_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	max.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB5_34:
+BB3_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB5_36;
+	@%p19 bra 	BB3_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB5_36:
+BB3_36:
 	ret;
 }
 
@@ -957,14 +656,14 @@ BB5_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB6_34;
+	@%p1 bra 	BB4_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB6_3;
+	@%p2 bra 	BB4_3;
 
-BB6_2:
+BB4_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -973,9 +672,9 @@ BB6_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB6_2;
+	@%p3 bra 	BB4_2;
 
-BB6_3:
+BB4_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -984,121 +683,121 @@ BB6_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB6_7;
+	@%p4 bra 	BB4_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB6_6;
+	@%p5 bra 	BB4_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	max.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_6:
+BB4_6:
 	bar.sync 	0;
 
-BB6_7:
+BB4_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB6_11;
+	@%p6 bra 	BB4_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB6_10;
+	@%p7 bra 	BB4_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	max.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_10:
+BB4_10:
 	bar.sync 	0;
 
-BB6_11:
+BB4_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB6_15;
+	@%p8 bra 	BB4_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB6_14;
+	@%p9 bra 	BB4_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	max.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_14:
+BB4_14:
 	bar.sync 	0;
 
-BB6_15:
+BB4_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB6_19;
+	@%p10 bra 	BB4_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB6_18;
+	@%p11 bra 	BB4_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	max.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_18:
+BB4_18:
 	bar.sync 	0;
 
-BB6_19:
+BB4_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB6_32;
+	@%p12 bra 	BB4_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB6_22;
+	@%p13 bra 	BB4_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	max.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_22:
+BB4_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB6_24;
+	@%p14 bra 	BB4_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	max.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_24:
+BB4_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB6_26;
+	@%p15 bra 	BB4_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	max.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_26:
+BB4_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB6_28;
+	@%p16 bra 	BB4_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	max.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_28:
+BB4_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB6_30;
+	@%p17 bra 	BB4_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	max.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_30:
+BB4_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB6_32;
+	@%p18 bra 	BB4_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	max.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB6_32:
+BB4_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB6_34;
+	@%p19 bra 	BB4_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB6_34:
+BB4_34:
 	ret;
 }
 
@@ -1129,15 +828,15 @@ BB6_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB7_5;
+	@%p1 bra 	BB5_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB7_4;
+	@%p2 bra 	BB5_4;
 
 	mov.u32 	%r10, %r1;
 
-BB7_3:
+BB5_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -1146,14 +845,14 @@ BB7_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB7_3;
+	@%p3 bra 	BB5_3;
 
-BB7_4:
+BB5_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB7_5:
+BB5_5:
 	ret;
 }
 
@@ -1182,11 +881,11 @@ BB7_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB8_5;
+	@%p1 bra 	BB6_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB8_2:
+BB6_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1194,23 +893,23 @@ BB8_2:
 	min.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB8_4;
+	@%p2 bra 	BB6_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	min.f64 	%fd78, %fd78, %fd30;
 
-BB8_4:
+BB6_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB8_2;
+	@%p3 bra 	BB6_2;
 
-BB8_5:
+BB6_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -1218,137 +917,137 @@ BB8_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB8_9;
+	@%p4 bra 	BB6_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB8_8;
+	@%p5 bra 	BB6_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	min.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB8_8:
+BB6_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB8_9:
+BB6_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB8_13;
+	@%p6 bra 	BB6_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB8_12;
+	@%p7 bra 	BB6_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	min.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB8_12:
+BB6_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB8_13:
+BB6_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB8_17;
+	@%p8 bra 	BB6_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB8_16;
+	@%p9 bra 	BB6_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	min.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB8_16:
+BB6_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB8_17:
+BB6_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB8_21;
+	@%p10 bra 	BB6_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB8_20;
+	@%p11 bra 	BB6_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	min.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB8_20:
+BB6_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB8_21:
+BB6_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB8_34;
+	@%p12 bra 	BB6_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB8_24;
+	@%p13 bra 	BB6_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	min.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB8_24:
+BB6_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB8_26;
+	@%p14 bra 	BB6_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	min.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB8_26:
+BB6_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB8_28;
+	@%p15 bra 	BB6_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	min.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB8_28:
+BB6_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB8_30;
+	@%p16 bra 	BB6_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	min.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB8_30:
+BB6_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB8_32;
+	@%p17 bra 	BB6_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	min.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB8_32:
+BB6_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB8_34;
+	@%p18 bra 	BB6_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	min.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB8_34:
+BB6_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB8_36;
+	@%p19 bra 	BB6_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB8_36:
+BB6_36:
 	ret;
 }
 
@@ -1376,14 +1075,14 @@ BB8_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB9_34;
+	@%p1 bra 	BB7_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB9_3;
+	@%p2 bra 	BB7_3;
 
-BB9_2:
+BB7_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1392,9 +1091,9 @@ BB9_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB9_2;
+	@%p3 bra 	BB7_2;
 
-BB9_3:
+BB7_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -1403,121 +1102,121 @@ BB9_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB9_7;
+	@%p4 bra 	BB7_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB9_6;
+	@%p5 bra 	BB7_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	min.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_6:
+BB7_6:
 	bar.sync 	0;
 
-BB9_7:
+BB7_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB9_11;
+	@%p6 bra 	BB7_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB9_10;
+	@%p7 bra 	BB7_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	min.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_10:
+BB7_10:
 	bar.sync 	0;
 
-BB9_11:
+BB7_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB9_15;
+	@%p8 bra 	BB7_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB9_14;
+	@%p9 bra 	BB7_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	min.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_14:
+BB7_14:
 	bar.sync 	0;
 
-BB9_15:
+BB7_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB9_19;
+	@%p10 bra 	BB7_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB9_18;
+	@%p11 bra 	BB7_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	min.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_18:
+BB7_18:
 	bar.sync 	0;
 
-BB9_19:
+BB7_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB9_32;
+	@%p12 bra 	BB7_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB9_22;
+	@%p13 bra 	BB7_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	min.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_22:
+BB7_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB9_24;
+	@%p14 bra 	BB7_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	min.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_24:
+BB7_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB9_26;
+	@%p15 bra 	BB7_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	min.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_26:
+BB7_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB9_28;
+	@%p16 bra 	BB7_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	min.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_28:
+BB7_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB9_30;
+	@%p17 bra 	BB7_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	min.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_30:
+BB7_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB9_32;
+	@%p18 bra 	BB7_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	min.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB9_32:
+BB7_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB9_34;
+	@%p19 bra 	BB7_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB9_34:
+BB7_34:
 	ret;
 }
 
@@ -1548,15 +1247,15 @@ BB9_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB10_5;
+	@%p1 bra 	BB8_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB10_4;
+	@%p2 bra 	BB8_4;
 
 	mov.u32 	%r10, %r1;
 
-BB10_3:
+BB8_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -1565,14 +1264,14 @@ BB10_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB10_3;
+	@%p3 bra 	BB8_3;
 
-BB10_4:
+BB8_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB10_5:
+BB8_5:
 	ret;
 }
 
@@ -1601,11 +1300,11 @@ BB10_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB11_5;
+	@%p1 bra 	BB9_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB11_2:
+BB9_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1613,23 +1312,23 @@ BB11_2:
 	mul.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB11_4;
+	@%p2 bra 	BB9_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	mul.f64 	%fd78, %fd78, %fd30;
 
-BB11_4:
+BB9_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB11_2;
+	@%p3 bra 	BB9_2;
 
-BB11_5:
+BB9_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -1637,137 +1336,137 @@ BB11_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB11_9;
+	@%p4 bra 	BB9_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB11_8;
+	@%p5 bra 	BB9_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	mul.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB11_8:
+BB9_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB11_9:
+BB9_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB11_13;
+	@%p6 bra 	BB9_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB11_12;
+	@%p7 bra 	BB9_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	mul.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB11_12:
+BB9_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB11_13:
+BB9_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB11_17;
+	@%p8 bra 	BB9_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB11_16;
+	@%p9 bra 	BB9_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	mul.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB11_16:
+BB9_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB11_17:
+BB9_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB11_21;
+	@%p10 bra 	BB9_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB11_20;
+	@%p11 bra 	BB9_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	mul.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB11_20:
+BB9_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB11_21:
+BB9_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB11_34;
+	@%p12 bra 	BB9_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB11_24;
+	@%p13 bra 	BB9_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	mul.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB11_24:
+BB9_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB11_26;
+	@%p14 bra 	BB9_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	mul.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB11_26:
+BB9_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB11_28;
+	@%p15 bra 	BB9_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	mul.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB11_28:
+BB9_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB11_30;
+	@%p16 bra 	BB9_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	mul.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB11_30:
+BB9_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB11_32;
+	@%p17 bra 	BB9_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	mul.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB11_32:
+BB9_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB11_34;
+	@%p18 bra 	BB9_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	mul.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB11_34:
+BB9_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB11_36;
+	@%p19 bra 	BB9_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB11_36:
+BB9_36:
 	ret;
 }
 
@@ -1796,14 +1495,14 @@ BB11_36:
 	ld.param.f64 	%fd42, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r7, %ctaid.x;
 	setp.ge.u32	%p1, %r7, %r6;
-	@%p1 bra 	BB12_34;
+	@%p1 bra 	BB10_34;
 
 	mov.u32 	%r29, %tid.x;
 	mul.lo.s32 	%r2, %r7, %r5;
 	setp.ge.u32	%p2, %r29, %r5;
-	@%p2 bra 	BB12_3;
+	@%p2 bra 	BB10_3;
 
-BB12_2:
+BB10_2:
 	add.s32 	%r9, %r29, %r2;
 	mul.wide.u32 	%rd5, %r9, 8;
 	add.s64 	%rd6, %rd2, %rd5;
@@ -1812,9 +1511,9 @@ BB12_2:
 	mov.u32 	%r10, %ntid.x;
 	add.s32 	%r29, %r10, %r29;
 	setp.lt.u32	%p3, %r29, %r5;
-	@%p3 bra 	BB12_2;
+	@%p3 bra 	BB10_2;
 
-BB12_3:
+BB10_3:
 	mov.u32 	%r11, %tid.x;
 	mul.wide.u32 	%rd7, %r11, 8;
 	mov.u64 	%rd8, sdata;
@@ -1823,114 +1522,114 @@ BB12_3:
 	bar.sync 	0;
 	mov.u32 	%r12, %ntid.x;
 	setp.lt.u32	%p4, %r12, 1024;
-	@%p4 bra 	BB12_7;
+	@%p4 bra 	BB10_7;
 
 	setp.gt.u32	%p5, %r11, 511;
-	@%p5 bra 	BB12_6;
+	@%p5 bra 	BB10_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	add.f64 	%fd42, %fd42, %fd28;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_6:
+BB10_6:
 	bar.sync 	0;
 
-BB12_7:
+BB10_7:
 	setp.lt.u32	%p6, %r12, 512;
-	@%p6 bra 	BB12_11;
+	@%p6 bra 	BB10_11;
 
 	setp.gt.u32	%p7, %r11, 255;
-	@%p7 bra 	BB12_10;
+	@%p7 bra 	BB10_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	add.f64 	%fd42, %fd42, %fd29;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_10:
+BB10_10:
 	bar.sync 	0;
 
-BB12_11:
+BB10_11:
 	setp.lt.u32	%p8, %r12, 256;
-	@%p8 bra 	BB12_15;
+	@%p8 bra 	BB10_15;
 
 	setp.gt.u32	%p9, %r11, 127;
-	@%p9 bra 	BB12_14;
+	@%p9 bra 	BB10_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	add.f64 	%fd42, %fd42, %fd30;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_14:
+BB10_14:
 	bar.sync 	0;
 
-BB12_15:
+BB10_15:
 	setp.lt.u32	%p10, %r12, 128;
-	@%p10 bra 	BB12_19;
+	@%p10 bra 	BB10_19;
 
 	setp.gt.u32	%p11, %r11, 63;
-	@%p11 bra 	BB12_18;
+	@%p11 bra 	BB10_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	add.f64 	%fd42, %fd42, %fd31;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_18:
+BB10_18:
 	bar.sync 	0;
 
-BB12_19:
+BB10_19:
 	setp.gt.u32	%p12, %r11, 31;
-	@%p12 bra 	BB12_32;
+	@%p12 bra 	BB10_32;
 
 	setp.lt.u32	%p13, %r12, 64;
-	@%p13 bra 	BB12_22;
+	@%p13 bra 	BB10_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	add.f64 	%fd42, %fd42, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_22:
+BB10_22:
 	setp.lt.u32	%p14, %r12, 32;
-	@%p14 bra 	BB12_24;
+	@%p14 bra 	BB10_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	add.f64 	%fd42, %fd42, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_24:
+BB10_24:
 	setp.lt.u32	%p15, %r12, 16;
-	@%p15 bra 	BB12_26;
+	@%p15 bra 	BB10_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	add.f64 	%fd42, %fd42, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_26:
+BB10_26:
 	setp.lt.u32	%p16, %r12, 8;
-	@%p16 bra 	BB12_28;
+	@%p16 bra 	BB10_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	add.f64 	%fd42, %fd42, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_28:
+BB10_28:
 	setp.lt.u32	%p17, %r12, 4;
-	@%p17 bra 	BB12_30;
+	@%p17 bra 	BB10_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	add.f64 	%fd42, %fd42, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_30:
+BB10_30:
 	setp.lt.u32	%p18, %r12, 2;
-	@%p18 bra 	BB12_32;
+	@%p18 bra 	BB10_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	add.f64 	%fd38, %fd42, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB12_32:
+BB10_32:
 	setp.ne.s32	%p19, %r11, 0;
-	@%p19 bra 	BB12_34;
+	@%p19 bra 	BB10_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	cvt.rn.f64.s64	%fd40, %rd4;
@@ -1939,7 +1638,7 @@ BB12_32:
 	add.s64 	%rd10, %rd3, %rd9;
 	st.f64 	[%rd10], %fd41;
 
-BB12_34:
+BB10_34:
 	ret;
 }
 
@@ -1971,15 +1670,15 @@ BB12_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB13_5;
+	@%p1 bra 	BB11_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB13_4;
+	@%p2 bra 	BB11_4;
 
 	mov.u32 	%r10, %r1;
 
-BB13_3:
+BB11_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -1988,16 +1687,16 @@ BB13_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB13_3;
+	@%p3 bra 	BB11_3;
 
-BB13_4:
+BB11_4:
 	cvt.rn.f64.s64	%fd6, %rd3;
 	div.rn.f64 	%fd7, %fd8, %fd6;
 	mul.wide.u32 	%rd6, %r1, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	st.f64 	[%rd7], %fd7;
 
-BB13_5:
+BB11_5:
 	ret;
 }
 
@@ -2029,10 +1728,10 @@ BB13_5:
 	setp.gt.s32	%p1, %r2, %r1;
 	setp.lt.s32	%p2, %r3, %r5;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB14_2;
-	bra.uni 	BB14_1;
+	@!%p3 bra 	BB12_2;
+	bra.uni 	BB12_1;
 
-BB14_1:
+BB12_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd3, %r12, 8;
@@ -2042,7 +1741,7 @@ BB14_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB14_2:
+BB12_2:
 	ret;
 }
 
@@ -2075,14 +1774,14 @@ BB14_2:
 	mad.lo.s32 	%r1, %r8, %r9, %r11;
 	mul.lo.s32 	%r12, %r3, %r2;
 	setp.ge.s32	%p1, %r1, %r12;
-	@%p1 bra 	BB15_2;
+	@%p1 bra 	BB13_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB15_2:
+BB13_2:
 	ret;
 }
 
@@ -2116,10 +1815,10 @@ BB15_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.lt.s32	%p2, %r11, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB16_2;
-	bra.uni 	BB16_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB16_1:
+BB14_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2128,7 +1827,7 @@ BB16_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd1;
 
-BB16_2:
+BB14_2:
 	ret;
 }
 
@@ -2161,10 +1860,10 @@ BB16_2:
 	setp.lt.s32	%p1, %r1, %r4;
 	setp.lt.s32	%p2, %r2, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB17_2;
-	bra.uni 	BB17_1;
+	@!%p3 bra 	BB15_2;
+	bra.uni 	BB15_1;
 
-BB17_1:
+BB15_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mad.lo.s32 	%r11, %r1, %r3, %r2;
 	mul.wide.s32 	%rd4, %r11, 8;
@@ -2176,7 +1875,7 @@ BB17_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB17_2:
+BB15_2:
 	ret;
 }
 
@@ -2211,10 +1910,10 @@ BB17_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB18_4;
-	bra.uni 	BB18_1;
+	@!%p3 bra 	BB16_4;
+	bra.uni 	BB16_1;
 
-BB18_1:
+BB16_1:
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r3, %r1, %r4, %r2;
 	mul.wide.s32 	%rd5, %r3, 8;
@@ -2222,18 +1921,18 @@ BB18_1:
 	ld.global.f64 	%fd4, [%rd6];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB18_3;
+	@%p4 bra 	BB16_3;
 
 	cvta.to.global.u64 	%rd7, %rd2;
 	add.s64 	%rd9, %rd7, %rd5;
 	ld.global.f64 	%fd5, [%rd9];
 
-BB18_3:
+BB16_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	add.s64 	%rd12, %rd10, %rd5;
 	st.global.f64 	[%rd12], %fd5;
 
-BB18_4:
+BB16_4:
 	ret;
 }
 
@@ -2270,10 +1969,10 @@ BB18_4:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.lt.s32	%p2, %r2, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB19_2;
-	bra.uni 	BB19_1;
+	@!%p3 bra 	BB17_2;
+	bra.uni 	BB17_1;
 
-BB19_1:
+BB17_1:
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r12, %r1, %r3, %r2;
 	mul.wide.s32 	%rd5, %r12, 8;
@@ -2289,7 +1988,7 @@ BB19_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB19_2:
+BB17_2:
 	ret;
 }
 
@@ -2333,10 +2032,10 @@ BB19_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.lt.s32	%p2, %r11, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB20_6;
-	bra.uni 	BB20_1;
+	@!%p3 bra 	BB18_6;
+	bra.uni 	BB18_1;
 
-BB20_1:
+BB18_1:
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd4, %rd5;
@@ -2346,39 +2045,39 @@ BB20_1:
 	setp.lt.f64	%p4, %fd8, %fd3;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB20_5;
-	bra.uni 	BB20_2;
+	@%p4 bra 	BB18_5;
+	bra.uni 	BB18_2;
 
-BB20_5:
+BB18_5:
 	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB20_6;
+	bra.uni 	BB18_6;
 
-BB20_2:
+BB18_2:
 	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB20_4;
-	bra.uni 	BB20_3;
+	@%p5 bra 	BB18_4;
+	bra.uni 	BB18_3;
 
-BB20_4:
+BB18_4:
 	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB20_6;
+	bra.uni 	BB18_6;
 
-BB20_3:
+BB18_3:
 	st.global.f64 	[%rd1], %fd6;
 
-BB20_6:
+BB18_6:
 	ret;
 }
 
-	// .globl	binCellOp
-.visible .entry binCellOp(
-	.param .u64 binCellOp_param_0,
-	.param .u64 binCellOp_param_1,
-	.param .u64 binCellOp_param_2,
-	.param .u32 binCellOp_param_3,
-	.param .u32 binCellOp_param_4,
-	.param .u32 binCellOp_param_5,
-	.param .u32 binCellOp_param_6,
-	.param .u32 binCellOp_param_7
+	// .globl	matrix_matrix_cellwise_op
+.visible .entry matrix_matrix_cellwise_op(
+	.param .u64 matrix_matrix_cellwise_op_param_0,
+	.param .u64 matrix_matrix_cellwise_op_param_1,
+	.param .u64 matrix_matrix_cellwise_op_param_2,
+	.param .u32 matrix_matrix_cellwise_op_param_3,
+	.param .u32 matrix_matrix_cellwise_op_param_4,
+	.param .u32 matrix_matrix_cellwise_op_param_5,
+	.param .u32 matrix_matrix_cellwise_op_param_6,
+	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
 	.reg .pred 	%p<52>;
@@ -2387,14 +2086,14 @@ BB20_6:
 	.reg .b64 	%rd<15>;
 
 
-	ld.param.u64 	%rd2, [binCellOp_param_0];
-	ld.param.u64 	%rd3, [binCellOp_param_1];
-	ld.param.u64 	%rd4, [binCellOp_param_2];
-	ld.param.u32 	%r14, [binCellOp_param_3];
-	ld.param.u32 	%r10, [binCellOp_param_4];
-	ld.param.u32 	%r11, [binCellOp_param_5];
-	ld.param.u32 	%r12, [binCellOp_param_6];
-	ld.param.u32 	%r13, [binCellOp_param_7];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
+	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
 	mov.u32 	%r15, %ntid.x;
 	mov.u32 	%r16, %ctaid.x;
 	mov.u32 	%r17, %tid.x;
@@ -2406,42 +2105,42 @@ BB20_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.lt.s32	%p3, %r2, %r10;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB21_55;
-	bra.uni 	BB21_1;
+	@!%p4 bra 	BB19_55;
+	bra.uni 	BB19_1;
 
-BB21_1:
+BB19_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
 	mov.u32 	%r54, %r1;
-	@%p5 bra 	BB21_5;
+	@%p5 bra 	BB19_5;
 
 	setp.ne.s32	%p6, %r11, 2;
 	mov.u32 	%r55, %r3;
-	@%p6 bra 	BB21_4;
+	@%p6 bra 	BB19_4;
 
 	mov.u32 	%r55, %r2;
 
-BB21_4:
+BB19_4:
 	mov.u32 	%r49, %r55;
 	mov.u32 	%r4, %r49;
 	mov.u32 	%r54, %r4;
 
-BB21_5:
+BB19_5:
 	mov.u32 	%r5, %r54;
 	setp.eq.s32	%p7, %r12, 1;
 	mov.u32 	%r52, %r1;
-	@%p7 bra 	BB21_9;
+	@%p7 bra 	BB19_9;
 
 	setp.ne.s32	%p8, %r12, 2;
 	mov.u32 	%r53, %r3;
-	@%p8 bra 	BB21_8;
+	@%p8 bra 	BB19_8;
 
 	mov.u32 	%r53, %r2;
 
-BB21_8:
+BB19_8:
 	mov.u32 	%r52, %r53;
 
-BB21_9:
+BB19_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
@@ -2450,49 +2149,49 @@ BB21_9:
 	mul.wide.s32 	%rd9, %r52, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd39, 0dC08F380000000000;
+	mov.f64 	%fd39, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 5;
-	@%p9 bra 	BB21_19;
+	@%p9 bra 	BB19_19;
 
 	setp.gt.s32	%p19, %r13, 2;
-	@%p19 bra 	BB21_15;
+	@%p19 bra 	BB19_15;
 
 	setp.eq.s32	%p23, %r13, 0;
-	@%p23 bra 	BB21_53;
+	@%p23 bra 	BB19_53;
 
 	setp.eq.s32	%p24, %r13, 1;
-	@%p24 bra 	BB21_52;
-	bra.uni 	BB21_13;
+	@%p24 bra 	BB19_52;
+	bra.uni 	BB19_13;
 
-BB21_52:
+BB19_52:
 	sub.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_19:
+BB19_19:
 	setp.gt.s32	%p10, %r13, 8;
-	@%p10 bra 	BB21_24;
+	@%p10 bra 	BB19_24;
 
 	setp.eq.s32	%p16, %r13, 6;
-	@%p16 bra 	BB21_34;
+	@%p16 bra 	BB19_34;
 
 	setp.eq.s32	%p17, %r13, 7;
-	@%p17 bra 	BB21_33;
-	bra.uni 	BB21_22;
+	@%p17 bra 	BB19_33;
+	bra.uni 	BB19_22;
 
-BB21_33:
+BB19_33:
 	setp.gt.f64	%p29, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_15:
+BB19_15:
 	setp.eq.s32	%p20, %r13, 3;
-	@%p20 bra 	BB21_51;
+	@%p20 bra 	BB19_51;
 
 	setp.eq.s32	%p21, %r13, 4;
-	@%p21 bra 	BB21_35;
-	bra.uni 	BB21_17;
+	@%p21 bra 	BB19_35;
+	bra.uni 	BB19_17;
 
-BB21_35:
+BB19_35:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -2507,7 +2206,7 @@ BB21_35:
 	shl.b64 	%rd1, %rd11, %r22;
 	setp.eq.s64	%p32, %rd1, -9223372036854775808;
 	abs.f64 	%fd11, %fd1;
-	// Callseq Start 1
+	// Callseq Start 0
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -2525,13 +2224,13 @@ BB21_35:
 	ld.param.f64	%fd38, [retval0+0];
 	
 	//{
-	}// Callseq End 1
+	}// Callseq End 0
 	setp.lt.s32	%p33, %r8, 0;
 	and.pred  	%p1, %p33, %p32;
-	@!%p1 bra 	BB21_37;
-	bra.uni 	BB21_36;
+	@!%p1 bra 	BB19_37;
+	bra.uni 	BB19_36;
 
-BB21_36:
+BB19_36:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r23}, %fd38;
@@ -2543,111 +2242,111 @@ BB21_36:
 	}
 	mov.b64 	%fd38, {%r25, %r24};
 
-BB21_37:
+BB19_37:
 	mov.f64 	%fd37, %fd38;
 	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
-	@%p34 bra 	BB21_40;
-	bra.uni 	BB21_38;
+	@%p34 bra 	BB19_40;
+	bra.uni 	BB19_38;
 
-BB21_40:
+BB19_40:
 	selp.b32	%r26, %r8, 0, %p32;
 	or.b32  	%r27, %r26, 2146435072;
 	setp.lt.s32	%p38, %r9, 0;
 	selp.b32	%r28, %r27, %r26, %p38;
 	mov.u32 	%r29, 0;
 	mov.b64 	%fd37, {%r29, %r28};
-	bra.uni 	BB21_41;
+	bra.uni 	BB19_41;
 
-BB21_24:
+BB19_24:
 	setp.gt.s32	%p11, %r13, 10;
-	@%p11 bra 	BB21_28;
+	@%p11 bra 	BB19_28;
 
 	setp.eq.s32	%p14, %r13, 9;
-	@%p14 bra 	BB21_32;
-	bra.uni 	BB21_26;
+	@%p14 bra 	BB19_32;
+	bra.uni 	BB19_26;
 
-BB21_32:
+BB19_32:
 	setp.eq.f64	%p27, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_28:
+BB19_28:
 	setp.eq.s32	%p12, %r13, 11;
-	@%p12 bra 	BB21_31;
-	bra.uni 	BB21_29;
+	@%p12 bra 	BB19_31;
+	bra.uni 	BB19_29;
 
-BB21_31:
+BB19_31:
 	min.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_53:
+BB19_53:
 	add.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_13:
+BB19_13:
 	setp.eq.s32	%p25, %r13, 2;
-	@%p25 bra 	BB21_14;
-	bra.uni 	BB21_54;
+	@%p25 bra 	BB19_14;
+	bra.uni 	BB19_54;
 
-BB21_14:
+BB19_14:
 	mul.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_34:
+BB19_34:
 	setp.le.f64	%p30, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_22:
+BB19_22:
 	setp.eq.s32	%p18, %r13, 8;
-	@%p18 bra 	BB21_23;
-	bra.uni 	BB21_54;
+	@%p18 bra 	BB19_23;
+	bra.uni 	BB19_54;
 
-BB21_23:
+BB19_23:
 	setp.ge.f64	%p28, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_51:
+BB19_51:
 	div.rn.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_17:
+BB19_17:
 	setp.eq.s32	%p22, %r13, 5;
-	@%p22 bra 	BB21_18;
-	bra.uni 	BB21_54;
+	@%p22 bra 	BB19_18;
+	bra.uni 	BB19_54;
 
-BB21_18:
+BB19_18:
 	setp.lt.f64	%p31, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_26:
+BB19_26:
 	setp.eq.s32	%p15, %r13, 10;
-	@%p15 bra 	BB21_27;
-	bra.uni 	BB21_54;
+	@%p15 bra 	BB19_27;
+	bra.uni 	BB19_54;
 
-BB21_27:
+BB19_27:
 	setp.neu.f64	%p26, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_29:
+BB19_29:
 	setp.ne.s32	%p13, %r13, 12;
-	@%p13 bra 	BB21_54;
+	@%p13 bra 	BB19_54;
 
 	max.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_38:
+BB19_38:
 	setp.gt.s32	%p35, %r8, -1;
-	@%p35 bra 	BB21_41;
+	@%p35 bra 	BB19_41;
 
 	cvt.rzi.f64.f64	%fd29, %fd2;
 	setp.neu.f64	%p36, %fd29, %fd2;
 	selp.f64	%fd37, 0dFFF8000000000000, %fd37, %p36;
 
-BB21_41:
+BB19_41:
 	mov.f64 	%fd17, %fd37;
 	add.f64 	%fd18, %fd1, %fd2;
 	{
@@ -2657,35 +2356,35 @@ BB21_41:
 	and.b32  	%r31, %r30, 2146435072;
 	setp.ne.s32	%p39, %r31, 2146435072;
 	mov.f64 	%fd36, %fd17;
-	@%p39 bra 	BB21_50;
+	@%p39 bra 	BB19_50;
 
 	setp.gtu.f64	%p40, %fd11, 0d7FF0000000000000;
 	mov.f64 	%fd36, %fd18;
-	@%p40 bra 	BB21_50;
+	@%p40 bra 	BB19_50;
 
 	abs.f64 	%fd30, %fd2;
 	setp.gtu.f64	%p41, %fd30, 0d7FF0000000000000;
 	mov.f64 	%fd35, %fd18;
 	mov.f64 	%fd36, %fd35;
-	@%p41 bra 	BB21_50;
+	@%p41 bra 	BB19_50;
 
 	and.b32  	%r32, %r9, 2147483647;
 	setp.ne.s32	%p42, %r32, 2146435072;
-	@%p42 bra 	BB21_46;
+	@%p42 bra 	BB19_46;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r33, %temp}, %fd2;
 	}
 	setp.eq.s32	%p43, %r33, 0;
-	@%p43 bra 	BB21_49;
+	@%p43 bra 	BB19_49;
 
-BB21_46:
+BB19_46:
 	and.b32  	%r34, %r8, 2147483647;
 	setp.ne.s32	%p44, %r34, 2146435072;
 	mov.f64 	%fd33, %fd17;
 	mov.f64 	%fd36, %fd33;
-	@%p44 bra 	BB21_50;
+	@%p44 bra 	BB19_50;
 
 	{
 	.reg .b32 %temp; 
@@ -2693,7 +2392,7 @@ BB21_46:
 	}
 	setp.ne.s32	%p45, %r35, 0;
 	mov.f64 	%fd36, %fd17;
-	@%p45 bra 	BB21_50;
+	@%p45 bra 	BB19_50;
 
 	shr.s32 	%r36, %r9, 31;
 	and.b32  	%r37, %r36, -2146435072;
@@ -2702,9 +2401,9 @@ BB21_46:
 	selp.b32	%r40, %r39, %r38, %p1;
 	mov.u32 	%r41, 0;
 	mov.b64 	%fd36, {%r41, %r40};
-	bra.uni 	BB21_50;
+	bra.uni 	BB19_50;
 
-BB21_49:
+BB19_49:
 	setp.gt.f64	%p46, %fd11, 0d3FF0000000000000;
 	selp.b32	%r42, 2146435072, 0, %p46;
 	xor.b32  	%r43, %r42, 2146435072;
@@ -2715,58 +2414,51 @@ BB21_49:
 	mov.u32 	%r46, 0;
 	mov.b64 	%fd36, {%r46, %r45};
 
-BB21_50:
+BB19_50:
 	setp.eq.f64	%p49, %fd2, 0d0000000000000000;
 	setp.eq.f64	%p50, %fd1, 0d3FF0000000000000;
 	or.pred  	%p51, %p50, %p49;
 	selp.f64	%fd39, 0d3FF0000000000000, %fd36, %p51;
 
-BB21_54:
+BB19_54:
 	cvta.to.global.u64 	%rd12, %rd4;
 	mul.wide.s32 	%rd13, %r3, 8;
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd39;
+	bar.sync 	0;
 
-BB21_55:
+BB19_55:
 	ret;
 }
 
-	// .globl	binCellScalarOp
-.visible .entry binCellScalarOp(
-	.param .u64 binCellScalarOp_param_0,
-	.param .f64 binCellScalarOp_param_1,
-	.param .u64 binCellScalarOp_param_2,
-	.param .u32 binCellScalarOp_param_3,
-	.param .u32 binCellScalarOp_param_4,
-	.param .u32 binCellScalarOp_param_5,
-	.param .u32 binCellScalarOp_param_6
+	// .globl	matrix_scalar_op
+.visible .entry matrix_scalar_op(
+	.param .u64 matrix_scalar_op_param_0,
+	.param .f64 matrix_scalar_op_param_1,
+	.param .u64 matrix_scalar_op_param_2,
+	.param .u32 matrix_scalar_op_param_3,
+	.param .u32 matrix_scalar_op_param_4,
+	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<89>;
-	.reg .b32 	%r<71>;
+	.reg .pred 	%p<91>;
+	.reg .b32 	%r<64>;
 	.reg .f64 	%fd<77>;
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd4, [binCellScalarOp_param_0];
-	ld.param.f64 	%fd52, [binCellScalarOp_param_1];
-	ld.param.u64 	%rd5, [binCellScalarOp_param_2];
-	ld.param.u32 	%r8, [binCellScalarOp_param_3];
-	ld.param.u32 	%r9, [binCellScalarOp_param_4];
-	ld.param.u32 	%r6, [binCellScalarOp_param_5];
-	ld.param.u32 	%r7, [binCellScalarOp_param_6];
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %ntid.x;
-	mov.u32 	%r12, %tid.x;
-	mad.lo.s32 	%r13, %r11, %r10, %r12;
-	mov.u32 	%r14, %ntid.y;
-	mov.u32 	%r15, %ctaid.y;
-	mov.u32 	%r16, %tid.y;
-	mad.lo.s32 	%r17, %r13, %r9, %r16;
-	mad.lo.s32 	%r1, %r14, %r15, %r17;
-	mul.lo.s32 	%r18, %r9, %r8;
-	setp.ge.s32	%p3, %r1, %r18;
-	@%p3 bra 	BB22_92;
+	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
+	ld.param.f64 	%fd52, [matrix_scalar_op_param_1];
+	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
+	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
+	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
+	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
+	mov.u32 	%r9, %ctaid.x;
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %tid.x;
+	mad.lo.s32 	%r1, %r10, %r9, %r11;
+	setp.ge.s32	%p3, %r1, %r8;
+	@%p3 bra 	BB20_94;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -2775,178 +2467,86 @@ BB21_55:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB22_47;
+	@%p4 bra 	BB20_48;
 
-	setp.eq.s32	%p5, %r6, 0;
-	@%p5 bra 	BB22_45;
+	mov.f64 	%fd67, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p5, %r6, 5;
+	@%p5 bra 	BB20_12;
 
-	mov.f64 	%fd67, 0dC08F380000000000;
-	setp.gt.s32	%p6, %r6, 6;
-	@%p6 bra 	BB22_13;
+	setp.gt.s32	%p15, %r6, 2;
+	@%p15 bra 	BB20_8;
 
-	setp.gt.s32	%p14, %r6, 3;
-	@%p14 bra 	BB22_9;
+	setp.eq.s32	%p19, %r6, 0;
+	@%p19 bra 	BB20_46;
 
-	setp.eq.s32	%p18, %r6, 1;
-	@%p18 bra 	BB22_44;
+	setp.eq.s32	%p20, %r6, 1;
+	@%p20 bra 	BB20_45;
+	bra.uni 	BB20_6;
 
-	setp.eq.s32	%p19, %r6, 2;
-	@%p19 bra 	BB22_43;
-	bra.uni 	BB22_7;
+BB20_45:
+	sub.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
 
-BB22_43:
-	mul.f64 	%fd67, %fd1, %fd52;
-	bra.uni 	BB22_46;
-
-BB22_47:
-	setp.eq.s32	%p47, %r6, 0;
-	@%p47 bra 	BB22_90;
-
-	mov.f64 	%fd76, 0dC08F380000000000;
-	setp.gt.s32	%p48, %r6, 6;
-	@%p48 bra 	BB22_58;
-
-	setp.gt.s32	%p56, %r6, 3;
-	@%p56 bra 	BB22_54;
-
-	setp.eq.s32	%p60, %r6, 1;
-	@%p60 bra 	BB22_89;
-
-	setp.eq.s32	%p61, %r6, 2;
-	@%p61 bra 	BB22_88;
-	bra.uni 	BB22_52;
-
-BB22_88:
-	mul.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_45:
-	add.f64 	%fd67, %fd1, %fd52;
-
-BB22_46:
-	st.global.f64 	[%rd1], %fd67;
-	bra.uni 	BB22_92;
-
-BB22_13:
-	setp.gt.s32	%p7, %r6, 9;
-	@%p7 bra 	BB22_18;
-
-	setp.eq.s32	%p11, %r6, 7;
-	@%p11 bra 	BB22_25;
-
-	setp.eq.s32	%p12, %r6, 8;
-	@%p12 bra 	BB22_24;
-	bra.uni 	BB22_16;
-
-BB22_24:
-	setp.le.f64	%p23, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
-	bra.uni 	BB22_46;
-
-BB22_90:
-	add.f64 	%fd76, %fd1, %fd52;
-
-BB22_91:
-	st.global.f64 	[%rd1], %fd76;
-
-BB22_92:
-	ret;
-
-BB22_58:
-	setp.gt.s32	%p49, %r6, 9;
-	@%p49 bra 	BB22_63;
+BB20_48:
+	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p48, %r6, 5;
+	@%p48 bra 	BB20_58;
 
-	setp.eq.s32	%p53, %r6, 7;
-	@%p53 bra 	BB22_70;
+	setp.gt.s32	%p58, %r6, 2;
+	@%p58 bra 	BB20_54;
 
-	setp.eq.s32	%p54, %r6, 8;
-	@%p54 bra 	BB22_69;
-	bra.uni 	BB22_61;
+	setp.eq.s32	%p62, %r6, 0;
+	@%p62 bra 	BB20_92;
 
-BB22_69:
-	setp.ge.f64	%p65, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
-	bra.uni 	BB22_91;
+	setp.eq.s32	%p63, %r6, 1;
+	@%p63 bra 	BB20_91;
+	bra.uni 	BB20_52;
 
-BB22_9:
-	setp.eq.s32	%p15, %r6, 4;
-	@%p15 bra 	BB22_27;
+BB20_91:
+	sub.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
 
-	setp.eq.s32	%p16, %r6, 5;
-	@%p16 bra 	BB22_26;
-	bra.uni 	BB22_11;
+BB20_12:
+	setp.gt.s32	%p6, %r6, 8;
+	@%p6 bra 	BB20_17;
 
-BB22_26:
-	setp.gt.f64	%p26, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB22_46;
+	setp.eq.s32	%p12, %r6, 6;
+	@%p12 bra 	BB20_27;
 
-BB22_18:
-	setp.eq.s32	%p8, %r6, 10;
-	@%p8 bra 	BB22_23;
+	setp.eq.s32	%p13, %r6, 7;
+	@%p13 bra 	BB20_26;
+	bra.uni 	BB20_15;
 
-	setp.eq.s32	%p9, %r6, 11;
-	@%p9 bra 	BB22_22;
-	bra.uni 	BB22_20;
+BB20_26:
+	setp.lt.f64	%p25, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+	bra.uni 	BB20_47;
 
-BB22_22:
-	min.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
+BB20_58:
+	setp.gt.s32	%p49, %r6, 8;
+	@%p49 bra 	BB20_63;
 
-BB22_54:
-	setp.eq.s32	%p57, %r6, 4;
-	@%p57 bra 	BB22_72;
+	setp.eq.s32	%p55, %r6, 6;
+	@%p55 bra 	BB20_73;
 
-	setp.eq.s32	%p58, %r6, 5;
-	@%p58 bra 	BB22_71;
-	bra.uni 	BB22_56;
+	setp.eq.s32	%p56, %r6, 7;
+	@%p56 bra 	BB20_72;
+	bra.uni 	BB20_61;
 
-BB22_71:
-	setp.lt.f64	%p68, %fd1, %fd52;
+BB20_72:
+	setp.gt.f64	%p68, %fd1, %fd52;
 	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
 
-BB22_63:
-	setp.eq.s32	%p50, %r6, 10;
-	@%p50 bra 	BB22_68;
+BB20_8:
+	setp.eq.s32	%p16, %r6, 3;
+	@%p16 bra 	BB20_44;
 
-	setp.eq.s32	%p51, %r6, 11;
-	@%p51 bra 	BB22_67;
-	bra.uni 	BB22_65;
-
-BB22_67:
-	min.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_44:
-	sub.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_7:
-	setp.eq.s32	%p20, %r6, 3;
-	@%p20 bra 	BB22_8;
-	bra.uni 	BB22_46;
+	setp.eq.s32	%p17, %r6, 4;
+	@%p17 bra 	BB20_28;
+	bra.uni 	BB20_10;
 
-BB22_8:
-	div.rn.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_25:
-	setp.lt.f64	%p24, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
-	bra.uni 	BB22_46;
-
-BB22_16:
-	setp.eq.s32	%p13, %r6, 9;
-	@%p13 bra 	BB22_17;
-	bra.uni 	BB22_46;
-
-BB22_17:
-	setp.eq.f64	%p22, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
-	bra.uni 	BB22_46;
-
-BB22_27:
+BB20_28:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd52;
@@ -2955,13 +2555,13 @@ BB22_27:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r3}, %fd1;
 	}
-	bfe.u32 	%r19, %r3, 20, 11;
-	add.s32 	%r20, %r19, -1012;
+	bfe.u32 	%r12, %r3, 20, 11;
+	add.s32 	%r13, %r12, -1012;
 	mov.b64 	 %rd10, %fd1;
-	shl.b64 	%rd2, %rd10, %r20;
-	setp.eq.s64	%p27, %rd2, -9223372036854775808;
+	shl.b64 	%rd2, %rd10, %r13;
+	setp.eq.s64	%p28, %rd2, -9223372036854775808;
 	abs.f64 	%fd10, %fd52;
-	// Callseq Start 2
+	// Callseq Start 1
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -2979,90 +2579,61 @@ BB22_27:
 	ld.param.f64	%fd66, [retval0+0];
 	
 	//{
-	}// Callseq End 2
-	setp.lt.s32	%p28, %r2, 0;
-	and.pred  	%p1, %p28, %p27;
-	@!%p1 bra 	BB22_29;
-	bra.uni 	BB22_28;
+	}// Callseq End 1
+	setp.lt.s32	%p29, %r2, 0;
+	and.pred  	%p1, %p29, %p28;
+	@!%p1 bra 	BB20_30;
+	bra.uni 	BB20_29;
 
-BB22_28:
+BB20_29:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r21}, %fd66;
+	mov.b64 	{%temp, %r14}, %fd66;
 	}
-	xor.b32  	%r22, %r21, -2147483648;
+	xor.b32  	%r15, %r14, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r23, %temp}, %fd66;
+	mov.b64 	{%r16, %temp}, %fd66;
 	}
-	mov.b64 	%fd66, {%r23, %r22};
+	mov.b64 	%fd66, {%r16, %r15};
 
-BB22_29:
+BB20_30:
 	mov.f64 	%fd65, %fd66;
-	setp.eq.f64	%p29, %fd52, 0d0000000000000000;
-	@%p29 bra 	BB22_32;
-	bra.uni 	BB22_30;
-
-BB22_32:
-	selp.b32	%r24, %r2, 0, %p27;
-	or.b32  	%r25, %r24, 2146435072;
-	setp.lt.s32	%p33, %r3, 0;
-	selp.b32	%r26, %r25, %r24, %p33;
-	mov.u32 	%r27, 0;
-	mov.b64 	%fd65, {%r27, %r26};
-	bra.uni 	BB22_33;
-
-BB22_11:
-	setp.eq.s32	%p17, %r6, 6;
-	@%p17 bra 	BB22_12;
-	bra.uni 	BB22_46;
-
-BB22_12:
-	setp.ge.f64	%p25, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
-	bra.uni 	BB22_46;
-
-BB22_23:
-	setp.neu.f64	%p21, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
-	bra.uni 	BB22_46;
-
-BB22_20:
-	setp.ne.s32	%p10, %r6, 12;
-	@%p10 bra 	BB22_46;
-
-	max.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_89:
-	sub.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_52:
-	setp.eq.s32	%p62, %r6, 3;
-	@%p62 bra 	BB22_53;
-	bra.uni 	BB22_91;
-
-BB22_53:
-	div.rn.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_70:
-	setp.gt.f64	%p66, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
-	bra.uni 	BB22_91;
+	setp.eq.f64	%p30, %fd52, 0d0000000000000000;
+	@%p30 bra 	BB20_33;
+	bra.uni 	BB20_31;
+
+BB20_33:
+	selp.b32	%r17, %r2, 0, %p28;
+	or.b32  	%r18, %r17, 2146435072;
+	setp.lt.s32	%p34, %r3, 0;
+	selp.b32	%r19, %r18, %r17, %p34;
+	mov.u32 	%r20, 0;
+	mov.b64 	%fd65, {%r20, %r19};
+	bra.uni 	BB20_34;
+
+BB20_17:
+	setp.gt.s32	%p7, %r6, 10;
+	@%p7 bra 	BB20_21;
+
+	setp.eq.s32	%p10, %r6, 9;
+	@%p10 bra 	BB20_25;
+	bra.uni 	BB20_19;
+
+BB20_25:
+	setp.eq.f64	%p23, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+	bra.uni 	BB20_47;
 
-BB22_61:
-	setp.eq.s32	%p55, %r6, 9;
-	@%p55 bra 	BB22_62;
-	bra.uni 	BB22_91;
+BB20_54:
+	setp.eq.s32	%p59, %r6, 3;
+	@%p59 bra 	BB20_90;
 
-BB22_62:
-	setp.eq.f64	%p64, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
-	bra.uni 	BB22_91;
+	setp.eq.s32	%p60, %r6, 4;
+	@%p60 bra 	BB20_74;
+	bra.uni 	BB20_56;
 
-BB22_72:
+BB20_74:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -3071,13 +2642,13 @@ BB22_72:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd52;
 	}
-	bfe.u32 	%r45, %r5, 20, 11;
-	add.s32 	%r46, %r45, -1012;
+	bfe.u32 	%r38, %r5, 20, 11;
+	add.s32 	%r39, %r38, -1012;
 	mov.b64 	 %rd11, %fd52;
-	shl.b64 	%rd3, %rd11, %r46;
-	setp.eq.s64	%p69, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd11, %r39;
+	setp.eq.s64	%p71, %rd3, -9223372036854775808;
 	abs.f64 	%fd35, %fd1;
-	// Callseq Start 3
+	// Callseq Start 2
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -3095,226 +2666,362 @@ BB22_72:
 	ld.param.f64	%fd75, [retval0+0];
 	
 	//{
-	}// Callseq End 3
-	setp.lt.s32	%p70, %r4, 0;
-	and.pred  	%p2, %p70, %p69;
-	@!%p2 bra 	BB22_74;
-	bra.uni 	BB22_73;
+	}// Callseq End 2
+	setp.lt.s32	%p72, %r4, 0;
+	and.pred  	%p2, %p72, %p71;
+	@!%p2 bra 	BB20_76;
+	bra.uni 	BB20_75;
 
-BB22_73:
+BB20_75:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r47}, %fd75;
+	mov.b64 	{%temp, %r40}, %fd75;
 	}
-	xor.b32  	%r48, %r47, -2147483648;
+	xor.b32  	%r41, %r40, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r49, %temp}, %fd75;
+	mov.b64 	{%r42, %temp}, %fd75;
 	}
-	mov.b64 	%fd75, {%r49, %r48};
+	mov.b64 	%fd75, {%r42, %r41};
 
-BB22_74:
+BB20_76:
 	mov.f64 	%fd74, %fd75;
-	setp.eq.f64	%p71, %fd1, 0d0000000000000000;
-	@%p71 bra 	BB22_77;
-	bra.uni 	BB22_75;
-
-BB22_77:
-	selp.b32	%r50, %r4, 0, %p69;
-	or.b32  	%r51, %r50, 2146435072;
-	setp.lt.s32	%p75, %r5, 0;
-	selp.b32	%r52, %r51, %r50, %p75;
-	mov.u32 	%r53, 0;
-	mov.b64 	%fd74, {%r53, %r52};
-	bra.uni 	BB22_78;
-
-BB22_56:
-	setp.eq.s32	%p59, %r6, 6;
-	@%p59 bra 	BB22_57;
-	bra.uni 	BB22_91;
-
-BB22_57:
-	setp.le.f64	%p67, %fd1, %fd52;
+	setp.eq.f64	%p73, %fd1, 0d0000000000000000;
+	@%p73 bra 	BB20_79;
+	bra.uni 	BB20_77;
+
+BB20_79:
+	selp.b32	%r43, %r4, 0, %p71;
+	or.b32  	%r44, %r43, 2146435072;
+	setp.lt.s32	%p77, %r5, 0;
+	selp.b32	%r45, %r44, %r43, %p77;
+	mov.u32 	%r46, 0;
+	mov.b64 	%fd74, {%r46, %r45};
+	bra.uni 	BB20_80;
+
+BB20_63:
+	setp.gt.s32	%p50, %r6, 10;
+	@%p50 bra 	BB20_67;
+
+	setp.eq.s32	%p53, %r6, 9;
+	@%p53 bra 	BB20_71;
+	bra.uni 	BB20_65;
+
+BB20_71:
+	setp.eq.f64	%p66, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+	bra.uni 	BB20_93;
+
+BB20_21:
+	setp.eq.s32	%p8, %r6, 11;
+	@%p8 bra 	BB20_24;
+	bra.uni 	BB20_22;
+
+BB20_24:
+	min.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_46:
+	add.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB20_47;
+
+BB20_6:
+	setp.eq.s32	%p21, %r6, 2;
+	@%p21 bra 	BB20_7;
+	bra.uni 	BB20_47;
+
+BB20_7:
+	mul.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB20_47;
+
+BB20_27:
+	setp.ge.f64	%p26, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB20_47;
+
+BB20_15:
+	setp.eq.s32	%p14, %r6, 8;
+	@%p14 bra 	BB20_16;
+	bra.uni 	BB20_47;
+
+BB20_16:
+	setp.le.f64	%p24, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+	bra.uni 	BB20_47;
+
+BB20_44:
+	div.rn.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_10:
+	setp.eq.s32	%p18, %r6, 5;
+	@%p18 bra 	BB20_11;
+	bra.uni 	BB20_47;
+
+BB20_11:
+	setp.gt.f64	%p27, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+	bra.uni 	BB20_47;
+
+BB20_67:
+	setp.eq.s32	%p51, %r6, 11;
+	@%p51 bra 	BB20_70;
+	bra.uni 	BB20_68;
+
+BB20_70:
+	min.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_19:
+	setp.eq.s32	%p11, %r6, 10;
+	@%p11 bra 	BB20_20;
+	bra.uni 	BB20_47;
+
+BB20_20:
+	setp.neu.f64	%p22, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+	bra.uni 	BB20_47;
+
+BB20_22:
+	setp.ne.s32	%p9, %r6, 12;
+	@%p9 bra 	BB20_47;
+
+	max.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_92:
+	add.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_52:
+	setp.eq.s32	%p64, %r6, 2;
+	@%p64 bra 	BB20_53;
+	bra.uni 	BB20_93;
+
+BB20_53:
+	mul.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_73:
+	setp.le.f64	%p69, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+	bra.uni 	BB20_93;
+
+BB20_61:
+	setp.eq.s32	%p57, %r6, 8;
+	@%p57 bra 	BB20_62;
+	bra.uni 	BB20_93;
+
+BB20_62:
+	setp.ge.f64	%p67, %fd1, %fd52;
 	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
+
+BB20_90:
+	div.rn.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_56:
+	setp.eq.s32	%p61, %r6, 5;
+	@%p61 bra 	BB20_57;
+	bra.uni 	BB20_93;
 
-BB22_68:
-	setp.neu.f64	%p63, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
-	bra.uni 	BB22_91;
+BB20_57:
+	setp.lt.f64	%p70, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+	bra.uni 	BB20_93;
 
-BB22_65:
+BB20_65:
+	setp.eq.s32	%p54, %r6, 10;
+	@%p54 bra 	BB20_66;
+	bra.uni 	BB20_93;
+
+BB20_66:
+	setp.neu.f64	%p65, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+	bra.uni 	BB20_93;
+
+BB20_68:
 	setp.ne.s32	%p52, %r6, 12;
-	@%p52 bra 	BB22_91;
+	@%p52 bra 	BB20_93;
 
 	max.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
 
-BB22_30:
-	setp.gt.s32	%p30, %r2, -1;
-	@%p30 bra 	BB22_33;
+BB20_31:
+	setp.gt.s32	%p31, %r2, -1;
+	@%p31 bra 	BB20_34;
 
 	cvt.rzi.f64.f64	%fd54, %fd1;
-	setp.neu.f64	%p31, %fd54, %fd1;
-	selp.f64	%fd65, 0dFFF8000000000000, %fd65, %p31;
+	setp.neu.f64	%p32, %fd54, %fd1;
+	selp.f64	%fd65, 0dFFF8000000000000, %fd65, %p32;
 
-BB22_33:
+BB20_34:
 	mov.f64 	%fd16, %fd65;
 	add.f64 	%fd17, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r28}, %fd17;
+	mov.b64 	{%temp, %r21}, %fd17;
 	}
-	and.b32  	%r29, %r28, 2146435072;
-	setp.ne.s32	%p34, %r29, 2146435072;
+	and.b32  	%r22, %r21, 2146435072;
+	setp.ne.s32	%p35, %r22, 2146435072;
 	mov.f64 	%fd64, %fd16;
-	@%p34 bra 	BB22_42;
+	@%p35 bra 	BB20_43;
 
-	setp.gtu.f64	%p35, %fd10, 0d7FF0000000000000;
+	setp.gtu.f64	%p36, %fd10, 0d7FF0000000000000;
 	mov.f64 	%fd64, %fd17;
-	@%p35 bra 	BB22_42;
+	@%p36 bra 	BB20_43;
 
 	abs.f64 	%fd55, %fd1;
-	setp.gtu.f64	%p36, %fd55, 0d7FF0000000000000;
+	setp.gtu.f64	%p37, %fd55, 0d7FF0000000000000;
 	mov.f64 	%fd63, %fd17;
 	mov.f64 	%fd64, %fd63;
-	@%p36 bra 	BB22_42;
+	@%p37 bra 	BB20_43;
 
-	and.b32  	%r30, %r3, 2147483647;
-	setp.ne.s32	%p37, %r30, 2146435072;
-	@%p37 bra 	BB22_38;
+	and.b32  	%r23, %r3, 2147483647;
+	setp.ne.s32	%p38, %r23, 2146435072;
+	@%p38 bra 	BB20_39;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r31, %temp}, %fd1;
+	mov.b64 	{%r24, %temp}, %fd1;
 	}
-	setp.eq.s32	%p38, %r31, 0;
-	@%p38 bra 	BB22_41;
+	setp.eq.s32	%p39, %r24, 0;
+	@%p39 bra 	BB20_42;
 
-BB22_38:
-	and.b32  	%r32, %r2, 2147483647;
-	setp.ne.s32	%p39, %r32, 2146435072;
+BB20_39:
+	and.b32  	%r25, %r2, 2147483647;
+	setp.ne.s32	%p40, %r25, 2146435072;
 	mov.f64 	%fd61, %fd16;
 	mov.f64 	%fd64, %fd61;
-	@%p39 bra 	BB22_42;
+	@%p40 bra 	BB20_43;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r33, %temp}, %fd52;
+	mov.b64 	{%r26, %temp}, %fd52;
 	}
-	setp.ne.s32	%p40, %r33, 0;
+	setp.ne.s32	%p41, %r26, 0;
 	mov.f64 	%fd64, %fd16;
-	@%p40 bra 	BB22_42;
+	@%p41 bra 	BB20_43;
 
-	shr.s32 	%r34, %r3, 31;
-	and.b32  	%r35, %r34, -2146435072;
-	add.s32 	%r36, %r35, 2146435072;
-	or.b32  	%r37, %r36, -2147483648;
-	selp.b32	%r38, %r37, %r36, %p1;
-	mov.u32 	%r39, 0;
-	mov.b64 	%fd64, {%r39, %r38};
-	bra.uni 	BB22_42;
+	shr.s32 	%r27, %r3, 31;
+	and.b32  	%r28, %r27, -2146435072;
+	add.s32 	%r29, %r28, 2146435072;
+	or.b32  	%r30, %r29, -2147483648;
+	selp.b32	%r31, %r30, %r29, %p1;
+	mov.u32 	%r32, 0;
+	mov.b64 	%fd64, {%r32, %r31};
+	bra.uni 	BB20_43;
 
-BB22_75:
-	setp.gt.s32	%p72, %r4, -1;
-	@%p72 bra 	BB22_78;
+BB20_77:
+	setp.gt.s32	%p74, %r4, -1;
+	@%p74 bra 	BB20_80;
 
 	cvt.rzi.f64.f64	%fd57, %fd52;
-	setp.neu.f64	%p73, %fd57, %fd52;
-	selp.f64	%fd74, 0dFFF8000000000000, %fd74, %p73;
+	setp.neu.f64	%p75, %fd57, %fd52;
+	selp.f64	%fd74, 0dFFF8000000000000, %fd74, %p75;
 
-BB22_78:
+BB20_80:
 	mov.f64 	%fd41, %fd74;
 	add.f64 	%fd42, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r54}, %fd42;
+	mov.b64 	{%temp, %r47}, %fd42;
 	}
-	and.b32  	%r55, %r54, 2146435072;
-	setp.ne.s32	%p76, %r55, 2146435072;
+	and.b32  	%r48, %r47, 2146435072;
+	setp.ne.s32	%p78, %r48, 2146435072;
 	mov.f64 	%fd73, %fd41;
-	@%p76 bra 	BB22_87;
+	@%p78 bra 	BB20_89;
 
-	setp.gtu.f64	%p77, %fd35, 0d7FF0000000000000;
+	setp.gtu.f64	%p79, %fd35, 0d7FF0000000000000;
 	mov.f64 	%fd73, %fd42;
-	@%p77 bra 	BB22_87;
+	@%p79 bra 	BB20_89;
 
 	abs.f64 	%fd58, %fd52;
-	setp.gtu.f64	%p78, %fd58, 0d7FF0000000000000;
+	setp.gtu.f64	%p80, %fd58, 0d7FF0000000000000;
 	mov.f64 	%fd72, %fd42;
 	mov.f64 	%fd73, %fd72;
-	@%p78 bra 	BB22_87;
+	@%p80 bra 	BB20_89;
 
-	and.b32  	%r56, %r5, 2147483647;
-	setp.ne.s32	%p79, %r56, 2146435072;
-	@%p79 bra 	BB22_83;
+	and.b32  	%r49, %r5, 2147483647;
+	setp.ne.s32	%p81, %r49, 2146435072;
+	@%p81 bra 	BB20_85;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r57, %temp}, %fd52;
+	mov.b64 	{%r50, %temp}, %fd52;
 	}
-	setp.eq.s32	%p80, %r57, 0;
-	@%p80 bra 	BB22_86;
+	setp.eq.s32	%p82, %r50, 0;
+	@%p82 bra 	BB20_88;
 
-BB22_83:
-	and.b32  	%r58, %r4, 2147483647;
-	setp.ne.s32	%p81, %r58, 2146435072;
+BB20_85:
+	and.b32  	%r51, %r4, 2147483647;
+	setp.ne.s32	%p83, %r51, 2146435072;
 	mov.f64 	%fd70, %fd41;
 	mov.f64 	%fd73, %fd70;
-	@%p81 bra 	BB22_87;
+	@%p83 bra 	BB20_89;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r59, %temp}, %fd1;
+	mov.b64 	{%r52, %temp}, %fd1;
 	}
-	setp.ne.s32	%p82, %r59, 0;
+	setp.ne.s32	%p84, %r52, 0;
 	mov.f64 	%fd73, %fd41;
-	@%p82 bra 	BB22_87;
-
-	shr.s32 	%r60, %r5, 31;
-	and.b32  	%r61, %r60, -2146435072;
-	add.s32 	%r62, %r61, 2146435072;
-	or.b32  	%r63, %r62, -2147483648;
-	selp.b32	%r64, %r63, %r62, %p2;
-	mov.u32 	%r65, 0;
-	mov.b64 	%fd73, {%r65, %r64};
-	bra.uni 	BB22_87;
-
-BB22_41:
-	setp.gt.f64	%p41, %fd10, 0d3FF0000000000000;
-	selp.b32	%r40, 2146435072, 0, %p41;
-	xor.b32  	%r41, %r40, 2146435072;
-	setp.lt.s32	%p42, %r3, 0;
-	selp.b32	%r42, %r41, %r40, %p42;
-	setp.eq.f64	%p43, %fd52, 0dBFF0000000000000;
-	selp.b32	%r43, 1072693248, %r42, %p43;
-	mov.u32 	%r44, 0;
-	mov.b64 	%fd64, {%r44, %r43};
-
-BB22_42:
-	setp.eq.f64	%p44, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p45, %fd52, 0d3FF0000000000000;
-	or.pred  	%p46, %p45, %p44;
-	selp.f64	%fd67, 0d3FF0000000000000, %fd64, %p46;
-	bra.uni 	BB22_46;
-
-BB22_86:
-	setp.gt.f64	%p83, %fd35, 0d3FF0000000000000;
-	selp.b32	%r66, 2146435072, 0, %p83;
-	xor.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p84, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p84;
-	setp.eq.f64	%p85, %fd1, 0dBFF0000000000000;
-	selp.b32	%r69, 1072693248, %r68, %p85;
-	mov.u32 	%r70, 0;
-	mov.b64 	%fd73, {%r70, %r69};
-
-BB22_87:
-	setp.eq.f64	%p86, %fd52, 0d0000000000000000;
-	setp.eq.f64	%p87, %fd1, 0d3FF0000000000000;
-	or.pred  	%p88, %p87, %p86;
-	selp.f64	%fd76, 0d3FF0000000000000, %fd73, %p88;
-	bra.uni 	BB22_91;
+	@%p84 bra 	BB20_89;
+
+	shr.s32 	%r53, %r5, 31;
+	and.b32  	%r54, %r53, -2146435072;
+	add.s32 	%r55, %r54, 2146435072;
+	or.b32  	%r56, %r55, -2147483648;
+	selp.b32	%r57, %r56, %r55, %p2;
+	mov.u32 	%r58, 0;
+	mov.b64 	%fd73, {%r58, %r57};
+	bra.uni 	BB20_89;
+
+BB20_42:
+	setp.gt.f64	%p42, %fd10, 0d3FF0000000000000;
+	selp.b32	%r33, 2146435072, 0, %p42;
+	xor.b32  	%r34, %r33, 2146435072;
+	setp.lt.s32	%p43, %r3, 0;
+	selp.b32	%r35, %r34, %r33, %p43;
+	setp.eq.f64	%p44, %fd52, 0dBFF0000000000000;
+	selp.b32	%r36, 1072693248, %r35, %p44;
+	mov.u32 	%r37, 0;
+	mov.b64 	%fd64, {%r37, %r36};
+
+BB20_43:
+	setp.eq.f64	%p45, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p46, %fd52, 0d3FF0000000000000;
+	or.pred  	%p47, %p46, %p45;
+	selp.f64	%fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB20_47:
+	st.global.f64 	[%rd1], %fd67;
+	bra.uni 	BB20_94;
+
+BB20_88:
+	setp.gt.f64	%p85, %fd35, 0d3FF0000000000000;
+	selp.b32	%r59, 2146435072, 0, %p85;
+	xor.b32  	%r60, %r59, 2146435072;
+	setp.lt.s32	%p86, %r5, 0;
+	selp.b32	%r61, %r60, %r59, %p86;
+	setp.eq.f64	%p87, %fd1, 0dBFF0000000000000;
+	selp.b32	%r62, 1072693248, %r61, %p87;
+	mov.u32 	%r63, 0;
+	mov.b64 	%fd73, {%r63, %r62};
+
+BB20_89:
+	setp.eq.f64	%p88, %fd52, 0d0000000000000000;
+	setp.eq.f64	%p89, %fd1, 0d3FF0000000000000;
+	or.pred  	%p90, %p89, %p88;
+	selp.f64	%fd76, 0d3FF0000000000000, %fd73, %p90;
+
+BB20_93:
+	st.global.f64 	[%rd1], %fd76;
+
+BB20_94:
+	bar.sync 	0;
+	ret;
 }
 
 	// .globl	fill
@@ -3338,14 +3045,14 @@ BB22_87:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB23_2;
+	@%p1 bra 	BB21_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB23_2:
+BB21_2:
 	ret;
 }
 
@@ -3373,9 +3080,9 @@ BB23_2:
 	mov.f64 	%fd76, 0d0000000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB24_4;
+	@%p1 bra 	BB22_4;
 
-BB24_1:
+BB22_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3384,23 +3091,23 @@ BB24_1:
 	add.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB24_3;
+	@%p2 bra 	BB22_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	add.f64 	%fd78, %fd78, %fd31;
 
-BB24_3:
+BB22_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB24_1;
+	@%p3 bra 	BB22_1;
 
-BB24_4:
+BB22_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3408,130 +3115,130 @@ BB24_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB24_8;
+	@%p4 bra 	BB22_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB24_7;
+	@%p5 bra 	BB22_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB24_7:
+BB22_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB24_8:
+BB22_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB24_12;
+	@%p6 bra 	BB22_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB24_11;
+	@%p7 bra 	BB22_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB24_11:
+BB22_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB24_12:
+BB22_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB24_16;
+	@%p8 bra 	BB22_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB24_15;
+	@%p9 bra 	BB22_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB24_15:
+BB22_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB24_16:
+BB22_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB24_20;
+	@%p10 bra 	BB22_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB24_19;
+	@%p11 bra 	BB22_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB24_19:
+BB22_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB24_20:
+BB22_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB24_33;
+	@%p12 bra 	BB22_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB24_23;
+	@%p13 bra 	BB22_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB24_23:
+BB22_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB24_25;
+	@%p14 bra 	BB22_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB24_25:
+BB22_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB24_27;
+	@%p15 bra 	BB22_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB24_27:
+BB22_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB24_29;
+	@%p16 bra 	BB22_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB24_29:
+BB22_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB24_31;
+	@%p17 bra 	BB22_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB24_31:
+BB22_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB24_33;
+	@%p18 bra 	BB22_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	add.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB24_33:
+BB22_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB24_35;
+	@%p19 bra 	BB22_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3539,7 +3246,7 @@ BB24_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB24_35:
+BB22_35:
 	ret;
 }
 
@@ -3563,17 +3270,17 @@ BB24_35:
 	ld.param.u32 	%r4, [reduce_row_sum_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB25_35;
+	@%p1 bra 	BB23_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d0000000000000000;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB25_4;
+	@%p2 bra 	BB23_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB25_3:
+BB23_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3583,9 +3290,9 @@ BB25_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB25_3;
+	@%p3 bra 	BB23_3;
 
-BB25_4:
+BB23_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3595,130 +3302,130 @@ BB25_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB25_8;
+	@%p4 bra 	BB23_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB25_7;
+	@%p5 bra 	BB23_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB25_7:
+BB23_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB25_8:
+BB23_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB25_12;
+	@%p6 bra 	BB23_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB25_11;
+	@%p7 bra 	BB23_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB25_11:
+BB23_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB25_12:
+BB23_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB25_16;
+	@%p8 bra 	BB23_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB25_15;
+	@%p9 bra 	BB23_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB25_15:
+BB23_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB25_16:
+BB23_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB25_20;
+	@%p10 bra 	BB23_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB25_19;
+	@%p11 bra 	BB23_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB25_19:
+BB23_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB25_20:
+BB23_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB25_33;
+	@%p12 bra 	BB23_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB25_23;
+	@%p13 bra 	BB23_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB25_23:
+BB23_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB25_25;
+	@%p14 bra 	BB23_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB25_25:
+BB23_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB25_27;
+	@%p15 bra 	BB23_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB25_27:
+BB23_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB25_29;
+	@%p16 bra 	BB23_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB25_29:
+BB23_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB25_31;
+	@%p17 bra 	BB23_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB25_31:
+BB23_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB25_33;
+	@%p18 bra 	BB23_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB25_33:
+BB23_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB25_35;
+	@%p19 bra 	BB23_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -3726,7 +3433,7 @@ BB25_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB25_35:
+BB23_35:
 	ret;
 }
 
@@ -3753,18 +3460,18 @@ BB25_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB26_5;
+	@%p1 bra 	BB24_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB26_4;
+	@%p2 bra 	BB24_4;
 
 	mov.u32 	%r10, %r1;
 
-BB26_3:
+BB24_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3774,15 +3481,15 @@ BB26_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB26_3;
+	@%p3 bra 	BB24_3;
 
-BB26_4:
+BB24_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB26_5:
+BB24_5:
 	ret;
 }
 
@@ -3807,12 +3514,12 @@ BB26_5:
 	shl.b32 	%r8, %r7, 1;
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
-	mov.f64 	%fd76, 0d0010000000000000;
+	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB27_4;
+	@%p1 bra 	BB25_4;
 
-BB27_1:
+BB25_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3821,23 +3528,23 @@ BB27_1:
 	max.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB27_3;
+	@%p2 bra 	BB25_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	max.f64 	%fd78, %fd78, %fd31;
 
-BB27_3:
+BB25_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB27_1;
+	@%p3 bra 	BB25_1;
 
-BB27_4:
+BB25_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3845,130 +3552,130 @@ BB27_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB27_8;
+	@%p4 bra 	BB25_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB27_7;
+	@%p5 bra 	BB25_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB27_7:
+BB25_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB27_8:
+BB25_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB27_12;
+	@%p6 bra 	BB25_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB27_11;
+	@%p7 bra 	BB25_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB27_11:
+BB25_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB27_12:
+BB25_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB27_16;
+	@%p8 bra 	BB25_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB27_15;
+	@%p9 bra 	BB25_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB27_15:
+BB25_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB27_16:
+BB25_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB27_20;
+	@%p10 bra 	BB25_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB27_19;
+	@%p11 bra 	BB25_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB27_19:
+BB25_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB27_20:
+BB25_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB27_33;
+	@%p12 bra 	BB25_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB27_23;
+	@%p13 bra 	BB25_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB27_23:
+BB25_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB27_25;
+	@%p14 bra 	BB25_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB27_25:
+BB25_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB27_27;
+	@%p15 bra 	BB25_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB27_27:
+BB25_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB27_29;
+	@%p16 bra 	BB25_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB27_29:
+BB25_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB27_31;
+	@%p17 bra 	BB25_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB27_31:
+BB25_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB27_33;
+	@%p18 bra 	BB25_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	max.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB27_33:
+BB25_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB27_35;
+	@%p19 bra 	BB25_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3976,7 +3683,7 @@ BB27_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB27_35:
+BB25_35:
 	ret;
 }
 
@@ -4

<TRUNCATED>