You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/09/18 23:22:58 UTC

[2/3] systemml git commit: [SYSTEMML-445] Added sparse scalar-matrix arithmetic/relational operators

http://git-wip-us.apache.org/repos/asf/systemml/blob/61139e40/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 1ab32f5..ac04967 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -4595,6 +4595,1739 @@ BB31_126:
 	ret;
 }
 
+	// .globl	sparse_dense_matrix_scalar_op_d
+.visible .entry sparse_dense_matrix_scalar_op_d(	// f64 kernel: each in-range thread combines one input value with a scalar and stores the result into an output array
+	.param .u64 sparse_dense_matrix_scalar_op_d_param_0,	// 32-bit integer array read at the thread index; multiplied by param_6 in the output index (presumably row indices - confirm with caller)
+	.param .u64 sparse_dense_matrix_scalar_op_d_param_1,	// 32-bit integer array read at the thread index; added in the output index (presumably column indices - confirm with caller)
+	.param .u64 sparse_dense_matrix_scalar_op_d_param_2,	// f64 array read at the thread index: the input value, called v below
+	.param .f64 sparse_dense_matrix_scalar_op_d_param_3,	// f64 scalar operand, called s below
+	.param .u64 sparse_dense_matrix_scalar_op_d_param_4,	// f64 output array, written at index param_0[i] * param_6 + param_1[i]
+	.param .u32 sparse_dense_matrix_scalar_op_d_param_5,	// element count: threads whose global index is >= this value skip all work
+	.param .u32 sparse_dense_matrix_scalar_op_d_param_6,	// multiplier applied to param_0[i] (presumably the output row length - confirm with caller)
+	.param .u32 sparse_dense_matrix_scalar_op_d_param_7,	// operation selector; codes 0..18 are handled, any other code stores a default value
+	.param .u32 sparse_dense_matrix_scalar_op_d_param_8	// operand order flag: 0 => v OP s, nonzero => s OP v
+)
+{	// naming used in the comments below: v = %fd1 (input value), s = %fd68 (scalar), op = %r7
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<92>;
+	.reg .f64 	%fd<99>;
+	.reg .b64 	%rd<28>;
+
+
+	ld.param.u64 	%rd4, [sparse_dense_matrix_scalar_op_d_param_0];
+	ld.param.u64 	%rd5, [sparse_dense_matrix_scalar_op_d_param_1];
+	ld.param.u64 	%rd6, [sparse_dense_matrix_scalar_op_d_param_2];
+	ld.param.f64 	%fd68, [sparse_dense_matrix_scalar_op_d_param_3];
+	ld.param.u64 	%rd7, [sparse_dense_matrix_scalar_op_d_param_4];
+	ld.param.u32 	%r9, [sparse_dense_matrix_scalar_op_d_param_5];
+	ld.param.u32 	%r6, [sparse_dense_matrix_scalar_op_d_param_6];
+	ld.param.u32 	%r7, [sparse_dense_matrix_scalar_op_d_param_7];
+	ld.param.u32 	%r8, [sparse_dense_matrix_scalar_op_d_param_8];
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %ctaid.x;
+	mov.u32 	%r12, %tid.x;
+	mad.lo.s32 	%r1, %r10, %r11, %r12;	// %r1 = ntid.x * ctaid.x + tid.x (global thread index i)
+	setp.ge.s32	%p3, %r1, %r9;
+	@%p3 bra 	BB32_142;	// i >= param_5: nothing to do, go straight to the common exit
+
+	cvta.to.global.u64 	%rd8, %rd7;
+	cvta.to.global.u64 	%rd9, %rd6;
+	mul.wide.s32 	%rd10, %r1, 8;
+	add.s64 	%rd11, %rd9, %rd10;
+	ld.global.f64 	%fd1, [%rd11];	// v = param_2[i]
+	cvta.to.global.u64 	%rd12, %rd4;
+	mul.wide.s32 	%rd13, %r1, 4;
+	add.s64 	%rd14, %rd12, %rd13;
+	ld.global.u32 	%r13, [%rd14];	// %r13 = param_0[i]
+	cvta.to.global.u64 	%rd15, %rd5;
+	add.s64 	%rd16, %rd15, %rd13;
+	ld.global.u32 	%r14, [%rd16];	// %r14 = param_1[i]
+	mad.lo.s32 	%r15, %r13, %r6, %r14;	// output index = param_0[i] * param_6 + param_1[i]
+	mul.wide.s32 	%rd17, %r15, 8;
+	add.s64 	%rd1, %rd8, %rd17;	// %rd1 = address of the output element
+	setp.eq.s32	%p4, %r8, 0;
+	@%p4 bra 	BB32_72;	// param_8 == 0: use the "v OP s" variants; otherwise fall through to "s OP v"
+
+	mov.f64 	%fd94, 0d7FEFFFFFFFFFFFFF;	// default result (largest finite double), stored unchanged for unhandled op codes
+	setp.gt.s32	%p5, %r7, 8;	// the op code is dispatched through a tree of comparisons
+	@%p5 bra 	BB32_19;
+
+	setp.gt.s32	%p19, %r7, 3;
+	@%p19 bra 	BB32_11;
+
+	setp.gt.s32	%p26, %r7, 1;
+	@%p26 bra 	BB32_8;
+
+	setp.eq.s32	%p29, %r7, 0;
+	@%p29 bra 	BB32_70;
+	bra.uni 	BB32_6;
+
+BB32_70:	// op 0 (s OP v): v + s
+	add.f64 	%fd94, %fd1, %fd68;
+	bra.uni 	BB32_71;
+
+BB32_72:	// param_8 == 0 path: same dispatch tree, results go to %fd98
+	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;	// default result (largest finite double), stored unchanged for unhandled op codes
+	setp.gt.s32	%p69, %r7, 8;
+	@%p69 bra 	BB32_89;
+
+	setp.gt.s32	%p83, %r7, 3;
+	@%p83 bra 	BB32_81;
+
+	setp.gt.s32	%p90, %r7, 1;
+	@%p90 bra 	BB32_78;
+
+	setp.eq.s32	%p93, %r7, 0;
+	@%p93 bra 	BB32_140;
+	bra.uni 	BB32_76;
+
+BB32_140:	// op 0 (v OP s): v + s
+	add.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_19:	// s OP v, op > 8
+	setp.gt.s32	%p6, %r7, 13;
+	@%p6 bra 	BB32_28;
+
+	setp.gt.s32	%p13, %r7, 10;
+	@%p13 bra 	BB32_24;
+
+	setp.eq.s32	%p17, %r7, 9;
+	@%p17 bra 	BB32_48;
+	bra.uni 	BB32_22;
+
+BB32_48:	// op 9 (s OP v): 1.0 if v == s else 0.0
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB32_71;
+
+BB32_89:	// v OP s, op > 8
+	setp.gt.s32	%p70, %r7, 13;
+	@%p70 bra 	BB32_98;
+
+	setp.gt.s32	%p77, %r7, 10;
+	@%p77 bra 	BB32_94;
+
+	setp.eq.s32	%p81, %r7, 9;
+	@%p81 bra 	BB32_118;
+	bra.uni 	BB32_92;
+
+BB32_118:	// op 9 (v OP s): 1.0 if v == s else 0.0
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB32_141;
+
+BB32_11:	// s OP v, op in 4..8
+	setp.gt.s32	%p20, %r7, 5;
+	@%p20 bra 	BB32_15;
+
+	setp.eq.s32	%p24, %r7, 4;
+	@%p24 bra 	BB32_51;
+	bra.uni 	BB32_13;
+
+BB32_51:	// op 4 (s OP v): power with base s and exponent v; NOTE(review): looks like the inlined CUDA pow() special-case handling - confirm
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd68;	// %r2 = high 32 bits of s (base)
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd1;	// %r3 = high 32 bits of v (exponent)
+	}
+	bfe.u32 	%r28, %r3, 20, 11;	// biased exponent field of v
+	add.s32 	%r29, %r28, -1012;
+	mov.b64 	 %rd22, %fd1;
+	shl.b64 	%rd2, %rd22, %r29;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;	// %p49: shifted bits of v equal the sign-bit-only pattern (presumably "v is an odd integer" - confirm)
+	abs.f64 	%fd18, %fd68;	// %fd18 = |s|
+	// Callseq Start 3
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd18;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd1;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd24, [retval0+0];	// %fd24 = __internal_accurate_pow(|s|, v)
+	
+	//{
+	}// Callseq End 3
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;	// %p1 = sign bit of s is set AND %p49
+	@!%p1 bra 	BB32_53;
+	bra.uni 	BB32_52;
+
+BB32_52:	// flip the sign bit of the pow result
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r30}, %fd24;
+	}
+	xor.b32  	%r31, %r30, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r32, %temp}, %fd24;
+	}
+	mov.b64 	%fd24, {%r32, %r31};
+
+BB32_53:
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB32_56;	// base s == 0: build the result directly from bit patterns
+	bra.uni 	BB32_54;
+
+BB32_56:	// zero base: high word = (%p49 ? high word of s : 0), with 0x7FF00000 OR-ed in when v's high word is negative; low word = 0
+	selp.b32	%r33, %r2, 0, %p49;
+	or.b32  	%r34, %r33, 2146435072;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r35, %r34, %r33, %p55;
+	mov.u32 	%r36, 0;
+	mov.b64 	%fd24, {%r36, %r35};
+	bra.uni 	BB32_57;
+
+BB32_28:	// s OP v, op > 13
+	setp.gt.s32	%p7, %r7, 15;
+	@%p7 bra 	BB32_32;
+
+	setp.eq.s32	%p11, %r7, 14;
+	@%p11 bra 	BB32_45;
+	bra.uni 	BB32_30;
+
+BB32_45:	// op 14 (s OP v): round s and v to nearest integer, keep the low 32 bits, OR them; 0.0 if the OR is zero else 1.0
+	cvt.rni.s64.f64	%rd18, %fd68;
+	cvt.u32.u64	%r22, %rd18;
+	cvt.rni.s64.f64	%rd19, %fd1;
+	cvt.u32.u64	%r23, %rd19;
+	or.b32  	%r24, %r23, %r22;
+	setp.eq.s32	%p41, %r24, 0;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB32_71;
+
+BB32_81:	// v OP s, op in 4..8
+	setp.gt.s32	%p84, %r7, 5;
+	@%p84 bra 	BB32_85;
+
+	setp.eq.s32	%p88, %r7, 4;
+	@%p88 bra 	BB32_121;
+	bra.uni 	BB32_83;
+
+BB32_121:	// op 4 (v OP s): power with base v and exponent s; mirrors BB32_51 with the operand roles swapped
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r4}, %fd1;	// %r4 = high 32 bits of v (base)
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r5}, %fd68;	// %r5 = high 32 bits of s (exponent)
+	}
+	bfe.u32 	%r66, %r5, 20, 11;	// biased exponent field of s
+	add.s32 	%r67, %r66, -1012;
+	mov.b64 	 %rd27, %fd68;
+	shl.b64 	%rd3, %rd27, %r67;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;	// %p113: shifted bits of s equal the sign-bit-only pattern (presumably "s is an odd integer" - confirm)
+	abs.f64 	%fd51, %fd1;	// %fd51 = |v|
+	// Callseq Start 4
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd51;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd68;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd57, [retval0+0];	// %fd57 = __internal_accurate_pow(|v|, s)
+	
+	//{
+	}// Callseq End 4
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;	// %p2 = sign bit of v is set AND %p113
+	@!%p2 bra 	BB32_123;
+	bra.uni 	BB32_122;
+
+BB32_122:	// flip the sign bit of the pow result
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r68}, %fd57;
+	}
+	xor.b32  	%r69, %r68, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r70, %temp}, %fd57;
+	}
+	mov.b64 	%fd57, {%r70, %r69};
+
+BB32_123:
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB32_126;	// base v == 0: build the result directly from bit patterns
+	bra.uni 	BB32_124;
+
+BB32_126:	// zero base: high word = (%p113 ? high word of v : 0), with 0x7FF00000 OR-ed in when s's high word is negative; low word = 0
+	selp.b32	%r71, %r4, 0, %p113;
+	or.b32  	%r72, %r71, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r73, %r72, %r71, %p119;
+	mov.u32 	%r74, 0;
+	mov.b64 	%fd57, {%r74, %r73};
+	bra.uni 	BB32_127;
+
+BB32_98:	// v OP s, op > 13
+	setp.gt.s32	%p71, %r7, 15;
+	@%p71 bra 	BB32_102;
+
+	setp.eq.s32	%p75, %r7, 14;
+	@%p75 bra 	BB32_115;
+	bra.uni 	BB32_100;
+
+BB32_115:	// op 14 (v OP s): round v and s to nearest integer, keep the low 32 bits, OR them; 0.0 if the OR is zero else 1.0
+	cvt.rni.s64.f64	%rd23, %fd1;
+	cvt.u32.u64	%r60, %rd23;
+	cvt.rni.s64.f64	%rd24, %fd68;
+	cvt.u32.u64	%r61, %rd24;
+	or.b32  	%r62, %r61, %r60;
+	setp.eq.s32	%p105, %r62, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB32_141;
+
+BB32_8:	// s OP v, op in 2..3
+	setp.eq.s32	%p27, %r7, 2;
+	@%p27 bra 	BB32_69;
+	bra.uni 	BB32_9;
+
+BB32_69:	// op 2 (s OP v): v * s
+	mul.f64 	%fd94, %fd1, %fd68;
+	bra.uni 	BB32_71;
+
+BB32_24:	// s OP v, op in 11..13
+	setp.eq.s32	%p14, %r7, 11;
+	@%p14 bra 	BB32_47;
+
+	setp.eq.s32	%p15, %r7, 12;
+	@%p15 bra 	BB32_46;
+	bra.uni 	BB32_26;
+
+BB32_46:	// op 12 (s OP v): max(s, v)
+	max.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB32_71;
+
+BB32_15:	// s OP v, op in 6..8
+	setp.eq.s32	%p21, %r7, 6;
+	@%p21 bra 	BB32_50;
+
+	setp.eq.s32	%p22, %r7, 7;
+	@%p22 bra 	BB32_49;
+	bra.uni 	BB32_17;
+
+BB32_49:	// op 7 (s OP v): 1.0 if v < s (i.e. s > v) else 0.0
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB32_71;
+
+BB32_32:	// s OP v, op > 15
+	setp.eq.s32	%p8, %r7, 16;
+	@%p8 bra 	BB32_44;
+
+	setp.eq.s32	%p9, %r7, 17;
+	@%p9 bra 	BB32_39;
+	bra.uni 	BB32_34;
+
+BB32_39:	// op 17 (s OP v): s - v * floor(s / v), with special cases for a zero divisor and a non-finite quotient
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd94, 0d7FF8000000000000;	// NaN bit pattern, stored when the divisor v is zero
+	@%p36 bra 	BB32_71;
+
+	div.rn.f64 	%fd94, %fd68, %fd1;
+	abs.f64 	%fd72, %fd94;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;	// "greater or unordered" against +inf: true only for a NaN quotient
+	@%p37 bra 	BB32_71;	// NaN quotient is stored as is
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r19}, %fd94;
+	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p38, %r20, 2146435072;	// high word (sign cleared) != 0x7FF00000: quotient is not infinite
+	@%p38 bra 	BB32_43;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r21, %temp}, %fd94;
+	}
+	setp.eq.s32	%p39, %r21, 0;
+	@%p39 bra 	BB32_71;	// low word also zero: quotient is +/-inf and is stored as is
+
+BB32_43:
+	cvt.rmi.f64.f64	%fd73, %fd94;	// floor(s / v)
+	mul.f64 	%fd74, %fd1, %fd73;
+	sub.f64 	%fd94, %fd68, %fd74;
+	bra.uni 	BB32_71;
+
+BB32_78:	// v OP s, op in 2..3
+	setp.eq.s32	%p91, %r7, 2;
+	@%p91 bra 	BB32_139;
+	bra.uni 	BB32_79;
+
+BB32_139:	// op 2 (v OP s): v * s
+	mul.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_94:	// v OP s, op in 11..13
+	setp.eq.s32	%p78, %r7, 11;
+	@%p78 bra 	BB32_117;
+
+	setp.eq.s32	%p79, %r7, 12;
+	@%p79 bra 	BB32_116;
+	bra.uni 	BB32_96;
+
+BB32_116:	// op 12 (v OP s): max(v, s)
+	max.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_85:	// v OP s, op in 6..8
+	setp.eq.s32	%p85, %r7, 6;
+	@%p85 bra 	BB32_120;
+
+	setp.eq.s32	%p86, %r7, 7;
+	@%p86 bra 	BB32_119;
+	bra.uni 	BB32_87;
+
+BB32_119:	// op 7 (v OP s): 1.0 if v > s else 0.0
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB32_141;
+
+BB32_102:	// v OP s, op > 15
+	setp.eq.s32	%p72, %r7, 16;
+	@%p72 bra 	BB32_114;
+
+	setp.eq.s32	%p73, %r7, 17;
+	@%p73 bra 	BB32_109;
+	bra.uni 	BB32_104;
+
+BB32_109:	// op 17 (v OP s): v - s * floor(v / s), with special cases for a zero divisor and a non-finite quotient
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd98, 0d7FF8000000000000;	// NaN bit pattern, stored when the divisor s is zero
+	@%p100 bra 	BB32_141;
+
+	div.rn.f64 	%fd98, %fd1, %fd68;
+	abs.f64 	%fd83, %fd98;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;	// "greater or unordered" against +inf: true only for a NaN quotient
+	@%p101 bra 	BB32_141;	// NaN quotient is stored as is
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r57}, %fd98;
+	}
+	and.b32  	%r58, %r57, 2147483647;
+	setp.ne.s32	%p102, %r58, 2146435072;	// high word (sign cleared) != 0x7FF00000: quotient is not infinite
+	@%p102 bra 	BB32_113;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r59, %temp}, %fd98;
+	}
+	setp.eq.s32	%p103, %r59, 0;
+	@%p103 bra 	BB32_141;	// low word also zero: quotient is +/-inf and is stored as is
+
+BB32_113:
+	cvt.rmi.f64.f64	%fd84, %fd98;	// floor(v / s)
+	mul.f64 	%fd85, %fd84, %fd68;
+	sub.f64 	%fd98, %fd1, %fd85;
+	bra.uni 	BB32_141;
+
+BB32_6:	// s OP v, op is neither 0 nor > 1
+	setp.eq.s32	%p30, %r7, 1;
+	@%p30 bra 	BB32_7;
+	bra.uni 	BB32_71;	// unhandled op code: store the default
+
+BB32_7:	// op 1 (s OP v): s - v
+	sub.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB32_71;
+
+BB32_22:
+	setp.eq.s32	%p18, %r7, 10;
+	@%p18 bra 	BB32_23;
+	bra.uni 	BB32_71;
+
+BB32_23:	// op 10 (s OP v): 1.0 if v != s (or either is NaN) else 0.0
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB32_71;
+
+BB32_13:
+	setp.eq.s32	%p25, %r7, 5;
+	@%p25 bra 	BB32_14;
+	bra.uni 	BB32_71;
+
+BB32_14:	// op 5 (s OP v): 1.0 if v > s (i.e. s < v) else 0.0
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd94, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB32_71;
+
+BB32_30:
+	setp.eq.s32	%p12, %r7, 15;
+	@%p12 bra 	BB32_31;
+	bra.uni 	BB32_71;
+
+BB32_31:	// op 15 (s OP v): 1.0 - v * s
+	mul.f64 	%fd76, %fd1, %fd68;
+	mov.f64 	%fd77, 0d3FF0000000000000;
+	sub.f64 	%fd94, %fd77, %fd76;
+	bra.uni 	BB32_71;
+
+BB32_9:
+	setp.eq.s32	%p28, %r7, 3;
+	@%p28 bra 	BB32_10;
+	bra.uni 	BB32_71;
+
+BB32_10:	// op 3 (s OP v): s / v
+	div.rn.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB32_71;
+
+BB32_47:	// op 11 (s OP v): min(s, v)
+	min.f64 	%fd94, %fd68, %fd1;
+	bra.uni 	BB32_71;
+
+BB32_26:
+	setp.eq.s32	%p16, %r7, 13;
+	@%p16 bra 	BB32_27;
+	bra.uni 	BB32_71;
+
+BB32_27:	// op 13 (s OP v): round s and v to nearest integer, keep the low 32 bits, AND them; 0.0 if the AND is zero else 1.0
+	cvt.rni.s64.f64	%rd20, %fd68;
+	cvt.u32.u64	%r25, %rd20;
+	cvt.rni.s64.f64	%rd21, %fd1;
+	cvt.u32.u64	%r26, %rd21;
+	and.b32  	%r27, %r26, %r25;
+	setp.eq.s32	%p42, %r27, 0;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB32_71;
+
+BB32_50:	// op 6 (s OP v): 0.0 if v < s or either is NaN, else 1.0 (i.e. 1.0 when s <= v)
+	setp.ltu.f64	%p47, %fd1, %fd68;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p47;
+	bra.uni 	BB32_71;
+
+BB32_17:
+	setp.eq.s32	%p23, %r7, 8;
+	@%p23 bra 	BB32_18;
+	bra.uni 	BB32_71;
+
+BB32_18:	// op 8 (s OP v): 0.0 if v > s or either is NaN, else 1.0 (i.e. 1.0 when s >= v)
+	setp.gtu.f64	%p45, %fd1, %fd68;
+	selp.f64	%fd94, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB32_71;
+
+BB32_44:	// op 16 (s OP v): s - v when s compares not-equal to 0 (NaN included), otherwise 0.0
+	setp.neu.f64	%p40, %fd68, 0d0000000000000000;
+	sub.f64 	%fd75, %fd68, %fd1;
+	selp.f64	%fd94, %fd75, 0d0000000000000000, %p40;
+	bra.uni 	BB32_71;
+
+BB32_34:
+	setp.ne.s32	%p10, %r7, 18;
+	@%p10 bra 	BB32_71;	// op code above 18: store the default
+
+	div.rn.f64 	%fd94, %fd68, %fd1;	// op 18 (s OP v): floor(s / v); a NaN or infinite quotient is stored unrounded
+	abs.f64 	%fd70, %fd94;
+	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
+	@%p31 bra 	BB32_71;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r16}, %fd94;
+	}
+	and.b32  	%r17, %r16, 2147483647;
+	setp.ne.s32	%p32, %r17, 2146435072;
+	@%p32 bra 	BB32_38;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r18, %temp}, %fd94;
+	}
+	setp.eq.s32	%p33, %r18, 0;
+	@%p33 bra 	BB32_71;
+
+BB32_38:
+	cvt.rmi.f64.f64	%fd94, %fd94;	// round toward minus infinity
+	bra.uni 	BB32_71;
+
+BB32_76:	// v OP s, op is neither 0 nor > 1
+	setp.eq.s32	%p94, %r7, 1;
+	@%p94 bra 	BB32_77;
+	bra.uni 	BB32_141;	// unhandled op code: store the default
+
+BB32_77:	// op 1 (v OP s): v - s
+	sub.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_92:
+	setp.eq.s32	%p82, %r7, 10;
+	@%p82 bra 	BB32_93;
+	bra.uni 	BB32_141;
+
+BB32_93:	// op 10 (v OP s): 1.0 if v != s (or either is NaN) else 0.0
+	setp.neu.f64	%p107, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p107;
+	bra.uni 	BB32_141;
+
+BB32_83:
+	setp.eq.s32	%p89, %r7, 5;
+	@%p89 bra 	BB32_84;
+	bra.uni 	BB32_141;
+
+BB32_84:	// op 5 (v OP s): 1.0 if v < s else 0.0
+	setp.lt.f64	%p112, %fd1, %fd68;
+	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p112;
+	bra.uni 	BB32_141;
+
+BB32_100:
+	setp.eq.s32	%p76, %r7, 15;
+	@%p76 bra 	BB32_101;
+	bra.uni 	BB32_141;
+
+BB32_101:	// op 15 (v OP s): 1.0 - v * s
+	mul.f64 	%fd87, %fd1, %fd68;
+	mov.f64 	%fd88, 0d3FF0000000000000;
+	sub.f64 	%fd98, %fd88, %fd87;
+	bra.uni 	BB32_141;
+
+BB32_79:
+	setp.eq.s32	%p92, %r7, 3;
+	@%p92 bra 	BB32_80;
+	bra.uni 	BB32_141;
+
+BB32_80:	// op 3 (v OP s): v / s
+	div.rn.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_117:	// op 11 (v OP s): min(v, s)
+	min.f64 	%fd98, %fd1, %fd68;
+	bra.uni 	BB32_141;
+
+BB32_96:
+	setp.eq.s32	%p80, %r7, 13;
+	@%p80 bra 	BB32_97;
+	bra.uni 	BB32_141;
+
+BB32_97:	// op 13 (v OP s): round v and s to nearest integer, keep the low 32 bits, AND them; 0.0 if the AND is zero else 1.0
+	cvt.rni.s64.f64	%rd25, %fd1;
+	cvt.u32.u64	%r63, %rd25;
+	cvt.rni.s64.f64	%rd26, %fd68;
+	cvt.u32.u64	%r64, %rd26;
+	and.b32  	%r65, %r64, %r63;
+	setp.eq.s32	%p106, %r65, 0;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p106;
+	bra.uni 	BB32_141;
+
+BB32_120:	// op 6 (v OP s): 0.0 if v > s or either is NaN, else 1.0 (i.e. 1.0 when v <= s)
+	setp.gtu.f64	%p111, %fd1, %fd68;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p111;
+	bra.uni 	BB32_141;
+
+BB32_87:
+	setp.eq.s32	%p87, %r7, 8;
+	@%p87 bra 	BB32_88;
+	bra.uni 	BB32_141;
+
+BB32_88:	// op 8 (v OP s): 0.0 if v < s or either is NaN, else 1.0 (i.e. 1.0 when v >= s)
+	setp.ltu.f64	%p109, %fd1, %fd68;
+	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p109;
+	bra.uni 	BB32_141;
+
+BB32_114:	// op 16 (v OP s): v - s when v compares not-equal to 0 (NaN included), otherwise 0.0
+	setp.neu.f64	%p104, %fd1, 0d0000000000000000;
+	sub.f64 	%fd86, %fd1, %fd68;
+	selp.f64	%fd98, %fd86, 0d0000000000000000, %p104;
+	bra.uni 	BB32_141;
+
+BB32_104:
+	setp.ne.s32	%p74, %r7, 18;
+	@%p74 bra 	BB32_141;	// op code above 18: store the default
+
+	div.rn.f64 	%fd98, %fd1, %fd68;	// op 18 (v OP s): floor(v / s); a NaN or infinite quotient is stored unrounded
+	abs.f64 	%fd81, %fd98;
+	setp.gtu.f64	%p95, %fd81, 0d7FF0000000000000;
+	@%p95 bra 	BB32_141;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r54}, %fd98;
+	}
+	and.b32  	%r55, %r54, 2147483647;
+	setp.ne.s32	%p96, %r55, 2146435072;
+	@%p96 bra 	BB32_108;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r56, %temp}, %fd98;
+	}
+	setp.eq.s32	%p97, %r56, 0;
+	@%p97 bra 	BB32_141;
+
+BB32_108:
+	cvt.rmi.f64.f64	%fd98, %fd98;	// round toward minus infinity
+	bra.uni 	BB32_141;
+
+BB32_54:	// pow (s OP v), nonzero base s
+	setp.gt.s32	%p52, %r2, -1;
+	@%p52 bra 	BB32_57;	// sign bit of s clear: keep the pow result
+
+	cvt.rzi.f64.f64	%fd78, %fd1;
+	setp.neu.f64	%p53, %fd78, %fd1;	// trunc(v) != v: exponent is not an integer
+	selp.f64	%fd24, 0dFFF8000000000000, %fd24, %p53;	// negative base with non-integer exponent yields a NaN bit pattern
+
+BB32_57:
+	add.f64 	%fd93, %fd1, %fd68;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r37}, %fd93;
+	}
+	and.b32  	%r38, %r37, 2146435072;
+	setp.ne.s32	%p56, %r38, 2146435072;	// exponent field of (v + s) not all ones: the sum is finite
+	@%p56 bra 	BB32_58;	// finite sum: use the pow result as computed
+
+	setp.gtu.f64	%p57, %fd18, 0d7FF0000000000000;
+	@%p57 bra 	BB32_68;	// |s| is NaN: result candidate is v + s
+
+	abs.f64 	%fd79, %fd1;
+	setp.gtu.f64	%p58, %fd79, 0d7FF0000000000000;
+	@%p58 bra 	BB32_68;	// v is NaN: result candidate is v + s
+
+	and.b32  	%r39, %r3, 2147483647;
+	setp.ne.s32	%p59, %r39, 2146435072;
+	@%p59 bra 	BB32_63;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r40, %temp}, %fd1;
+	}
+	setp.eq.s32	%p60, %r40, 0;
+	@%p60 bra 	BB32_67;	// exponent v is +/-inf
+
+BB32_63:
+	and.b32  	%r41, %r2, 2147483647;
+	setp.ne.s32	%p61, %r41, 2146435072;
+	@%p61 bra 	BB32_64;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r42, %temp}, %fd68;
+	}
+	setp.ne.s32	%p62, %r42, 0;
+	mov.f64 	%fd93, %fd24;
+	@%p62 bra 	BB32_68;
+
+	shr.s32 	%r43, %r3, 31;	// base s is +/-inf: high word becomes 0 when v is negative, 0x7FF00000 otherwise
+	and.b32  	%r44, %r43, -2146435072;
+	add.s32 	%r45, %r44, 2146435072;
+	or.b32  	%r46, %r45, -2147483648;
+	selp.b32	%r47, %r46, %r45, %p1;	// sign bit is set when %p1 holds
+	mov.u32 	%r48, 0;
+	mov.b64 	%fd93, {%r48, %r47};
+	bra.uni 	BB32_68;
+
+BB32_58:
+	mov.f64 	%fd93, %fd24;
+
+BB32_68:	// final pow override: exponent v == 0 or base s == 1 yields exactly 1.0
+	setp.eq.f64	%p66, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p67, %fd68, 0d3FF0000000000000;
+	or.pred  	%p68, %p67, %p66;
+	selp.f64	%fd94, 0d3FF0000000000000, %fd93, %p68;
+
+BB32_71:	// store for the "s OP v" path
+	st.global.f64 	[%rd1], %fd94;
+	bra.uni 	BB32_142;
+
+BB32_124:	// pow (v OP s), nonzero base v
+	setp.gt.s32	%p116, %r4, -1;
+	@%p116 bra 	BB32_127;	// sign bit of v clear: keep the pow result
+
+	cvt.rzi.f64.f64	%fd89, %fd68;
+	setp.neu.f64	%p117, %fd89, %fd68;	// trunc(s) != s: exponent is not an integer
+	selp.f64	%fd57, 0dFFF8000000000000, %fd57, %p117;	// negative base with non-integer exponent yields a NaN bit pattern
+
+BB32_127:
+	add.f64 	%fd97, %fd1, %fd68;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r75}, %fd97;
+	}
+	and.b32  	%r76, %r75, 2146435072;
+	setp.ne.s32	%p120, %r76, 2146435072;	// exponent field of (v + s) not all ones: the sum is finite
+	@%p120 bra 	BB32_128;	// finite sum: use the pow result as computed
+
+	setp.gtu.f64	%p121, %fd51, 0d7FF0000000000000;
+	@%p121 bra 	BB32_138;	// |v| is NaN: result candidate is v + s
+
+	abs.f64 	%fd90, %fd68;
+	setp.gtu.f64	%p122, %fd90, 0d7FF0000000000000;
+	@%p122 bra 	BB32_138;	// s is NaN: result candidate is v + s
+
+	and.b32  	%r77, %r5, 2147483647;
+	setp.ne.s32	%p123, %r77, 2146435072;
+	@%p123 bra 	BB32_133;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r78, %temp}, %fd68;
+	}
+	setp.eq.s32	%p124, %r78, 0;
+	@%p124 bra 	BB32_137;	// exponent s is +/-inf
+
+BB32_133:
+	and.b32  	%r79, %r4, 2147483647;
+	setp.ne.s32	%p125, %r79, 2146435072;
+	@%p125 bra 	BB32_134;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r80, %temp}, %fd1;
+	}
+	setp.ne.s32	%p126, %r80, 0;
+	mov.f64 	%fd97, %fd57;
+	@%p126 bra 	BB32_138;
+
+	shr.s32 	%r81, %r5, 31;	// base v is +/-inf: high word becomes 0 when s is negative, 0x7FF00000 otherwise
+	and.b32  	%r82, %r81, -2146435072;
+	add.s32 	%r83, %r82, 2146435072;
+	or.b32  	%r84, %r83, -2147483648;
+	selp.b32	%r85, %r84, %r83, %p2;	// sign bit is set when %p2 holds
+	mov.u32 	%r86, 0;
+	mov.b64 	%fd97, {%r86, %r85};
+	bra.uni 	BB32_138;
+
+BB32_128:
+	mov.f64 	%fd97, %fd57;
+
+BB32_138:	// final pow override: exponent s == 0 or base v == 1 yields exactly 1.0
+	setp.eq.f64	%p130, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p131, %fd1, 0d3FF0000000000000;
+	or.pred  	%p132, %p131, %p130;
+	selp.f64	%fd98, 0d3FF0000000000000, %fd97, %p132;
+
+BB32_141:	// store for the "v OP s" path
+	st.global.f64 	[%rd1], %fd98;
+
+BB32_142:	// common exit, also reached by threads that failed the bounds check
+	bar.sync 	0;
+	ret;
+
+BB32_64:
+	mov.f64 	%fd93, %fd24;
+	bra.uni 	BB32_68;
+
+BB32_134:
+	mov.f64 	%fd97, %fd57;
+	bra.uni 	BB32_138;
+
+BB32_67:	// pow (s OP v) with exponent v = +/-inf: +inf if |s| > 1 else 0, swapped when v is negative; 1.0 when s == -1
+	setp.gt.f64	%p63, %fd18, 0d3FF0000000000000;
+	selp.b32	%r49, 2146435072, 0, %p63;
+	xor.b32  	%r50, %r49, 2146435072;
+	setp.lt.s32	%p64, %r3, 0;
+	selp.b32	%r51, %r50, %r49, %p64;
+	setp.eq.f64	%p65, %fd68, 0dBFF0000000000000;
+	selp.b32	%r52, 1072693248, %r51, %p65;
+	mov.u32 	%r53, 0;
+	mov.b64 	%fd93, {%r53, %r52};
+	bra.uni 	BB32_68;
+
+BB32_137:	// pow (v OP s) with exponent s = +/-inf: +inf if |v| > 1 else 0, swapped when s is negative; 1.0 when v == -1
+	setp.gt.f64	%p127, %fd51, 0d3FF0000000000000;
+	selp.b32	%r87, 2146435072, 0, %p127;
+	xor.b32  	%r88, %r87, 2146435072;
+	setp.lt.s32	%p128, %r5, 0;
+	selp.b32	%r89, %r88, %r87, %p128;
+	setp.eq.f64	%p129, %fd1, 0dBFF0000000000000;
+	selp.b32	%r90, 1072693248, %r89, %p129;
+	mov.u32 	%r91, 0;
+	mov.b64 	%fd97, {%r91, %r90};
+	bra.uni 	BB32_138;
+}
+
+	// .globl	sparse_dense_matrix_scalar_op_f
+.visible .entry sparse_dense_matrix_scalar_op_f(
+	.param .u64 sparse_dense_matrix_scalar_op_f_param_0,
+	.param .u64 sparse_dense_matrix_scalar_op_f_param_1,
+	.param .u64 sparse_dense_matrix_scalar_op_f_param_2,
+	.param .f64 sparse_dense_matrix_scalar_op_f_param_3,
+	.param .u64 sparse_dense_matrix_scalar_op_f_param_4,
+	.param .u32 sparse_dense_matrix_scalar_op_f_param_5,
+	.param .u32 sparse_dense_matrix_scalar_op_f_param_6,
+	.param .u32 sparse_dense_matrix_scalar_op_f_param_7,
+	.param .u32 sparse_dense_matrix_scalar_op_f_param_8
+)
+{
+	.reg .pred 	%p<139>;
+	.reg .f32 	%f<267>;
+	.reg .b32 	%r<62>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<23>;
+
+
+	ld.param.u64 	%rd2, [sparse_dense_matrix_scalar_op_f_param_0];
+	ld.param.u64 	%rd3, [sparse_dense_matrix_scalar_op_f_param_1];
+	ld.param.u64 	%rd4, [sparse_dense_matrix_scalar_op_f_param_2];
+	ld.param.f64 	%fd1, [sparse_dense_matrix_scalar_op_f_param_3];
+	ld.param.u64 	%rd5, [sparse_dense_matrix_scalar_op_f_param_4];
+	ld.param.u32 	%r5, [sparse_dense_matrix_scalar_op_f_param_5];
+	ld.param.u32 	%r2, [sparse_dense_matrix_scalar_op_f_param_6];
+	ld.param.u32 	%r3, [sparse_dense_matrix_scalar_op_f_param_7];
+	ld.param.u32 	%r4, [sparse_dense_matrix_scalar_op_f_param_8];
+	cvt.rn.f32.f64	%f1, %fd1;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r7, %r8;
+	setp.ge.s32	%p3, %r1, %r5;
+	@%p3 bra 	BB33_126;
+
+	cvta.to.global.u64 	%rd6, %rd5;
+	cvta.to.global.u64 	%rd7, %rd4;
+	mul.wide.s32 	%rd8, %r1, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f2, [%rd9];
+	cvta.to.global.u64 	%rd10, %rd2;
+	add.s64 	%rd11, %rd10, %rd8;
+	ld.global.u32 	%r9, [%rd11];
+	cvta.to.global.u64 	%rd12, %rd3;
+	add.s64 	%rd13, %rd12, %rd8;
+	ld.global.u32 	%r10, [%rd13];
+	mad.lo.s32 	%r11, %r9, %r2, %r10;
+	mul.wide.s32 	%rd14, %r11, 4;
+	add.s64 	%rd1, %rd6, %rd14;
+	setp.eq.s32	%p4, %r4, 0;
+	@%p4 bra 	BB33_64;
+
+	mov.f32 	%f262, 0f7F7FFFFF;
+	setp.gt.s32	%p5, %r3, 8;
+	@%p5 bra 	BB33_19;
+
+	setp.gt.s32	%p19, %r3, 3;
+	@%p19 bra 	BB33_11;
+
+	setp.gt.s32	%p26, %r3, 1;
+	@%p26 bra 	BB33_8;
+
+	setp.eq.s32	%p29, %r3, 0;
+	@%p29 bra 	BB33_62;
+	bra.uni 	BB33_6;
+
+BB33_62:
+	add.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_64:
+	mov.f32 	%f266, 0f7F7FFFFF;
+	setp.gt.s32	%p72, %r3, 8;
+	@%p72 bra 	BB33_81;
+
+	setp.gt.s32	%p86, %r3, 3;
+	@%p86 bra 	BB33_73;
+
+	setp.gt.s32	%p93, %r3, 1;
+	@%p93 bra 	BB33_70;
+
+	setp.eq.s32	%p96, %r3, 0;
+	@%p96 bra 	BB33_124;
+	bra.uni 	BB33_68;
+
+BB33_124:
+	add.f32 	%f266, %f1, %f2;
+	bra.uni 	BB33_125;
+
+BB33_19:
+	setp.gt.s32	%p6, %r3, 13;
+	@%p6 bra 	BB33_28;
+
+	setp.gt.s32	%p13, %r3, 10;
+	@%p13 bra 	BB33_24;
+
+	setp.eq.s32	%p17, %r3, 9;
+	@%p17 bra 	BB33_44;
+	bra.uni 	BB33_22;
+
+BB33_44:
+	setp.eq.f32	%p40, %f1, %f2;
+	selp.f32	%f262, 0f3F800000, 0f00000000, %p40;
+	bra.uni 	BB33_63;
+
+BB33_81:
+	setp.gt.s32	%p73, %r3, 13;
+	@%p73 bra 	BB33_90;
+
+	setp.gt.s32	%p80, %r3, 10;
+	@%p80 bra 	BB33_86;
+
+	setp.eq.s32	%p84, %r3, 9;
+	@%p84 bra 	BB33_106;
+	bra.uni 	BB33_84;
+
+BB33_106:
+	setp.eq.f32	%p107, %f2, %f1;
+	selp.f32	%f266, 0f3F800000, 0f00000000, %p107;
+	bra.uni 	BB33_125;
+
+BB33_11:
+	setp.gt.s32	%p20, %r3, 5;
+	@%p20 bra 	BB33_15;
+
+	setp.eq.s32	%p24, %r3, 4;
+	@%p24 bra 	BB33_47;
+	bra.uni 	BB33_13;
+
+BB33_47:
+	mul.f32 	%f88, %f2, 0f3F000000;
+	cvt.rzi.f32.f32	%f89, %f88;
+	fma.rn.f32 	%f90, %f89, 0fC0000000, %f2;
+	abs.f32 	%f19, %f90;
+	abs.f32 	%f20, %f1;
+	setp.lt.f32	%p45, %f20, 0f00800000;
+	mul.f32 	%f91, %f20, 0f4B800000;
+	selp.f32	%f92, 0fC3170000, 0fC2FE0000, %p45;
+	selp.f32	%f93, %f91, %f20, %p45;
+	mov.b32 	 %r18, %f93;
+	and.b32  	%r19, %r18, 8388607;
+	or.b32  	%r20, %r19, 1065353216;
+	mov.b32 	 %f94, %r20;
+	shr.u32 	%r21, %r18, 23;
+	cvt.rn.f32.u32	%f95, %r21;
+	add.f32 	%f96, %f92, %f95;
+	setp.gt.f32	%p46, %f94, 0f3FB504F3;
+	mul.f32 	%f97, %f94, 0f3F000000;
+	add.f32 	%f98, %f96, 0f3F800000;
+	selp.f32	%f99, %f97, %f94, %p46;
+	selp.f32	%f100, %f98, %f96, %p46;
+	add.f32 	%f101, %f99, 0fBF800000;
+	add.f32 	%f87, %f99, 0f3F800000;
+	// inline asm
+	rcp.approx.ftz.f32 %f86,%f87;
+	// inline asm
+	add.f32 	%f102, %f101, %f101;
+	mul.f32 	%f103, %f86, %f102;
+	mul.f32 	%f104, %f103, %f103;
+	mov.f32 	%f105, 0f3C4CAF63;
+	mov.f32 	%f106, 0f3B18F0FE;
+	fma.rn.f32 	%f107, %f106, %f104, %f105;
+	mov.f32 	%f108, 0f3DAAAABD;
+	fma.rn.f32 	%f109, %f107, %f104, %f108;
+	mul.rn.f32 	%f110, %f109, %f104;
+	mul.rn.f32 	%f111, %f110, %f103;
+	sub.f32 	%f112, %f101, %f103;
+	neg.f32 	%f113, %f103;
+	add.f32 	%f114, %f112, %f112;
+	fma.rn.f32 	%f115, %f113, %f101, %f114;
+	mul.rn.f32 	%f116, %f86, %f115;
+	add.f32 	%f117, %f111, %f103;
+	sub.f32 	%f118, %f103, %f117;
+	add.f32 	%f119, %f111, %f118;
+	add.f32 	%f120, %f116, %f119;
+	add.f32 	%f121, %f117, %f120;
+	sub.f32 	%f122, %f117, %f121;
+	add.f32 	%f123, %f120, %f122;
+	mov.f32 	%f124, 0f3F317200;
+	mul.rn.f32 	%f125, %f100, %f124;
+	mov.f32 	%f126, 0f35BFBE8E;
+	mul.rn.f32 	%f127, %f100, %f126;
+	add.f32 	%f128, %f125, %f121;
+	sub.f32 	%f129, %f125, %f128;
+	add.f32 	%f130, %f121, %f129;
+	add.f32 	%f131, %f123, %f130;
+	add.f32 	%f132, %f127, %f131;
+	add.f32 	%f133, %f128, %f132;
+	sub.f32 	%f134, %f128, %f133;
+	add.f32 	%f135, %f132, %f134;
+	abs.f32 	%f21, %f2;
+	setp.gt.f32	%p47, %f21, 0f77F684DF;
+	mul.f32 	%f136, %f2, 0f39000000;
+	selp.f32	%f137, %f136, %f2, %p47;
+	mul.rn.f32 	%f138, %f137, %f133;
+	neg.f32 	%f139, %f138;
+	fma.rn.f32 	%f140, %f137, %f133, %f139;
+	fma.rn.f32 	%f141, %f137, %f135, %f140;
+	mov.f32 	%f142, 0f00000000;
+	fma.rn.f32 	%f143, %f142, %f133, %f141;
+	add.rn.f32 	%f144, %f138, %f143;
+	neg.f32 	%f145, %f144;
+	add.rn.f32 	%f146, %f138, %f145;
+	add.rn.f32 	%f147, %f146, %f143;
+	mov.b32 	 %r22, %f144;
+	setp.eq.s32	%p48, %r22, 1118925336;
+	add.s32 	%r23, %r22, -1;
+	mov.b32 	 %f148, %r23;
+	add.f32 	%f149, %f147, 0f37000000;
+	selp.f32	%f150, %f148, %f144, %p48;
+	selp.f32	%f22, %f149, %f147, %p48;
+	mul.f32 	%f151, %f150, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f152, %f151;
+	mov.f32 	%f153, 0fBF317200;
+	fma.rn.f32 	%f154, %f152, %f153, %f150;
+	mov.f32 	%f155, 0fB5BFBE8E;
+	fma.rn.f32 	%f156, %f152, %f155, %f154;
+	mul.f32 	%f157, %f156, 0f3FB8AA3B;
+	ex2.approx.ftz.f32 	%f158, %f157;
+	add.f32 	%f159, %f152, 0f00000000;
+	ex2.approx.f32 	%f160, %f159;
+	mul.f32 	%f161, %f158, %f160;
+	setp.lt.f32	%p49, %f150, 0fC2D20000;
+	selp.f32	%f162, 0f00000000, %f161, %p49;
+	setp.gt.f32	%p50, %f150, 0f42D20000;
+	selp.f32	%f259, 0f7F800000, %f162, %p50;
+	setp.eq.f32	%p51, %f259, 0f7F800000;
+	@%p51 bra 	BB33_49;
+
+	fma.rn.f32 	%f259, %f259, %f22, %f259;
+
+BB33_49:
+	setp.lt.f32	%p52, %f1, 0f00000000;
+	setp.eq.f32	%p53, %f19, 0f3F800000;
+	and.pred  	%p1, %p52, %p53;
+	mov.b32 	 %r24, %f259;
+	xor.b32  	%r25, %r24, -2147483648;
+	mov.b32 	 %f163, %r25;
+	selp.f32	%f261, %f163, %f259, %p1;
+	setp.eq.f32	%p54, %f1, 0f00000000;
+	@%p54 bra 	BB33_52;
+	bra.uni 	BB33_50;
+
+BB33_52:
+	add.f32 	%f165, %f1, %f1;
+	mov.b32 	 %r26, %f165;
+	selp.b32	%r27, %r26, 0, %p53;
+	or.b32  	%r28, %r27, 2139095040;
+	setp.lt.f32	%p58, %f2, 0f00000000;
+	selp.b32	%r29, %r28, %r27, %p58;
+	mov.b32 	 %f261, %r29;
+	bra.uni 	BB33_53;
+
+BB33_28:
+	setp.gt.s32	%p7, %r3, 15;
+	@%p7 bra 	BB33_32;
+
+	setp.eq.s32	%p11, %r3, 14;
+	@%p11 bra 	BB33_41;
+	bra.uni 	BB33_30;
+
+BB33_41:
+	cvt.rni.s64.f32	%rd15, %f1;
+	cvt.u32.u64	%r12, %rd15;
+	cvt.rni.s64.f32	%rd16, %f2;
+	cvt.u32.u64	%r13, %rd16;
+	or.b32  	%r14, %r13, %r12;
+	setp.eq.s32	%p37, %r14, 0;
+	selp.f32	%f262, 0f00000000, 0f3F800000, %p37;
+	bra.uni 	BB33_63;
+
+BB33_73:
+	setp.gt.s32	%p87, %r3, 5;
+	@%p87 bra 	BB33_77;
+
+	setp.eq.s32	%p91, %r3, 4;
+	@%p91 bra 	BB33_109;
+	bra.uni 	BB33_75;
+
+BB33_109:
+	mul.f32 	%f179, %f1, 0f3F000000;
+	cvt.rzi.f32.f32	%f180, %f179;
+	fma.rn.f32 	%f181, %f180, 0fC0000000, %f1;
+	abs.f32 	%f56, %f181;
+	abs.f32 	%f57, %f2;
+	setp.lt.f32	%p112, %f57, 0f00800000;
+	mul.f32 	%f182, %f57, 0f4B800000;
+	selp.f32	%f183, 0fC3170000, 0fC2FE0000, %p112;
+	selp.f32	%f184, %f182, %f57, %p112;
+	mov.b32 	 %r43, %f184;
+	and.b32  	%r44, %r43, 8388607;
+	or.b32  	%r45, %r44, 1065353216;
+	mov.b32 	 %f185, %r45;
+	shr.u32 	%r46, %r43, 23;
+	cvt.rn.f32.u32	%f186, %r46;
+	add.f32 	%f187, %f183, %f186;
+	setp.gt.f32	%p113, %f185, 0f3FB504F3;
+	mul.f32 	%f188, %f185, 0f3F000000;
+	add.f32 	%f189, %f187, 0f3F800000;
+	selp.f32	%f190, %f188, %f185, %p113;
+	selp.f32	%f191, %f189, %f187, %p113;
+	add.f32 	%f192, %f190, 0fBF800000;
+	add.f32 	%f178, %f190, 0f3F800000;
+	// inline asm
+	rcp.approx.ftz.f32 %f177,%f178;
+	// inline asm
+	add.f32 	%f193, %f192, %f192;
+	mul.f32 	%f194, %f177, %f193;
+	mul.f32 	%f195, %f194, %f194;
+	mov.f32 	%f196, 0f3C4CAF63;
+	mov.f32 	%f197, 0f3B18F0FE;
+	fma.rn.f32 	%f198, %f197, %f195, %f196;
+	mov.f32 	%f199, 0f3DAAAABD;
+	fma.rn.f32 	%f200, %f198, %f195, %f199;
+	mul.rn.f32 	%f201, %f200, %f195;
+	mul.rn.f32 	%f202, %f201, %f194;
+	sub.f32 	%f203, %f192, %f194;
+	neg.f32 	%f204, %f194;
+	add.f32 	%f205, %f203, %f203;
+	fma.rn.f32 	%f206, %f204, %f192, %f205;
+	mul.rn.f32 	%f207, %f177, %f206;
+	add.f32 	%f208, %f202, %f194;
+	sub.f32 	%f209, %f194, %f208;
+	add.f32 	%f210, %f202, %f209;
+	add.f32 	%f211, %f207, %f210;
+	add.f32 	%f212, %f208, %f211;
+	sub.f32 	%f213, %f208, %f212;
+	add.f32 	%f214, %f211, %f213;
+	mov.f32 	%f215, 0f3F317200;
+	mul.rn.f32 	%f216, %f191, %f215;
+	mov.f32 	%f217, 0f35BFBE8E;
+	mul.rn.f32 	%f218, %f191, %f217;
+	add.f32 	%f219, %f216, %f212;
+	sub.f32 	%f220, %f216, %f219;
+	add.f32 	%f221, %f212, %f220;
+	add.f32 	%f222, %f214, %f221;
+	add.f32 	%f223, %f218, %f222;
+	add.f32 	%f224, %f219, %f223;
+	sub.f32 	%f225, %f219, %f224;
+	add.f32 	%f226, %f223, %f225;
+	abs.f32 	%f58, %f1;
+	setp.gt.f32	%p114, %f58, 0f77F684DF;
+	mul.f32 	%f227, %f1, 0f39000000;
+	selp.f32	%f228, %f227, %f1, %p114;
+	mul.rn.f32 	%f229, %f228, %f224;
+	neg.f32 	%f230, %f229;
+	fma.rn.f32 	%f231, %f228, %f224, %f230;
+	fma.rn.f32 	%f232, %f228, %f226, %f231;
+	mov.f32 	%f233, 0f00000000;
+	fma.rn.f32 	%f234, %f233, %f224, %f232;
+	add.rn.f32 	%f235, %f229, %f234;
+	neg.f32 	%f236, %f235;
+	add.rn.f32 	%f237, %f229, %f236;
+	add.rn.f32 	%f238, %f237, %f234;
+	mov.b32 	 %r47, %f235;
+	setp.eq.s32	%p115, %r47, 1118925336;
+	add.s32 	%r48, %r47, -1;
+	mov.b32 	 %f239, %r48;
+	add.f32 	%f240, %f238, 0f37000000;
+	selp.f32	%f241, %f239, %f235, %p115;
+	selp.f32	%f59, %f240, %f238, %p115;
+	mul.f32 	%f242, %f241, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f243, %f242;
+	mov.f32 	%f244, 0fBF317200;
+	fma.rn.f32 	%f245, %f243, %f244, %f241;
+	mov.f32 	%f246, 0fB5BFBE8E;
+	fma.rn.f32 	%f247, %f243, %f246, %f245;
+	mul.f32 	%f248, %f247, 0f3FB8AA3B;
+	ex2.approx.ftz.f32 	%f249, %f248;
+	add.f32 	%f250, %f243, 0f00000000;
+	ex2.approx.f32 	%f251, %f250;
+	mul.f32 	%f252, %f249, %f251;
+	setp.lt.f32	%p116, %f241, 0fC2D20000;
+	selp.f32	%f253, 0f00000000, %f252, %p116;
+	setp.gt.f32	%p117, %f241, 0f42D20000;
+	selp.f32	%f263, 0f7F800000, %f253, %p117;
+	setp.eq.f32	%p118, %f263, 0f7F800000;
+	@%p118 bra 	BB33_111;
+
+	fma.rn.f32 	%f263, %f263, %f59, %f263;
+
+BB33_111:
+	setp.lt.f32	%p119, %f2, 0f00000000;
+	setp.eq.f32	%p120, %f56, 0f3F800000;
+	and.pred  	%p2, %p119, %p120;
+	mov.b32 	 %r49, %f263;
+	xor.b32  	%r50, %r49, -2147483648;
+	mov.b32 	 %f254, %r50;
+	selp.f32	%f265, %f254, %f263, %p2;
+	setp.eq.f32	%p121, %f2, 0f00000000;
+	@%p121 bra 	BB33_114;
+	bra.uni 	BB33_112;
+
+BB33_114:
+	add.f32 	%f256, %f2, %f2;
+	mov.b32 	 %r51, %f256;
+	selp.b32	%r52, %r51, 0, %p120;
+	or.b32  	%r53, %r52, 2139095040;
+	setp.lt.f32	%p125, %f1, 0f00000000;
+	selp.b32	%r54, %r53, %r52, %p125;
+	mov.b32 	 %f265, %r54;
+	bra.uni 	BB33_115;
+
+BB33_90:
+	setp.gt.s32	%p74, %r3, 15;
+	@%p74 bra 	BB33_94;
+
+	setp.eq.s32	%p78, %r3, 14;
+	@%p78 bra 	BB33_103;
+	bra.uni 	BB33_92;
+
+BB33_103:
+	cvt.rni.s64.f32	%rd19, %f2;
+	cvt.u32.u64	%r37, %rd19;
+	cvt.rni.s64.f32	%rd20, %f1;
+	cvt.u32.u64	%r38, %rd20;
+	or.b32  	%r39, %r38, %r37;
+	setp.eq.s32	%p104, %r39, 0;
+	selp.f32	%f266, 0f00000000, 0f3F800000, %p104;
+	bra.uni 	BB33_125;
+
+BB33_8:
+	setp.eq.s32	%p27, %r3, 2;
+	@%p27 bra 	BB33_61;
+	bra.uni 	BB33_9;
+
+BB33_61:
+	mul.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_24:
+	setp.eq.s32	%p14, %r3, 11;
+	@%p14 bra 	BB33_43;
+
+	setp.eq.s32	%p15, %r3, 12;
+	@%p15 bra 	BB33_42;
+	bra.uni 	BB33_26;
+
+BB33_42:
+	max.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_15:
+	setp.eq.s32	%p21, %r3, 6;
+	@%p21 bra 	BB33_46;
+
+	setp.eq.s32	%p22, %r3, 7;
+	@%p22 bra 	BB33_45;
+	bra.uni 	BB33_17;
+
+BB33_45:
+	setp.gt.f32	%p42, %f1, %f2;
+	selp.f32	%f262, 0f3F800000, 0f00000000, %p42;
+	bra.uni 	BB33_63;
+
+BB33_32:
+	setp.eq.s32	%p8, %r3, 16;
+	@%p8 bra 	BB33_40;
+
+	setp.eq.s32	%p9, %r3, 17;
+	@%p9 bra 	BB33_37;
+	bra.uni 	BB33_34;
+
+BB33_37:
+	setp.eq.f32	%p32, %f2, 0f00000000;
+	setp.eq.f32	%p33, %f2, 0f80000000;
+	or.pred  	%p34, %p32, %p33;
+	mov.f32 	%f262, 0f7FC00000;
+	@%p34 bra 	BB33_63;
+
+	div.rn.f32 	%f262, %f1, %f2;
+	abs.f32 	%f80, %f262;
+	setp.geu.f32	%p35, %f80, 0f7F800000;
+	@%p35 bra 	BB33_63;
+
+	cvt.rmi.f32.f32	%f81, %f262;
+	mul.f32 	%f82, %f2, %f81;
+	sub.f32 	%f262, %f1, %f82;
+	bra.uni 	BB33_63;
+
+BB33_70:
+	setp.eq.s32	%p94, %r3, 2;
+	@%p94 bra 	BB33_123;
+	bra.uni 	BB33_71;
+
+BB33_123:
+	mul.f32 	%f266, %f1, %f2;
+	bra.uni 	BB33_125;
+
+BB33_86:
+	setp.eq.s32	%p81, %r3, 11;
+	@%p81 bra 	BB33_105;
+
+	setp.eq.s32	%p82, %r3, 12;
+	@%p82 bra 	BB33_104;
+	bra.uni 	BB33_88;
+
+BB33_104:
+	max.f32 	%f266, %f2, %f1;
+	bra.uni 	BB33_125;
+
+BB33_77:
+	setp.eq.s32	%p88, %r3, 6;
+	@%p88 bra 	BB33_108;
+
+	setp.eq.s32	%p89, %r3, 7;
+	@%p89 bra 	BB33_107;
+	bra.uni 	BB33_79;
+
+BB33_107:
+	setp.gt.f32	%p109, %f2, %f1;
+	selp.f32	%f266, 0f3F800000, 0f00000000, %p109;
+	bra.uni 	BB33_125;
+
+BB33_94:
+	setp.eq.s32	%p75, %r3, 16;
+	@%p75 bra 	BB33_102;
+
+	setp.eq.s32	%p76, %r3, 17;
+	@%p76 bra 	BB33_99;
+	bra.uni 	BB33_96;
+
+BB33_99:
+	setp.eq.f32	%p99, %f1, 0f00000000;
+	setp.eq.f32	%p100, %f1, 0f80000000;
+	or.pred  	%p101, %p99, %p100;
+	mov.f32 	%f266, 0f7FC00000;
+	@%p101 bra 	BB33_125;
+
+	div.rn.f32 	%f266, %f2, %f1;
+	abs.f32 	%f171, %f266;
+	setp.geu.f32	%p102, %f171, 0f7F800000;
+	@%p102 bra 	BB33_125;
+
+	cvt.rmi.f32.f32	%f172, %f266;
+	mul.f32 	%f173, %f1, %f172;
+	sub.f32 	%f266, %f2, %f173;
+	bra.uni 	BB33_125;
+
+BB33_6:
+	setp.eq.s32	%p30, %r3, 1;
+	@%p30 bra 	BB33_7;
+	bra.uni 	BB33_63;
+
+BB33_7:
+	sub.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_22:
+	setp.eq.s32	%p18, %r3, 10;
+	@%p18 bra 	BB33_23;
+	bra.uni 	BB33_63;
+
+BB33_23:
+	setp.neu.f32	%p39, %f1, %f2;
+	selp.f32	%f262, 0f3F800000, 0f00000000, %p39;
+	bra.uni 	BB33_63;
+
+BB33_13:
+	setp.eq.s32	%p25, %r3, 5;
+	@%p25 bra 	BB33_14;
+	bra.uni 	BB33_63;
+
+BB33_14:
+	setp.lt.f32	%p44, %f1, %f2;
+	selp.f32	%f262, 0f3F800000, 0f00000000, %p44;
+	bra.uni 	BB33_63;
+
+BB33_30:
+	setp.eq.s32	%p12, %r3, 15;
+	@%p12 bra 	BB33_31;
+	bra.uni 	BB33_63;
+
+BB33_31:
+	mul.f32 	%f84, %f1, %f2;
+	mov.f32 	%f85, 0f3F800000;
+	sub.f32 	%f262, %f85, %f84;
+	bra.uni 	BB33_63;
+
+BB33_9:
+	setp.eq.s32	%p28, %r3, 3;
+	@%p28 bra 	BB33_10;
+	bra.uni 	BB33_63;
+
+BB33_10:
+	div.rn.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_43:
+	min.f32 	%f262, %f1, %f2;
+	bra.uni 	BB33_63;
+
+BB33_26:
+	setp.eq.s32	%p16, %r3, 13;
+	@%p16 bra 	BB33_27;
+	bra.uni 	BB33_63;
+
+BB33_27:
+	cvt.rni.s64.f32	%rd17, %f1;
+	cvt.u32.u64	%r15, %rd17;
+	cvt.rni.s64.f32	%rd18, %f2;
+	cvt.u32.u64	%r16, %rd18;
+	and.b32  	%r17, %r16, %r15;
+	setp.eq.s32	%p38, %r17, 0;
+	selp.f32	%f262, 0f00000000, 0f3F800000, %p38;
+	bra.uni 	BB33_63;
+
+BB33_46:
+	setp.gtu.f32	%p43, %f1, %f2;
+	selp.f32	%f262, 0f00000000, 0f3F800000, %p43;
+	bra.uni 	BB33_63;
+
+BB33_17:
+	setp.eq.s32	%p23, %r3, 8;
+	@%p23 bra 	BB33_18;
+	bra.uni 	BB33_63;
+
+BB33_18:
+	setp.ltu.f32	%p41, %f1, %f2;
+	selp.f32	%f262, 0f00000000, 0f3F800000, %p41;
+	bra.uni 	BB33_63;
+
+BB33_40:
+	setp.neu.f32	%p36, %f1, 0f00000000;
+	sub.f32 	%f83, %f1, %f2;
+	selp.f32	%f262, %f83, 0f00000000, %p36;
+	bra.uni 	BB33_63;
+
+BB33_34:
+	setp.ne.s32	%p10, %r3, 18;
+	@%p10 bra 	BB33_63;
+
+	div.rn.f32 	%f262, %f1, %f2;
+	abs.f32 	%f78, %f262;
+	setp.geu.f32	%p31, %f78, 0f7F800000;
+	@%p31 bra 	BB33_63;
+
+	cvt.rmi.f32.f32	%f262, %f262;
+	bra.uni 	BB33_63;
+
+BB33_68:
+	setp.eq.s32	%p97, %r3, 1;
+	@%p97 bra 	BB33_69;
+	bra.uni 	BB33_125;
+
+BB33_69:
+	sub.f32 	%f266, %f2, %f1;
+	bra.uni 	BB33_125;
+
+BB33_84:
+	setp.eq.s32	%p85, %r3, 10;
+	@%p85 bra 	BB33_85;
+	bra.uni 	BB33_125;
+
+BB33_85:
+	setp.neu.f32	%p106, %f2, %f1;
+	selp.f32	%f266, 0f3F800000, 0f00000000, %p106;
+	bra.uni 	BB33_125;
+
+BB33_75:
+	setp.eq.s32	%p92, %r3, 5;
+	@%p92 bra 	BB33_76;
+	bra.uni 	BB33_125;
+
+BB33_76:
+	setp.lt.f32	%p111, %f2, %f1;
+	selp.f32	%f266, 0f3F800000, 0f00000000, %p111;
+	bra.uni 	BB33_125;
+
+BB33_92:
+	setp.eq.s32	%p79, %r3, 15;
+	@%p79 bra 	BB33_93;
+	bra.uni 	BB33_125;
+
+BB33_93:
+	mul.f32 	%f175, %f1, %f2;
+	mov.f32 	%f176, 0f3F800000;
+	sub.f32 	%f266, %f176, %f175;
+	bra.uni 	BB33_125;
+
+BB33_71:
+	setp.eq.s32	%p95, %r3, 3;
+	@%p95 bra 	BB33_72;
+	bra.uni 	BB33_125;
+
+BB33_72:
+	div.rn.f32 	%f266, %f2, %f1;
+	bra.uni 	BB33_125;
+
+BB33_105:
+	min.f32 	%f266, %f2, %f1;
+	bra.uni 	BB33_125;
+
+BB33_88:
+	setp.eq.s32	%p83, %r3, 13;
+	@%p83 bra 	BB33_89;
+	bra.uni 	BB33_125;
+
+BB33_89:
+	cvt.rni.s64.f32	%rd21, %f2;
+	cvt.u32.u64	%r40, %rd21;
+	cvt.rni.s64.f32	%rd22, %f1;
+	cvt.u32.u64	%r41, %rd22;
+	and.b32  	%r42, %r41, %r40;
+	setp.eq.s32	%p105, %r42, 0;
+	selp.f32	%f266, 0f00000000, 0f3F800000, %p105;
+	bra.uni 	BB33_125;
+
+BB33_108:
+	setp.gtu.f32	%p110, %f2, %f1;
+	selp.f32	%f266, 0f00000000, 0f3F800000, %p110;
+	bra.uni 	BB33_125;
+
+BB33_79:
+	setp.eq.s32	%p90, %r3, 8;
+	@%p90 bra 	BB33_80;
+	bra.uni 	BB33_125;
+
+BB33_80:
+	setp.ltu.f32	%p108, %f2, %f1;
+	selp.f32	%f266, 0f00000000, 0f3F800000, %p108;
+	bra.uni 	BB33_125;
+
+BB33_102:
+	setp.neu.f32	%p103, %f2, 0f00000000;
+	sub.f32 	%f174, %f2, %f1;
+	selp.f32	%f266, %f174, 0f00000000, %p103;
+	bra.uni 	BB33_125;
+
+BB33_96:
+	setp.ne.s32	%p77, %r3, 18;
+	@%p77 bra 	BB33_125;
+
+	div.rn.f32 	%f266, %f2, %f1;
+	abs.f32 	%f169, %f266;
+	setp.geu.f32	%p98, %f169, 0f7F800000;
+	@%p98 bra 	BB33_125;
+
+	cvt.rmi.f32.f32	%f266, %f266;
+	bra.uni 	BB33_125;
+
+BB33_50:
+	setp.geu.f32	%p55, %f1, 0f00000000;
+	@%p55 bra 	BB33_53;
+
+	cvt.rzi.f32.f32	%f164, %f2;
+	setp.neu.f32	%p56, %f164, %f2;
+	selp.f32	%f261, 0f7FFFFFFF, %f261, %p56;
+
+BB33_53:
+	add.f32 	%f166, %f20, %f21;
+	mov.b32 	 %r30, %f166;
+	setp.lt.s32	%p59, %r30, 2139095040;
+	@%p59 bra 	BB33_60;
+
+	setp.gtu.f32	%p60, %f20, 0f7F800000;
+	setp.gtu.f32	%p61, %f21, 0f7F800000;
+	or.pred  	%p62, %p60, %p61;
+	@%p62 bra 	BB33_59;
+	bra.uni 	BB33_55;
+
+BB33_59:
+	add.f32 	%f261, %f1, %f2;
+	bra.uni 	BB33_60;
+
+BB33_55:
+	setp.eq.f32	%p63, %f21, 0f7F800000;
+	@%p63 bra 	BB33_58;
+	bra.uni 	BB33_56;
+
+BB33_58:
+	setp.gt.f32	%p66, %f20, 0f3F800000;
+	selp.b32	%r34, 2139095040, 0, %p66;
+	xor.b32  	%r35, %r34, 2139095040;
+	setp.lt.f32	%p67, %f2, 0f00000000;
+	selp.b32	%r36, %r35, %r34, %p67;
+	mov.b32 	 %f167, %r36;
+	setp.eq.f32	%p68, %f1, 0fBF800000;
+	selp.f32	%f261, 0f3F800000, %f167, %p68;
+	bra.uni 	BB33_60;
+
+BB33_112:
+	setp.geu.f32	%p122, %f2, 0f00000000;
+	@%p122 bra 	BB33_115;
+
+	cvt.rzi.f32.f32	%f255, %f1;
+	setp.neu.f32	%p123, %f255, %f1;
+	selp.f32	%f265, 0f7FFFFFFF, %f265, %p123;
+
+BB33_115:
+	add.f32 	%f257, %f57, %f58;
+	mov.b32 	 %r55, %f257;
+	setp.lt.s32	%p126, %r55, 2139095040;
+	@%p126 bra 	BB33_122;
+
+	setp.gtu.f32	%p127, %f57, 0f7F800000;
+	setp.gtu.f32	%p128, %f58, 0f7F800000;
+	or.pred  	%p129, %p127, %p128;
+	@%p129 bra 	BB33_121;
+	bra.uni 	BB33_117;
+
+BB33_121:
+	add.f32 	%f265, %f1, %f2;
+	bra.uni 	BB33_122;
+
+BB33_117:
+	setp.eq.f32	%p130, %f58, 0f7F800000;
+	@%p130 bra 	BB33_120;
+	bra.uni 	BB33_118;
+
+BB33_120:
+	setp.gt.f32	%p133, %f57, 0f3F800000;
+	selp.b32	%r59, 2139095040, 0, %p133;
+	xor.b32  	%r60, %r59, 2139095040;
+	setp.lt.f32	%p134, %f1, 0f00000000;
+	selp.b32	%r61, %r60, %r59, %p134;
+	mov.b32 	 %f258, %r61;
+	setp.eq.f32	%p135, %f2, 0fBF800000;
+	selp.f32	%f265, 0f3F800000, %f258, %p135;
+	bra.uni 	BB33_122;
+
+BB33_56:
+	setp.neu.f32	%p64, %f20, 0f7F800000;
+	@%p64 bra 	BB33_60;
+
+	setp.ltu.f32	%p65, %f2, 0f00000000;
+	selp.b32	%r31, 0, 2139095040, %p65;
+	or.b32  	%r32, %r31, -2147483648;
+	selp.b32	%r33, %r32, %r31, %p1;
+	mov.b32 	 %f261, %r33;
+
+BB33_60:
+	setp.eq.f32	%p69, %f2, 0f00000000;
+	setp.eq.f32	%p70, %f1, 0f3F800000;
+	or.pred  	%p71, %p70, %p69;
+	selp.f32	%f262, 0f3F800000, %f261, %p71;
+
+BB33_63:
+	st.global.f32 	[%rd1], %f262;
+	bra.uni 	BB33_126;
+
+BB33_118:
+	setp.neu.f32	%p131, %f57, 0f7F800000;
+	@%p131 bra 	BB33_122;
+
+	setp.ltu.f32	%p132, %f1, 0f00000000;
+	selp.b32	%r56, 0, 2139095040, %p132;
+	or.b32  	%r57, %r56, -2147483648;
+	selp.b32	%r58, %r57, %r56, %p2;
+	mov.b32 	 %f265, %r58;
+
+BB33_122:
+	setp.eq.f32	%p136, %f1, 0f00000000;
+	setp.eq.f32	%p137, %f2, 0f3F800000;
+	or.pred  	%p138, %p137, %p136;
+	selp.f32	%f266, 0f3F800000, %f265, %p138;
+
+BB33_125:
+	st.global.f32 	[%rd1], %f266;
+
+BB33_126:
+	bar.sync 	0;
+	ret;
+}
+
 	// .globl	fill_d
 .visible .entry fill_d(
 	.param .u64 fill_d_param_0,
@@ -4616,14 +6349,14 @@ BB31_126:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB32_2;
+	@%p1 bra 	BB34_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB32_2:
+BB34_2:
 	ret;
 }
 
@@ -4649,7 +6382,7 @@ BB32_2:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB33_2;
+	@%p1 bra 	BB35_2;
 
 	cvt.rn.f32.f64	%f1, %fd1;
 	cvta.to.global.u64 	%rd2, %rd1;
@@ -4657,7 +6390,7 @@ BB32_2:
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f32 	[%rd4], %f1;
 
-BB33_2:
+BB35_2:
 	ret;
 }
 
@@ -4697,10 +6430,10 @@ BB33_2:
 	setp.lt.s32	%p1, %r1, %r7;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB34_2;
-	bra.uni 	BB34_1;
+	@!%p3 bra 	BB36_2;
+	bra.uni 	BB36_1;
 
-BB34_1:
+BB36_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r13, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r13, 8;
@@ -4711,14 +6444,14 @@ BB34_1:
 	add.s64 	%rd9, %rd1, %rd8;
 	st.global.f64 	[%rd9], %fd1;
 
-BB34_2:
+BB36_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB34_4;
-	bra.uni 	BB34_3;
+	@!%p6 bra 	BB36_4;
+	bra.uni 	BB36_3;
 
-BB34_3:
+BB36_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	mad.lo.s32 	%r15, %r1, %r6, %r2;
 	mul.wide.s32 	%rd11, %r15, 8;
@@ -4730,7 +6463,7 @@ BB34_3:
 	add.s64 	%rd14, %rd1, %rd13;
 	st.global.f64 	[%rd14], %fd2;
 
-BB34_4:
+BB36_4:
 	ret;
 }
 
@@ -4770,10 +6503,10 @@ BB34_4:
 	setp.lt.s32	%p1, %r1, %r7;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB35_2;
-	bra.uni 	BB35_1;
+	@!%p3 bra 	BB37_2;
+	bra.uni 	BB37_1;
 
-BB35_1:
+BB37_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r13, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r13, 4;
@@ -4784,14 +6517,14 @@ BB35_1:
 	add.s64 	%rd9, %rd1, %rd8;
 	st.global.f32 	[%rd9], %f1;
 
-BB35_2:
+BB37_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB35_4;
-	bra.uni 	BB35_3;
+	@!%p6 bra 	BB37_4;
+	bra.uni 	BB37_3;
 
-BB35_3:
+BB37_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	mad.lo.s32 	%r15, %r1, %r6, %r2;
 	mul.wide.s32 	%rd11, %r15, 4;
@@ -4803,7 +6536,7 @@ BB35_3:
 	add.s64 	%rd14, %rd1, %rd13;
 	st.global.f32 	[%rd14], %f2;
 
-BB35_4:
+BB37_4:
 	ret;
 }
 
@@ -4842,10 +6575,10 @@ BB35_4:
 	setp.lt.s32	%p1, %r1, %r3;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB36_2;
-	bra.uni 	BB36_1;
+	@!%p3 bra 	BB38_2;
+	bra.uni 	BB38_1;
 
-BB36_1:
+BB38_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r12, 8;
@@ -4854,14 +6587,14 @@ BB36_1:
 	add.s64 	%rd8, %rd1, %rd6;
 	st.global.f64 	[%rd8], %fd1;
 
-BB36_2:
+BB38_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB36_4;
-	bra.uni 	BB36_3;
+	@!%p6 bra 	BB38_4;
+	bra.uni 	BB38_3;
 
-BB36_3:
+BB38_3:
 	cvta.to.global.u64 	%rd9, %rd3;
 	mad.lo.s32 	%r13, %r1, %r6, %r2;
 	mul.wide.s32 	%rd10, %r13, 8;
@@ -4873,7 +6606,7 @@ BB36_3:
 	add.s64 	%rd13, %rd1, %rd12;
 	st.global.f64 	[%rd13], %fd2;
 
-BB36_4:
+BB38_4:
 	ret;
 }
 
@@ -4912,10 +6645,10 @@ BB36_4:
 	setp.lt.s32	%p1, %r1, %r3;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB37_2;
-	bra.uni 	BB37_1;
+	@!%p3 bra 	BB39_2;
+	bra.uni 	BB39_1;
 
-BB37_1:
+BB39_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd6, %r12, 4;
@@ -4924,14 +6657,14 @@ BB37_1:
 	add.s64 	%rd8, %rd1, %rd6;
 	st.global.f32 	[%rd8], %f1;
 
-BB37_2:
+BB39_2:
 	setp.lt.s32	%p4, %r1, %r5;
 	setp.lt.s32	%p5, %r2, %r6;
 	and.pred  	%p6, %p4, %p5;
-	@!%p6 bra 	BB37_4;
-	bra.uni 	BB37_3;
+	@!%p6 bra 	BB39_4;
+	bra.uni 	BB39_3;
 
-BB37_3:
+BB39_3:
 	cvta.to.global.u64 	%rd9, %rd3;
 	mad.lo.s32 	%r13, %r1, %r6, %r2;
 	mul.wide.s32 	%rd10, %r13, 4;
@@ -4943,7 +6676,7 @@ BB37_3:
 	add.s64 	%rd13, %rd1, %rd12;
 	st.global.f32 	[%rd13], %f2;
 
-BB37_4:
+BB39_4:
 	ret;
 }
 
@@ -4970,9 +6703,9 @@ BB37_4:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f64 	%fd44, 0d0000000000000000;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB38_4;
+	@%p1 bra 	BB40_4;
 
-BB38_1:
+BB40_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -4980,135 +6713,135 @@ BB38_1:
 	add.f64 	%fd44, %fd44, %fd30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB38_3;
+	@%p2 bra 	BB40_3;
 
 	mul.wide.u32 	%rd7, %r3, 8;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f64 	%fd31, [%rd8];
 	add.f64 	%fd44, %fd44, %fd31;
 
-BB38_3:
+BB40_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB38_1;
+	@%p3 bra 	BB40_1;
 
-BB38_4:
+BB40_4:
 	shl.b32 	%r16, %r7, 3;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f64 	[%r5], %fd44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB38_8;
+	@%p4 bra 	BB40_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB38_7;
+	@%p5 bra 	BB40_7;
 
 	ld.shared.f64 	%fd32, [%r5+4096];
 	add.f64 	%fd44, %fd44, %fd32;
 	st.shared.f64 	[%r5], %fd44;
 
-BB38_7:
+BB40_7:
 	bar.sync 	0;
 
-BB38_8:
+BB40_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB38_12;
+	@%p6 bra 	BB40_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB38_11;
+	@%p7 bra 	BB40_11;
 
 	ld.shared.f64 	%fd33, [%r5+2048];
 	add.f64 	%fd44, %fd44, %fd33;
 	st.shared.f64 	[%r5], %fd44;
 
-BB38_11:
+BB40_11:
 	bar.sync 	0;
 
-BB38_12:
+BB40_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB38_16;
+	@%p8 bra 	BB40_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB38_15;
+	@%p9 bra 	BB40_15;
 
 	ld.shared.f64 	%fd34, [%r5+1024];
 	add.f64 	%fd44, %fd44, %fd34;
 	st.shared.f64 	[%r5], %fd44;
 
-BB38_15:
+BB40_15:
 	bar.sync 	0;
 
-BB38_16:
+BB40_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB38_20;
+	@%p10 bra 	BB40_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB38_19;
+	@%p11 bra 	BB40_19;
 
 	ld.shared.f64 	%fd35, [%r5+512];
 	add.f64 	%fd44, %fd44, %fd35;
 	st.shared.f64 	[%r5], %fd44;
 
-BB38_19:
+BB40_19:
 	bar.sync 	0;
 
-BB38_20:
+BB40_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB38_33;
+	@%p12 bra 	BB40_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB38_23;
+	@%p13 bra 	BB40_23;
 
 	ld.volatile.shared.f64 	%fd36, [%r5+256];
 	add.f64 	%fd44, %fd44, %fd36;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB38_23:
+BB40_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB38_25;
+	@%p14 bra 	BB40_25;
 
 	ld.volatile.shared.f64 	%fd37, [%r5+128];
 	add.f64 	%fd44, %fd44, %fd37;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB38_25:
+BB40_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB38_27;
+	@%p15 bra 	BB40_27;
 
 	ld.volatile.shared.f64 	%fd38, [%r5+64];
 	add.f64 	%fd44, %fd44, %fd38;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB38_27:
+BB40_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB38_29;
+	@%p16 bra 	BB40_29;
 
 	ld.volatile.shared.f64 	%fd39, [%r5+32];
 	add.f64 	%fd44, %fd44, %fd39;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB38_29:
+BB40_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB38_31;
+	@%p17 bra 	BB40_31;
 
 	ld.volatile.shared.f64 	%fd40, [%r5+16];
 	add.f64 	%fd44, %fd44, %fd40;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB38_31:
+BB40_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB38_33;
+	@%p18 bra 	BB40_33;
 
 	ld.volatile.shared.f64 	%fd41, [%r5+8];
 	add.f64 	%fd42, %fd44, %fd41;
 	st.volatile.shared.f64 	[%r5], %fd42;
 
-BB38_33:
+BB40_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB38_35;
+	@%p19 bra 	BB40_35;
 
 	ld.shared.f64 	%fd43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -5116,7 +6849,7 @@ BB38_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f64 	[%rd11], %fd43;
 
-BB38_35:
+BB40_35:
 	ret;
 }
 
@@ -5143,9 +6876,9 @@ BB38_35:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f32 	%f44, 0f00000000;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB39_4;
+	@%p1 bra 	BB41_4;
 
-BB39_1:
+BB41_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -5153,135 +6886,135 @@ BB39_1:
 	add.f32 	%f44, %f44, %f30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB39_3;
+	@%p2 bra 	BB41_3;
 
 	mul.wide.u32 	%rd7, %r3, 4;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f32 	%f31, [%rd8];
 	add.f32 	%f44, %f44, %f31;
 
-BB39_3:
+BB41_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB39_1;
+	@%p3 bra 	BB41_1;
 
-BB39_4:
+BB41_4:
 	shl.b32 	%r16, %r7, 2;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f32 	[%r5], %f44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB39_8;
+	@%p4 bra 	BB41_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB39_7;
+	@%p5 bra 	BB41_7;
 
 	ld.shared.f32 	%f32, [%r5+2048];
 	add.f32 	%f44, %f44, %f32;
 	st.shared.f32 	[%r5], %f44;
 
-BB39_7:
+BB41_7:
 	bar.sync 	0;
 
-BB39_8:
+BB41_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB39_12;
+	@%p6 bra 	BB41_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB39_11;
+	@%p7 bra 	BB41_11;
 
 	ld.shared.f32 	%f33, [%r5+1024];
 	add.f32 	%f44, %f44, %f33;
 	st.shared.f32 	[%r5], %f44;
 
-BB39_11:
+BB41_11:
 	bar.sync 	0;
 
-BB39_12:
+BB41_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB39_16;
+	@%p8 bra 	BB41_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB39_15;
+	@%p9 bra 	BB41_15;
 
 	ld.shared.f32 	%f34, [%r5+512];
 	add.f32 	%f44, %f44, %f34;
 	st.shared.f32 	[%r5], %f44;
 
-BB39_15:
+BB41_15:
 	bar.sync 	0;
 
-BB39_16:
+BB41_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB39_20;
+	@%p10 bra 	BB41_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB39_19;
+	@%p11 bra 	BB41_19;
 
 	ld.shared.f32 	%f35, [%r5+256];
 	add.f32 	%f44, %f44, %f35;
 	st.shared.f32 	[%r5], %f44;
 
-BB39_19:
+BB41_19:
 	bar.sync 	0;
 
-BB39_20:
+BB41_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB39_33;
+	@%p12 bra 	BB41_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB39_23;
+	@%p13 bra 	BB41_23;
 
 	ld.volatile.shared.f32 	%f36, [%r5+128];
 	add.f32 	%f44, %f44, %f36;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB39_23:
+BB41_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB39_25;
+	@%p14 bra 	BB41_25;
 
 	ld.volatile.shared.f32 	%f37, [%r5+64];
 	add.f32 	%f44, %f44, %f37;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB39_25:
+BB41_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB39_27;
+	@%p15 bra 	BB41_27;
 
 	ld.volatile.shared.f32 	%f38, [%r5+32];
 	add.f32 	%f44, %f44, %f38;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB39_27:
+BB41_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB39_29;
+	@%p16 bra 	BB41_29;
 
 	ld.volatile.shared.f32 	%f39, [%r5+16];
 	add.f32 	%f44, %f44, %f39;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB39_29:
+BB41_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB39_31;
+	@%p17 bra 	BB41_31;
 
 	ld.volatile.shared.f32 	%f40, [%r5+8];
 	add.f32 	%f44, %f44, %f40;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB39_31:
+BB41_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB39_33;
+	@%p18 bra 	BB41_33;
 
 	ld.volatile.shared.f32 	%f41, [%r5+4];
 	add.f32 	%f42, %f44, %f41;
 	st.volatile.shared.f32 	[%r5], %f42;
 
-BB39_33:
+BB41_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB39_35;
+	@%p19 bra 	BB41_35;
 
 	ld.shared.f32 	%f43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -5289,7 +7022,7 @@ BB39_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f32 	[%rd11], %f43;
 
-BB39_35:
+BB41_35:
 	ret;
 }
 
@@ -5313,16 +7046,16 @@ BB39_35:
 	ld.param.u32 	%r4, [reduce_row_sum_d_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB40_35;
+	@%p1 bra 	BB42_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f64 	%fd6, 0d0000000000000000;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB40_4;
+	@%p2 bra 	BB42_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB40_3:
+BB42_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -5331,9 +7064,9 @@ BB40_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB40_3;
+	@%p3 bra 	BB42_3;
 
-BB40_4:
+BB42_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 3;
 	mov.u32 	%r12, my_sdata;
@@ -5342,114 +7075,114 @@ BB40_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB40_8;
+	@%p4 bra 	BB42_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB40_7;
+	@%p5 bra 	BB42_7;
 
 	ld.shared.f64 	%fd29, [%r13+4096];
 	add.f64 	%fd6, %fd6, %fd29;
 	st.shared.f64 	[%r13], %fd6;
 
-BB40_7:
+BB42_7:
 	bar.sync 	0;
 
-BB40_8:
+BB42_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB40_12;
+	@%p6 bra 	BB42_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB40_11;
+	@%p7 bra 	BB42_11;
 
 	ld.shared.f64 	%fd30, [%r13+2048];
 	add.f64 	%fd6, %fd6, %fd30;
 	st.shared.f64 	[%r13], %fd6;
 
-BB40_11:
+BB42_11:
 	bar.sync 	0;
 
-BB40_12:
+BB42_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB40_16;
+	@%p8 bra 	BB42_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB40_15;
+	@%p9 bra 	BB42_15;
 
 	ld.shared.f64 	%fd31, [%r13+1024];
 	add.f64 	%fd6, %fd6, %fd31;
 	st.shared.f64 	[%r13], %fd6;
 
-BB40_15:
+BB42_15:
 	bar.sync 	0;
 
-BB40_16:
+BB42_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB40_20;
+	@%p10 bra 	BB42_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB40_19;
+	@%p11 bra 	BB42_19;
 
 	ld.shared.f64 	%fd32, [%r13+512];
 	add.f64 	%fd6, %fd6, %fd32;
 	st.shared.f64 	[%r13], %fd6;
 
-BB40_19:
+BB42_19:
 	bar.sync 	0;
 
-BB40_20:
+BB42_20:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB40_33;
+	@%p12 bra 	BB42_33;
 
 	setp.lt.u32	%p13, %r14, 64;
-	@%p13 bra 	BB40_23;
+	@%p13 bra 	BB42_23;
 
 	ld.volatile.shared.f64 	%fd33, [%r13+256];
 	add.f64 	%fd6, %fd6, %fd33;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB40_23:
+BB42_23:
 	setp.lt.u32	%p14, %r14, 32;
-	@%p14 bra 	BB40_25;
+	@%p14 bra 	BB42_25;
 
 	ld.volatile.shared.f64 	%fd34, [%r13+128];
 	add.f64 	%fd6, %fd6, %fd34;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB40_25:
+BB42_25:
 	setp.lt.u32	%p15, %r14, 16;
-	@%p15 bra 	BB40_27;
+	@%p15 bra 	BB42_27;
 
 	ld.volatile.shared.f64 	%fd35, [%r13+64];
 	add.f64 	%fd6, %fd6, %fd35;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB40_27:
+BB42_27:
 	setp.lt.u32	%p16, %r14, 8;
-	@%p16 bra 	BB40_29;
+	@%p16 bra 	BB42_29;
 
 	ld.volatile.shared.f64 	%fd36, [%r13+32];
 	add.f64 	%fd6, %fd6, %fd36;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB40_29:
+BB42_29:
 	setp.lt.u32	%p17, %r14, 4;
-	@%p17 bra 	BB40_31;
+	@%p17 bra 	BB42_31;
 
 	ld.volatile.shared.f64 	%fd37, [%r13+16];
 	add.f64 	%fd6, %fd6, %fd37;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB40_31:
+BB42_31:
 	setp.lt.u32	%p18, %r14, 2;
-	@%p18 bra 	BB40_33;
+	@%p18 bra 	BB42_33;
 
 	ld.volatile.shared.f64 	%fd38, [%r13+8];
 	add.f64 	%fd39, %fd6, %fd38;
 	st.volatile.shared.f64 	[%r13], %fd39;
 
-BB40_33:
+BB42_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB40_35;
+	@%p19 bra 	BB42_35;
 
 	ld.shared.f64 	%fd40, [my_sdata];
 	cvta.to.global.u64 	%rd6, %rd2;
@@ -5457,7 +7190,7 @@ BB40_33:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd40;
 
-BB40_35:
+BB42_35:
 	ret;
 }
 
@@ -5481,16 +7214,16 @@ BB40_35:
 	ld.param.u32 	%r4, [reduce_row_sum_f_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB41_35;
+	@%p1 bra 	BB43_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f32 	%f6, 0f00000000;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB41_4;
+	@%p2 bra 	BB43_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB41_3:
+BB43_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -5499,9 +7232,9 @@ BB41_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB41_3;
+	@%p3 bra 	BB43_3;
 
-BB41_4:
+BB43_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 2;
 	mov.u32 	%r12, my_sdata;
@@ -5510,114 +7243,114 @@ BB41_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB41_8;
+	@%p4 bra 	BB43_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB41_7;
+	@%p5 bra 	BB43_7;
 
 	ld.shared.f32 	%f29, [%r13+2048];
 	add.f32 	%f6, %f6, %f29;
 	st.shared.f32 	[%r13], %f6;
 
-BB41_7:
+BB43_7:
 	bar.sync 	0;
 
-BB41_8:
+BB43_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB41_12;
+	@%p6 bra 	BB43_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB41_11;
+	@%p7 bra 	BB43_11;
 
 	ld.shared.f32 	%f30, [%r13+1024];
 	add.f32 	%f6, %f6, %f30;
 	st.shared.f32 	[%r13], %f6;
 
-BB41_11:
+BB43_11:
 	bar.sync 	0;
 
-BB41_12:
+BB43_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB41_16;
+	@%p8 bra 	BB43_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB41_15;
+	@%p9 bra 	BB43_15;
 
 	ld.shared.f32 	%f31, [%r13+512];
 	add.f32 	%f6, %f6, %f31;
 	st.shared.f32 	[%r13], %f6;
 
-BB41_15:
+BB43_15:
 	bar.sync 	0;
 
-BB41_16:
+BB43_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB41_20;
+	@%p10 bra 	BB43_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB41_19;
+	@%p11 bra 	BB43_19;
 
 	ld.shared.f32 	%f32, [%r13+256];
 	add.f32 	%f6, %f6, %f32;
 	st.shared.f32 	[%r13], %f6;
 
-BB41_19:
+BB43_19:
 	bar.sync 	0;
 
-BB41_20:
+BB43_20:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB41_33;
+	@%p12 bra 	BB43_33;
 
 	setp.lt.u32	%p13, %r14, 64;
-	@%p13 bra 	BB41_23;
+	@%p13 bra 	BB43_23;
 
 	ld.volatile.shared.f32 	%f33, [%r13+128];
 	add.f32 	%f6, %f6, %f33;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB41_23:
+BB43_23:
 	setp.lt.u32	%p14, %r14, 32;
-	@%p14 bra 	BB41_25;
+	@%p14 bra 	BB43_25;
 
 	ld.volatile.shared.f32 	%f34, [%r13+64];
 	add.f32 	%f6, %f6, %f34;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB41_25:
+BB43_25:
 	setp.lt.u32	%p15, %r14, 16;
-	@%p15 bra 	BB41_27;
+	@%p15 bra 	BB43_27;
 
 	ld.volatile.shared.f32 	%f35, [%r13+32];
 	add.f32 	%f6, %f6, %f35;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB41_27:
+BB43_27:
 	setp.lt.u32	%p16, %r14, 8;
-	@%p16 bra 	BB41_29;
+	@%p16 bra 	BB43_29;
 
 	ld.volatile.shared.f32 	%f36, [%r13+16];
 	add.f32 	%f6, %f6, %f36;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB41_29:
+BB43_29:
 	setp.lt.u32	%p17, %r14, 4;
-	@%p17 bra 	BB41_31;
+	@%p17 bra 	BB43_31;
 
 	ld.volatile.shared.f32 	%f37, [%r13+8];
 	add.f32 	%f6, %f6, %f37;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB41_31:
+BB43_31:
 	setp.lt.u32	%p18, %r14, 2;
-	@%p18 bra 	BB41_33;
+	@%p18 bra 	BB43_33;
 
 	ld.volatile.shared.f32 	%f38, [%r13+4];
 	add.f32 	%f39, %f6, %f38;
 	st.volatile.shared.f32 	[%r13], %f39;
 
-BB41_33:
+BB43_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB41_35;
+	@%p19 bra 	BB43_35;
 
 	ld.shared.f32 	%f40, [my_sdata];
 	cvta.to.global.u64 	%rd6, %rd2;
@@ -5625,7 +7358,7 @@ BB41_33:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f32 	[%rd8], %f40;
 
-BB41_35:
+BB43_35:
 	ret;
 }
 
@@ -5652,32 +7385,32 @@ BB41_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB42_5;
+	@%p1 bra 	BB44_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB42_4;
+	@%p2 bra 	BB44_4;
 
 	mov.u32 	%r10, %r1;
 
-BB42_3:
+BB44_3:
 	mul.wide.u32 	%rd4, %r10, 8;
 	add.s64 	%rd5, %rd1, %rd4;
 	ld.global.f64 	%fd6, [%rd5];
 	add.f64 	%fd8, %fd8, %fd6;
 	add.s32 	%r10, %r10, %r6;
 	setp.lt.u32	%p3, %r10, %r2;
-	@%p3 bra 	BB42_3;
+	@%p3 bra 	BB44_3;
 
-BB42_4:
+BB44_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB42_5:
+BB44_5:
 	ret;
 }
 
@@ -5704,32 +7437,32 @@ BB42_5:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB43_5;
+	@%p1 bra 	BB45_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f32 	%f8, 0f00000000;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB43_4;
+	@%p2 bra 	BB45_4;
 
 	mov.u32 	%r10, %r1;
 
-BB43_3:
+BB45_3:
 	mul.wide.u32 	%rd4, %r10, 4;
 	add.s64 	%rd5, %rd1, %rd4;
 	ld.global.f32 	%f6, [%rd5];
 	add.f32 	%f8, %f8, %f6;
 	add.s32 	%r10, %r10, %r6;
 	setp.lt.u32	%p3, %r10, %r2;
-	@%p3 bra 	BB43_3;
+	@%p3 bra 	BB45_3;
 
-BB43_4:
+BB45_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f32 	[%rd8], %f8;
 
-BB43_5:
+BB45_5:
 	ret;
 }
 
@@ -5756,9 +7489,9 @@ BB43_5:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f64 	%fd44, 0dFFEFFFFFFFFFFFFF;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB44_4;
+	@%p1 bra 	BB46_4;
 
-BB44_1:
+BB46_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -5766,135 +7499,135 @@ BB44_1:
 	max.f64 	%fd44, %fd44, %fd30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB44_3;
+	@%p2 bra 	BB46_3;
 
 	mul.wide.u32 	%rd7, %r3, 8;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f64 	%fd31, [%rd8];
 	max.f64 	%fd44, %fd44, %fd31;
 
-BB44_3:
+BB46_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB44_1;
+	@%p3 bra 	BB46_1;
 
-BB44_4:
+BB46_4:
 	shl.b32 	%r16, %r7, 3;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f64 	[%r5], %fd44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB44_8;
+	@%p4 bra 	BB46_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB44_7;
+	@%p5 bra 	BB46_7;
 
 	ld.shared.f64 	%fd32, [%r5+4096];
 	max.f64 	%fd44, %fd44, %fd32;
 	st.shared.f64 	[%r5], %fd44;
 
-BB44_7:
+BB46_7:
 	bar.sync 	0;
 
-BB44_8:
+BB46_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB44_12;
+	@%p6 bra 	BB46_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB44_11;
+	@%p7 bra 	BB46_11;
 
 	ld.shared.f64 	%fd33, [%r5+2048];
 	max.f64 	%fd44, %fd44, %fd33;
 	st.shared.f64 	[%r5], %fd44;
 
-BB44_11:
+BB46_11:
 	bar.sync 	0;
 
-BB44_12:
+BB46_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB44_16;
+	@%p8 bra 	BB46_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB44_15;
+	@%p9 bra 	BB46_15;
 
 	ld.shared.f64 	%fd34, [%r5+1024];
 	max.f64 	%fd44, %fd44, %fd34;
 	st.shared.f64 	[%r5], %fd44;
 
-BB44_15:
+BB46_15:
 	bar.sync 	0;
 
-BB44_16:
+BB46_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB44_20;
+	@%p10 bra 	BB46_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB44_19;
+	@%p11 bra 	BB46_19;
 
 	ld.shared.f64 	%fd35, [%r5+512];
 	max.f64 	%fd44, %fd44, %fd35;
 	st.shared.f64 	[%r5], %fd44;
 
-BB44_19:
+BB46_19:
 	bar.sync 	0;
 
-BB44_20:
+BB46_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB44_33;
+	@%p12 bra 	BB46_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB44_23;
+	@%p13 bra 	BB46_23;
 
 	ld.volatile.shared.f64 	%fd36, [%r5+256];
 	max.f64 	%fd44, %fd44, %fd36;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB44_23:
+BB46_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB44_25;
+	@%p14 bra 	BB46_25;
 
 	ld.volatile.shared.f64 	%fd37, [%r5+128];
 	max.f64 	%fd44, %fd44, %fd37;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB44_25:
+BB46_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB44_27;
+	@%p15 bra 	BB46_27;
 
 	ld.volatile.shared.f64 	%fd38, [%r5+64];
 	max.f64 	%fd44, %fd44, %fd38;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB44_27:
+BB46_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB44_29;
+	@%p16 bra 	BB46_29;
 
 	ld.volatile.shared.f64 	%fd39, [%r5+32];
 	max.f64 	%fd44, %fd44, %fd39;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB44_29:
+BB46_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB44_31;
+	@%p17 bra 	BB46_31;
 
 	ld.volatile.shared.f64 	%fd40, [%r5+16];
 	max.f64 	%fd44, %fd44, %fd40;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB44_31:
+BB46_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB44_33;
+	@%p18 bra 	BB46_33;
 
 	ld.volatile.shared.f64 	%fd41, [%r5+8];
 	max.f64 	%fd42, %fd44, %fd41;
 	st.volatile.shared.f64 	[%r5], %fd42;
 
-BB44_33:
+BB46_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB44_35;
+	@%p19 bra 	BB46_35;
 
 	ld.shared.f64 	%fd43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -5902,7 +7635,7 @@ BB44_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f64 	[%rd11], %fd43;
 
-BB44_35:
+BB46_35:
 	ret;
 }
 
@@ -5929,9 +7662,9 @@ BB44_35:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f32 	%f44, 0fFF7FFFFF;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB45_4;
+	@%p1 bra 	BB47_4;
 
-BB45_1:
+BB47_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -5939,135 +7672,135 @@ BB45_1:
 	max.f32 	%f44, %f44, %f30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB45_3;
+	@%p2 bra 	BB47_3;
 
 	mul.wide.u32 	%rd7, %r3, 4;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f32 	%f31, [%rd8];
 	max.f32 	%f44, %f44, %f31;
 
-BB45_3:
+BB47_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB45_1;
+	@%p3 bra 	BB47_1;
 
-BB45_4:
+BB47_4:
 	shl.b32 	%r16, %r7, 2;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f32 	[%r5], %f44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB45_8;
+	@%p4 bra 	BB47_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB45_7;
+	@%p5 bra 	BB47_7;
 
 	ld.shared.f32 	%f32, [%r5+2048];
 	max.f32 	%f44, %f44, %f32;
 	st.shared.f32 	[%r5], %f44;
 
-BB45_7:
+BB47_7:
 	bar.sync 	0;
 
-BB45_8:
+BB47_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB45_12;
+	@%p6 bra 	BB47_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB45_11;
+	@%p7 bra 	BB47_11;
 
 	ld.shared.f32 	%f33, [%r5+1024];
 	max.f32 	%f44, %f44, %f33;
 	st.shared.f32 	[%r5], %f44;
 
-BB45_11:
+BB47_11:
 	bar.sync 	0;
 
-BB45_12:
+BB47_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB45_16;
+	@%p8 bra 	BB47_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB45_15;
+	@%p9 bra 	BB47_15;
 
 	ld.shared.f32 	%f34, [%r5+512];
 	max.f32 	%f44, %f44, %f34;
 	st.shared.f32 	[%r5], %f44;
 
-BB45_15:
+BB47_15:
 	bar.sync 	0;
 
-BB45_16:
+BB47_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB45_20;
+	@%p10 bra 	BB47_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB45_19;
+	@%p11 bra 	BB47_19;
 
 	ld.shared.f32 	%f35, [%r5+256];
 	max.f32 	%f44, %f44, %f35;
 	st.shared.f32 	[%r5], %f44;
 
-BB45_19:
+BB47_19:
 	bar.sync 	0;
 
-BB45_20:
+BB47_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB45_33;
+	@%p12 bra 	BB47_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB45_23;
+	@%p13 bra 	BB47_23;
 
 	ld.volatile.shared.f32 	%f36, [%r5+128];
 	max.f32 	%f44, %f44, %f36;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB45_23:
+BB47_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB45_25;
+	@%p14 bra 	BB47_25;
 
 	ld.volatile.shared.f32 	%f37, [%r5+64];
 	max.f32 	%f44, %f44, %f37;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB45_25:
+BB47_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB45_27;
+	@%p15 bra 	BB47_27;
 
 	ld.volatile.shared.f32 	%f38, [%r5+32];
 	max.f32 	%f44, %f44, %f38;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB45_27:
+BB47_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB45_29;
+	@%p16 bra 	BB47_29;
 
 	ld.volatile.shared.f32 	%f39, [%r5+16];
 	max.f32 	%f44, %f44, %f39;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB45_29:
+BB47_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB45_31;
+	@%p17 bra 	BB47_31;
 
 	ld.volatile.shared.f32 	%f40, [%r5+8];
 	max.f32 	%f44, %f44, %f40;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB45_31:
+BB47_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB45_33;
+	@%p18 bra 	BB47_33;
 
 	ld.volatile.shared.f32 	%f41, [%r5+4];
 	max.f32 	%f42, %f44, %f41;
 	st.volatile.shared.f32 	[%r5], %f42;
 
-BB45_33:
+BB47_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB45_35;
+	@%p19 bra 	BB47_35;
 
 	ld.shared.f32 	%f43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -6075,7 +7808,7 @@ BB45_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f32 	[%rd11], %f43;
 
-BB45_35:
+BB47_35:
 	ret;
 }
 
@@ -6099,16 +7832,16 @@ BB45_35:
 	ld.param.u32 	%r4, [reduce_row_max_d_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB46_35;
+	@%p1 bra 	BB48_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f64 	%fd6, 0dFFEFFFFFFFFFFFFF;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB46_4;
+	@%p2 bra 	BB48_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB46_3:
+BB48_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -6117,9 +7850,9 @@ BB46_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB46_3;
+	@%p3 bra 	BB48_3;
 
-BB46_4:
+BB48_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 3;
 	mov.u32 	%r12, my_sdata;
@@ -6128,114 +7861,114 @@ BB46_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB46_8;
+	@%p4 bra 	BB48_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB46_7;
+	@%p5 bra 	BB48_7;
 
 	ld.shared.f64 	%fd29, [%r13+4096];
 	max.f64 	%fd6, %fd6, %fd29;
 	st.shared.f64 	[%r13], %fd6;
 
-BB46_7:
+BB48_7:
 	bar.sync 	0;
 
-BB46_8:
+BB48_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB46_12;
+	@%p6 bra 	BB48_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB46_11;
+	@%p7 bra 	BB48_11;
 
 	ld.shared.f64 	%fd30, [%r13+2048];
 	max.f64 	%fd6, %fd6, %fd30;
 	st.shared.f64 	[%r13], %fd6;
 
-BB46_11:
+BB48_11:
 	bar.sync 	0;
 
-BB46_12:
+BB48_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB46_16;
+	@%p8 bra 	BB48_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB46_15;
+	@%p9 bra 	BB48_15;
 
 	ld.shared.f64 	%fd31, [%r13+1024];
 	max.f64 	%fd6, %fd6, %fd31;
 	st.shared.f64 	[%r13], %fd6;
 
-BB46_15:
+BB48_15:
 	bar.sync 	0;
 
-BB46_16:
+BB48_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB46_20;
+	@%p10 bra 	BB48_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB46_19;
+	@%p11 bra 	BB48_19;
 
 	ld.shared.f64 	%fd32, [%r13+512];
 	max.f64 	%fd6, %fd6, %fd32;
 	st.shared.f64 	[%r13], %fd6;
 
-BB46_19:
+BB48_19:
 	bar.sync 	0;
 
-BB46_20:
+BB48_20:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB46_33;
+	@%p12 bra 	BB48_33;
 
 	setp.lt.u32	%p13, %r14, 64;
-	@%p13 bra 	BB46_23;
+	@%p13 bra 	BB48_23;
 
 	ld.volatile.shared.f64 	%fd33, [%r13+256];
 	max.f64 	%fd6, %fd6, %fd33;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB46_23:
+BB48_23:
 	setp.lt.u32	%p14, %r14, 32;
-	@%p14 bra 	BB46_25;
+	@%p14 bra 	BB48_25;
 
 	ld.volatile.shared.f64 	%fd34, [%r13+128];
 	max.f64 	%fd6, %fd6, %fd34;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB46_25:
+BB48_25:
 	setp.lt.u32	%p15, %r14, 16;
-	@%p15 bra 	BB46_27;
+	@%p15 bra 	BB48_27;
 
 	ld.volatile.shared.f64 	%fd35, [%r13+64];
 	max.f64 	%fd6, %fd6, %fd35;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB46_27:
+BB48_27:
 	setp.lt.u32	%p16, %r14, 8;
-	@%p16 bra 	BB46_29;
+	@%p16 bra 	BB48_29;
 
 	ld.volatile.shared.f64 	%fd36, [%r13+32];
 	max.f64 	%fd6, %fd6, %fd36;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB46_29:
+BB48_29:
 	setp.lt.u32	%p17, %r14, 4;
-	@%p17 bra 	BB46_31;
+	@%p17 bra 	BB48_31;
 
 	ld.volatile.shared.f64 	%fd37, [%r13+16];
 	max.f64 	%fd6, %fd6, %fd37;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB46_31:
+BB48_31:
 	setp.lt.u32	%p18, %r14, 2;
-	@%p18 bra 	BB46_33;
+	@%p18 bra 	BB48_33;
 
 	ld.volatile.shared.f64 	%fd38, [%r13+8];
 	max.f64 	%fd39, %fd6, %fd38;
 	st.volatile.shared.f64 	[%r13], %fd39;
 
-BB46_33:
+BB48_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB46_35;
+	@%p19 bra 	BB48_35;
 
 	ld.shared.f64 	%fd40, [my_sdata];
 	cvta.to.global.u64 	%rd6, %rd2;
@@ -6243,7 +7976,7 @@ BB46_33:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd40;
 
-BB46_35:
+BB48_35:
 	ret;
 }
 
@@ -6267,16 +8000,16 @@ BB46_35:
 	ld.param.u32 	%r4, [reduce_row_max_f_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB47_35;
+	@%p1 bra 	BB49_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f32 	%f6, 0fFF7FFFFF;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB47_4;
+	@%p2 bra 	BB49_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB47_3:
+BB49_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -6285,9 +8018,9 @@ BB47_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB47_3;
+	@%p3 bra 	BB49_3;
 
-BB47_4:
+BB49_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 2;
 	mov.u32 	%r12, my_sdata;
@@ -6296,114 +8029,114 @@ BB47_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB47_8;
+	@%p4 bra 	BB49_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB47_7;
+	@%p5 bra 	BB49_7;
 
 	ld.shared.f32 	%f29, [%r13+2048];
 	max.f32 	%f6, %f6, %f29;
 	st.shared.f32 	[%r13], %f6;
 
-BB47_7:
+BB49_7:
 	bar.sync 	0;
 
-BB47_8:
+BB49_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB47_12;
+	@%p6 bra 	BB49_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB47_11;
+	@%p7 bra 	BB49_11;
 
 	ld.shared.f32 	%f30, [%r13+1024];
 	max.f32 	%f6, %f6, %f30;
 	st.shared.f32 	[%r13], %f6;
 
-BB47_11:
+BB49_11:
 	bar.sync 	0;
 
-BB47_12:
+BB49_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB47_16;
+	@%p8 bra 	BB49_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB47_15;
+	@%p9 bra 	BB49_15;
 
 	ld.shared.f32 	%f31, [%r13+512];
 	max.f32 	%f6, %f6, %f31;
 	st.shared.f32 	[%r13], %f6;
 
-BB47_15:
+BB49_15:
 	bar.sync 	0;
 
-BB47_16:
+BB49_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB47_20;
+	@%p10 bra 	BB49_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB47_19;
+	@%p11 bra 	BB49_19;
 
 	ld.shared.f32 	%f32, [%r13+256];
 	max.f32 	%f6, %f6, %f32;
 	st.shared.f32 	[%r13], %f6;
 
-BB47_19:
+BB49_19:
 	bar.sync 	0;
 
-BB47_20:
+BB49_20:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB47_33;
+	@%p12 bra 	BB49_33;
 
 	setp.lt.u32	%p13, %r14, 64;
-	@%p13 bra 	BB47_23;
+	@%p13 bra 	BB49_23;
 
 	ld.volatile.shared.f32 	%f33, [%r13+128];
 	max.f32 	%f6, %f6, %f33;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB47_23:
+BB49_23:
 	setp.lt.u32	%p14, %r14, 32;
-	@%p14 bra 	BB47_25;
+	@%p14 bra 	BB49_25;
 
 	ld.volatile.shared.f32 	%f34, [%r13+64];
 	max.f32 	%f6, %f6, %f34;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB47_25:
+BB49_25:
 	setp.lt.u32	%p15, %r14, 16;
-	@%p15 bra 	BB47_27;
+	@%p15 bra 	BB49_27;
 
 	ld.volatile.shared.f32 	%f35, [%r13+32];
 	max.f32 	%f6, %f6, %f35;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB47_27:
+BB49_27:
 	setp.lt.u32	%p16, %r14, 8;
-	@%p16 bra 	BB47_29;
+	@%p16 bra 	BB49_29;
 
 	ld.volatile.shared.f32 	%f36, [%r13+16];
 	max.f32 	%f6, %f6, %f36;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB47_29:
+BB49_29:
 	setp.lt.u32	%p17, %r14, 4;
-	@%p17 bra 	BB47_31;
+	@%p17 bra 	BB49_31;
 
 	ld.volatile.shared.f32 	%f37, [%r13+8];
 	max.f32 	%f6, %f6, %f37;
 	st.volatile.shared.f32 	[%r13], %f6;
 
-BB47_31:
+BB49_31:
 	setp.lt.u32	%p18, %r14, 2;
-	@%p18 bra 	BB47_33;
+	@%p18 bra 	BB49_33;
 
 	ld.volatile.shared.f32 	%f38, [%r13+4];
 	max.f32 	%f39, %f6, %f38;
 	st.volatile.shared.f32 	[%r13], %f39;
 
-BB47_33:
+BB49_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB47_35;
+	@%p19 bra 	BB49_35;
 
 	ld.shared.f32 	%f40, [my_sdata];
 	cvta.to.global.u64 	%rd6, %rd2;
@@ -6411,7 +8144,7 @@ BB47_33:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f32 	[%rd8], %f40;
 
-BB47_35:
+BB49_35:
 	ret;
 }
 
@@ -6438,32 +8171,32 @@ BB47_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB48_5;
+	@%p1 bra 	BB50_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB48_4;
+	@%p2 bra 	BB50_4;
 
 	mov.u32 	%r10, %r1;
 
-BB48_3:
+BB50_3:
 	mul.wide.u32 	%rd4, %r10, 8;
 	add.s64 	%rd5, %rd1, %rd4;
 	ld.global.f64 	%fd6, [%rd5];
 	max.f64 	%fd8, %fd8, %fd6;
 	add.s32 	%r10, %r10, %r6;
 	setp.lt.u32	%p3, %r10, %r2;
-	@%p3 bra 	BB48_3;
+	@%p3 bra 	BB50_3;
 
-BB48_4:
+BB50_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB48_5:
+BB50_5:
 	ret;
 }
 
@@ -6490,32 +8223,32 @@ BB48_5:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB49_5;
+	@%p1 bra 	BB51_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f32 	%f8, 0fFF7FFFFF;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB49_4;
+	@%p2 bra 	BB51_4;
 
 	mov.u32 	%r10, %r1;
 
-BB49_3:
+BB51_3:
 	mul.wide.u32 	%rd4, %r10, 4;
 	add.s64 	%rd5, %rd1, %rd4;
 	ld.global.f32 	%f6, [%rd5];
 	max.f32 	%f8, %f8, %f6;
 	add.s32 	%r10, %r10, %r6;
 	setp.lt.u32	%p3, %r10, %r2;
-	@%p3 bra 	BB49_3;
+	@%p3 bra 	BB51_3;
 
-BB49_4:
+BB51_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f32 	[%rd8], %f8;
 
-BB49_5:
+BB51_5:
 	ret;
 }
 
@@ -6542,9 +8275,9 @@ BB49_5:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f64 	%fd44, 0d7FEFFFFFFFFFFFFF;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB50_4;
+	@%p1 bra 	BB52_4;
 
-BB50_1:
+BB52_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -6552,135 +8285,135 @@ BB50_1:
 	min.f64 	%fd44, %fd44, %fd30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB50_3;
+	@%p2 bra 	BB52_3;
 
 	mul.wide.u32 	%rd7, %r3, 8;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f64 	%fd31, [%rd8];
 	min.f64 	%fd44, %fd44, %fd31;
 
-BB50_3:
+BB52_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB50_1;
+	@%p3 bra 	BB52_1;
 
-BB50_4:
+BB52_4:
 	shl.b32 	%r16, %r7, 3;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f64 	[%r5], %fd44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB50_8;
+	@%p4 bra 	BB52_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB50_7;
+	@%p5 bra 	BB52_7;
 
 	ld.shared.f64 	%fd32, [%r5+4096];
 	min.f64 	%fd44, %fd44, %fd32;
 	st.shared.f64 	[%r5], %fd44;
 
-BB50_7:
+BB52_7:
 	bar.sync 	0;
 
-BB50_8:
+BB52_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB50_12;
+	@%p6 bra 	BB52_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB50_11;
+	@%p7 bra 	BB52_11;
 
 	ld.shared.f64 	%fd33, [%r5+2048];
 	min.f64 	%fd44, %fd44, %fd33;
 	st.shared.f64 	[%r5], %fd44;
 
-BB50_11:
+BB52_11:
 	bar.sync 	0;
 
-BB50_12:
+BB52_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB50_16;
+	@%p8 bra 	BB52_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB50_15;
+	@%p9 bra 	BB52_15;
 
 	ld.shared.f64 	%fd34, [%r5+1024];
 	min.f64 	%fd44, %fd44, %fd34;
 	st.shared.f64 	[%r5], %fd44;
 
-BB50_15:
+BB52_15:
 	bar.sync 	0;
 
-BB50_16:
+BB52_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB50_20;
+	@%p10 bra 	BB52_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB50_19;
+	@%p11 bra 	BB52_19;
 
 	ld.shared.f64 	%fd35, [%r5+512];
 	min.f64 	%fd44, %fd44, %fd35;
 	st.shared.f64 	[%r5], %fd44;
 
-BB50_19:
+BB52_19:
 	bar.sync 	0;
 
-BB50_20:
+BB52_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB50_33;
+	@%p12 bra 	BB52_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB50_23;
+	@%p13 bra 	BB52_23;
 
 	ld.volatile.shared.f64 	%fd36, [%r5+256];
 	min.f64 	%fd44, %fd44, %fd36;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB50_23:
+BB52_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB50_25;
+	@%p14 bra 	BB52_25;
 
 	ld.volatile.shared.f64 	%fd37, [%r5+128];
 	min.f64 	%fd44, %fd44, %fd37;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB50_25:
+BB52_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB50_27;
+	@%p15 bra 	BB52_27;
 
 	ld.volatile.shared.f64 	%fd38, [%r5+64];
 	min.f64 	%fd44, %fd44, %fd38;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB50_27:
+BB52_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB50_29;
+	@%p16 bra 	BB52_29;
 
 	ld.volatile.shared.f64 	%fd39, [%r5+32];
 	min.f64 	%fd44, %fd44, %fd39;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB50_29:
+BB52_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB50_31;
+	@%p17 bra 	BB52_31;
 
 	ld.volatile.shared.f64 	%fd40, [%r5+16];
 	min.f64 	%fd44, %fd44, %fd40;
 	st.volatile.shared.f64 	[%r5], %fd44;
 
-BB50_31:
+BB52_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB50_33;
+	@%p18 bra 	BB52_33;
 
 	ld.volatile.shared.f64 	%fd41, [%r5+8];
 	min.f64 	%fd42, %fd44, %fd41;
 	st.volatile.shared.f64 	[%r5], %fd42;
 
-BB50_33:
+BB52_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB50_35;
+	@%p19 bra 	BB52_35;
 
 	ld.shared.f64 	%fd43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -6688,7 +8421,7 @@ BB50_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f64 	[%rd11], %fd43;
 
-BB50_35:
+BB52_35:
 	ret;
 }
 
@@ -6715,9 +8448,9 @@ BB50_35:
 	mad.lo.s32 	%r35, %r9, %r10, %r7;
 	mov.f32 	%f44, 0f7F7FFFFF;
 	setp.ge.u32	%p1, %r35, %r6;
-	@%p1 bra 	BB51_4;
+	@%p1 bra 	BB53_4;
 
-BB51_1:
+BB53_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.u32 	%rd4, %r35, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -6725,135 +8458,135 @@ BB51_1:
 	min.f32 	%f44, %f44, %f30;
 	add.s32 	%r3, %r35, %r10;
 	setp.ge.u32	%p2, %r3, %r6;
-	@%p2 bra 	BB51_3;
+	@%p2 bra 	BB53_3;
 
 	mul.wide.u32 	%rd7, %r3, 4;
 	add.s64 	%rd8, %rd3, %rd7;
 	ld.global.f32 	%f31, [%rd8];
 	min.f32 	%f44, %f44, %f31;
 
-BB51_3:
+BB53_3:
 	shl.b32 	%r13, %r10, 1;
 	mov.u32 	%r14, %nctaid.x;
 	mad.lo.s32 	%r35, %r13, %r14, %r35;
 	setp.lt.u32	%p3, %r35, %r6;
-	@%p3 bra 	BB51_1;
+	@%p3 bra 	BB53_1;
 
-BB51_4:
+BB53_4:
 	shl.b32 	%r16, %r7, 2;
 	mov.u32 	%r17, my_sdata;
 	add.s32 	%r5, %r17, %r16;
 	st.shared.f32 	[%r5], %f44;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r10, 1024;
-	@%p4 bra 	BB51_8;
+	@%p4 bra 	BB53_8;
 
 	setp.gt.u32	%p5, %r7, 511;
-	@%p5 bra 	BB51_7;
+	@%p5 bra 	BB53_7;
 
 	ld.shared.f32 	%f32, [%r5+2048];
 	min.f32 	%f44, %f44, %f32;
 	st.shared.f32 	[%r5], %f44;
 
-BB51_7:
+BB53_7:
 	bar.sync 	0;
 
-BB51_8:
+BB53_8:
 	setp.lt.u32	%p6, %r10, 512;
-	@%p6 bra 	BB51_12;
+	@%p6 bra 	BB53_12;
 
 	setp.gt.u32	%p7, %r7, 255;
-	@%p7 bra 	BB51_11;
+	@%p7 bra 	BB53_11;
 
 	ld.shared.f32 	%f33, [%r5+1024];
 	min.f32 	%f44, %f44, %f33;
 	st.shared.f32 	[%r5], %f44;
 
-BB51_11:
+BB53_11:
 	bar.sync 	0;
 
-BB51_12:
+BB53_12:
 	setp.lt.u32	%p8, %r10, 256;
-	@%p8 bra 	BB51_16;
+	@%p8 bra 	BB53_16;
 
 	setp.gt.u32	%p9, %r7, 127;
-	@%p9 bra 	BB51_15;
+	@%p9 bra 	BB53_15;
 
 	ld.shared.f32 	%f34, [%r5+512];
 	min.f32 	%f44, %f44, %f34;
 	st.shared.f32 	[%r5], %f44;
 
-BB51_15:
+BB53_15:
 	bar.sync 	0;
 
-BB51_16:
+BB53_16:
 	setp.lt.u32	%p10, %r10, 128;
-	@%p10 bra 	BB51_20;
+	@%p10 bra 	BB53_20;
 
 	setp.gt.u32	%p11, %r7, 63;
-	@%p11 bra 	BB51_19;
+	@%p11 bra 	BB53_19;
 
 	ld.shared.f32 	%f35, [%r5+256];
 	min.f32 	%f44, %f44, %f35;
 	st.shared.f32 	[%r5], %f44;
 
-BB51_19:
+BB53_19:
 	bar.sync 	0;
 
-BB51_20:
+BB53_20:
 	setp.gt.u32	%p12, %r7, 31;
-	@%p12 bra 	BB51_33;
+	@%p12 bra 	BB53_33;
 
 	setp.lt.u32	%p13, %r10, 64;
-	@%p13 bra 	BB51_23;
+	@%p13 bra 	BB53_23;
 
 	ld.volatile.shared.f32 	%f36, [%r5+128];
 	min.f32 	%f44, %f44, %f36;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB51_23:
+BB53_23:
 	setp.lt.u32	%p14, %r10, 32;
-	@%p14 bra 	BB51_25;
+	@%p14 bra 	BB53_25;
 
 	ld.volatile.shared.f32 	%f37, [%r5+64];
 	min.f32 	%f44, %f44, %f37;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB51_25:
+BB53_25:
 	setp.lt.u32	%p15, %r10, 16;
-	@%p15 bra 	BB51_27;
+	@%p15 bra 	BB53_27;
 
 	ld.volatile.shared.f32 	%f38, [%r5+32];
 	min.f32 	%f44, %f44, %f38;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB51_27:
+BB53_27:
 	setp.lt.u32	%p16, %r10, 8;
-	@%p16 bra 	BB51_29;
+	@%p16 bra 	BB53_29;
 
 	ld.volatile.shared.f32 	%f39, [%r5+16];
 	min.f32 	%f44, %f44, %f39;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB51_29:
+BB53_29:
 	setp.lt.u32	%p17, %r10, 4;
-	@%p17 bra 	BB51_31;
+	@%p17 bra 	BB53_31;
 
 	ld.volatile.shared.f32 	%f40, [%r5+8];
 	min.f32 	%f44, %f44, %f40;
 	st.volatile.shared.f32 	[%r5], %f44;
 
-BB51_31:
+BB53_31:
 	setp.lt.u32	%p18, %r10, 2;
-	@%p18 bra 	BB51_33;
+	@%p18 bra 	BB53_33;
 
 	ld.volatile.shared.f32 	%f41, [%r5+4];
 	min.f32 	%f42, %f44, %f41;
 	st.volatile.shared.f32 	[%r5], %f42;
 
-BB51_33:
+BB53_33:
 	setp.ne.s32	%p19, %r7, 0;
-	@%p19 bra 	BB51_35;
+	@%p19 bra 	BB53_35;
 
 	ld.shared.f32 	%f43, [my_sdata];
 	cvta.to.global.u64 	%rd9, %rd2;
@@ -6861,7 +8594,7 @@ BB51_33:
 	add.s64 	%rd11, %rd9, %rd10;
 	st.global.f32 	[%rd11], %f43;
 
-BB51_35:
+BB53_35:
 	ret;
 }
 
@@ -6885,16 +8618,16 @@ BB51_35:
 	ld.param.u32 	%r4, [reduce_row_min_d_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB52_35;
+	@%p1 bra 	BB54_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f64 	%fd6, 0d7FEFFFFFFFFFFFFF;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB52_4;
+	@%p2 bra 	BB54_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB52_3:
+BB54_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -6903,9 +8636,9 @@ BB52_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB52_3;
+	@%p3 bra 	BB54_3;
 
-BB52_4:
+BB54_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 3;
 	mov.u32 	%r12, my_sdata;
@@ -6914,114 +8647,114 @@ BB52_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB52_8;
+	@%p4 bra 	BB54_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB52_7;
+	@%p5 bra 	BB54_7;
 
 	ld.shared.f64 	%fd29, [%r13+4096];
 	min.f64 	%fd6, %fd6, %fd29;
 	st.shared.f64 	[%r13], %fd6;
 
-BB52_7:
+BB54_7:
 	bar.sync 	0;
 
-BB52_8:
+BB54_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB52_12;
+	@%p6 bra 	BB54_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB52_11;
+	@%p7 bra 	BB54_11;
 
 	ld.shared.f64 	%fd30, [%r13+2048];
 	min.f64 	%fd6, %fd6, %fd30;
 	st.shared.f64 	[%r13], %fd6;
 
-BB52_11:
+BB54_11:
 	bar.sync 	0;
 
-BB52_12:
+BB54_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB52_16;
+	@%p8 bra 	BB54_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB52_15;
+	@%p9 bra 	BB54_15;
 
 	ld.shared.f64 	%fd31, [%r13+1024];
 	min.f64 	%fd6, %fd6, %fd31;
 	st.shared.f64 	[%r13], %fd6;
 
-BB52_15:
+BB54_15:
 	bar.sync 	0;
 
-BB52_16:
+BB54_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB52_20;
+	@%p10 bra 	BB54_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB52_19;
+	@%p11 bra 	BB54_19;
 
 	ld.shared.f64 	%fd32, [%r13+512];
 	min.f64 	%fd6, %fd6, %fd32;
 	st.shared.f64 	[%r13], %fd6;
 
-BB52_19:
+BB54_19:
 	bar.sync 	0;
 
-BB52_20:
+BB54_20:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB52_33;
+	@%p12 bra 	BB54_33;
 
 	setp.lt.u32	%p13, %r14, 64;
-	@%p13 bra 	BB52_23;
+	@%p13 bra 	BB54_23;
 
 	ld.volatile.shared.f64 	%fd33, [%r13+256];
 	min.f64 	%fd6, %fd6, %fd33;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB52_23:
+BB54_23:
 	setp.lt.u32	%p14, %r14, 32;
-	@%p14 bra 	BB52_25;
+	@%p14 bra 	BB54_25;
 
 	ld.volatile.shared.f64 	%fd34, [%r13+128];
 	min.f64 	%fd6, %fd6, %fd34;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB52_25:
+BB54_25:
 	setp.lt.u32	%p15, %r14, 16;
-	@%p15 bra 	BB52_27;
+	@%p15 bra 	BB54_27;
 
 	ld.volatile.shared.f64 	%fd35, [%r13+64];
 	min.f64 	%fd6, %fd6, %fd35;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB52_27:
+BB54_27:
 	setp.lt.u32	%p16, %r14, 8;
-	@%p16 bra 	BB52_29;
+	@%p16 bra 	BB54_29;
 
 	ld.volatile.shared.f64 	%fd36, [%r13+32];
 	min.f64 	%fd6, %fd6, %fd36;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB52_29:
+BB54_29:
 	setp.lt.u32	%p17, %r14, 4;
-	@%p17 bra 	BB52_31;
+	@%p17 bra 	BB54_31;
 
 	ld.volatile.shared.f64 	%fd37, [%r13+16];
 	min.f64 	%fd6, %fd6, %fd37;
 	st.volatile.shared.f64 	[%r13], %fd6;
 
-BB52_31:
+BB54_31:
 	setp.lt.u32	%p18, %r14, 2;
-	@%p18 bra 	BB52_33;
+	@%p18 bra 	BB54_33;
 
 	ld.volatile.shared.f64 	%fd38, [%r13+8];
 	min.f64 	%fd39, %fd6, %fd38;
 	st.volatile.shared.f64 	[%r13], %fd39;
 
-BB52_33:
+BB54_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB52_35;
+	@%p19 bra 	BB54_35;
 
 	ld.shared.f64 	%fd40, [my_sdata];
 	cvta.to.global.u64 	%rd6, %rd2;
@@ -7029,7 +8762,7 @@ BB52_33:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd40;
 
-BB52_35:
+BB54_35:
 	ret;
 }
 
@@ -7053,16 +8786,16 @@ BB52_35:
 	ld.param.u32 	%r4, [reduce_row_min_f_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB53_35;
+	@%p1 bra 	BB55_35;
 
 	mov.u32 	%r71, %tid.x;
 	mov.f32 	%f6, 0f7F7FFFFF;
 	setp.ge.u32	%p2, %r71, %r4;
-	@%p2 bra 	BB53_4;
+	@%p2 bra 	BB55_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB53_3:
+BB55_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r71;
 	mul.wide.u32 	%rd4, %r8, 4;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -7071,9 +8804,9 @@ BB53_3:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r71, %r9, %r71;
 	setp.lt.u32	%p3, %r71, %r4;
-	@%p3 bra 	BB53_3;
+	@%p3 bra 	BB55_3;
 
-BB53_4:
+BB55_4:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r11, %r10, 2;
 	mov.u32 	%r12, my_sdata;
@@ -7082,114 +8815,114 @@ BB53_4:
 	bar.sync 	0;
 	mov.u32 	%r14, %ntid.x;
 	setp.lt.u32	%p4, %r14, 1024;
-	@%p4 bra 	BB53_8;
+	@%p4 bra 	BB55_8;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB53_7;
+	@%p5 bra 	BB55_7;
 
 	ld.shared.f32 	%f29, [%r13+2048];
 	min.f32 	%f6, %f6, %f29;
 	st.shared.f32 	[%r13], %f6;
 
-BB53_7:
+BB55_7:
 	bar.sync 	0;
 
-BB53_8:
+BB55_8:
 	setp.lt.u32	%p6, %r14, 512;
-	@%p6 bra 	BB53_12;
+	@%p6 bra 	BB55_12;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB53_11;
+	@%p7 bra 	BB55_11;
 
 	ld.shared.f32 	%f30, [%r13+1024];
 	min.f32 	%f6, %f6, %f30;
 	st.shared.f32 	[%r13], %f6;
 
-BB53_11:
+BB55_11:
 	bar.sync 	0;
 
-BB53_12:
+BB55_12:
 	setp.lt.u32	%p8, %r14, 256;
-	@%p8 bra 	BB53_16;
+	@%p8 bra 	BB55_16;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB53_15;
+	@%p9 bra 	BB55_15;
 
 	ld.shared.f32 	%f31, [%r13+512];
 	min.f32 	%f6, %f6, %f31;
 	st.shared.f32 	[%r13], %f6;
 
-BB53_15:
+BB55_15:
 	bar.sync 	0;
 
-BB53_16:
+BB55_16:
 	setp.lt.u32	%p10, %r14, 128;
-	@%p10 bra 	BB53_20;
+	@%p10 bra 	BB55_20;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB53_19;
+	@%p11 bra 	BB55_19;
 
 	ld.shared.f32 	%f32, [%r13+256

<TRUNCATED>