You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ma...@apache.org on 2022/04/20 12:17:56 UTC

[systemds] 02/02: [SYSTEMDS-3352] CUDA code generation binaries

This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 29bf8f18ad4893bd22015ab4f5e46b6f8b7c218c
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Wed Apr 20 14:12:41 2022 +0200

    [SYSTEMDS-3352] CUDA code generation binaries
    
    Native code generation support compiled on Ubuntu 20.04 LTS (still on CUDA 10.2, of course)
---
 .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so |  Bin 302880 -> 285976 bytes
 src/main/cuda/kernels/reduction.ptx                | 1185 ++++++++++++--------
 2 files changed, 698 insertions(+), 487 deletions(-)

diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index ec5be11087..81d1184b18 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
index 72b922596a..8b949f9dba 100644
--- a/src/main/cuda/kernels/reduction.ptx
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -11,7 +11,14 @@
 .address_size 64
 
 	// .globl	double2float_f
+.extern .func  (.param .b32 func_retval0) vprintf
+(
+	.param .b64 vprintf_param_0,
+	.param .b64 vprintf_param_1
+)
+;
 .extern .shared .align 1 .b8 memory[];
+.global .align 1 .b8 $str[28] = {84, 66, 73, 58, 32, 118, 97, 108, 95, 115, 112, 97, 114, 115, 101, 95, 114, 99, 40, 37, 100, 44, 32, 37, 100, 41, 10, 0};
 
 .visible .entry double2float_f(
 	.param .u64 double2float_f_param_0,
@@ -95,151 +102,151 @@ BB1_2:
 	.param .u32 reduce_sum_f_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot2[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<25>;
 	.reg .f32 	%f<69>;
-	.reg .b32 	%r<57>;
-	.reg .b64 	%rd<36>;
-
-
-	ld.param.u64 	%rd9, [reduce_sum_f_param_0];
-	ld.param.u64 	%rd10, [reduce_sum_f_param_1];
-	ld.param.u32 	%r13, [reduce_sum_f_param_2];
-	mov.u32 	%r14, %ctaid.x;
-	shl.b32 	%r15, %r14, 1;
-	mov.u32 	%r16, %ntid.x;
+	.reg .b32 	%r<51>;
+	.reg .b64 	%rd<38>;
+
+
+	mov.u64 	%SPL, __local_depot2;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd10, [reduce_sum_f_param_0];
+	ld.param.u64 	%rd11, [reduce_sum_f_param_1];
+	ld.param.u32 	%r14, [reduce_sum_f_param_2];
+	mov.u32 	%r15, %ctaid.x;
+	shl.b32 	%r16, %r15, 1;
+	mov.u32 	%r1, %ntid.x;
 	mov.u32 	%r17, %tid.x;
-	mad.lo.s32 	%r56, %r15, %r16, %r17;
+	mad.lo.s32 	%r48, %r16, %r1, %r17;
 	mov.f32 	%f51, 0f00000000;
-	setp.ge.u32	%p1, %r56, %r13;
+	setp.ge.u32	%p1, %r48, %r14;
 	@%p1 bra 	BB2_11;
 
-	cvta.to.global.u64 	%rd11, %rd9;
-	ld.global.u64 	%rd1, [%rd11+16];
+	cvta.to.global.u64 	%rd12, %rd10;
+	ld.global.u64 	%rd1, [%rd12+16];
 	setp.eq.s64	%p2, %rd1, 0;
-	ld.global.u64 	%rd12, [%rd11+32];
-	cvta.to.global.u64 	%rd2, %rd12;
+	ld.global.u64 	%rd2, [%rd12+32];
+	mov.u32 	%r18, %nctaid.x;
+	mul.lo.s32 	%r19, %r1, %r18;
+	shl.b32 	%r4, %r19, 1;
 	mov.f32 	%f51, 0f00000000;
 	@%p2 bra 	BB2_8;
 
-	mad.lo.s32 	%r54, %r15, %r16, %r17;
-	mov.f32 	%f51, 0f00000000;
-	mov.u64 	%rd32, %rd1;
+	mov.u64 	%rd34, %rd1;
 
 BB2_3:
-	cvta.to.global.u64 	%rd13, %rd32;
-	mul.wide.u32 	%rd14, %r54, 4;
-	add.s64 	%rd15, %rd13, %rd14;
-	ld.global.u32 	%r27, [%rd15];
-	mul.wide.u32 	%rd16, %r27, 4;
-	add.s64 	%rd17, %rd2, %rd16;
-	ld.global.f32 	%f36, [%rd17];
+	mul.wide.u32 	%rd13, %r48, 4;
+	add.s64 	%rd14, %rd34, %rd13;
+	ld.u32 	%r20, [%rd14];
+	mul.wide.u32 	%rd15, %r20, 4;
+	add.s64 	%rd16, %rd2, %rd15;
+	ld.f32 	%f36, [%rd16];
 	add.f32 	%f51, %f51, %f36;
-	add.s32 	%r55, %r54, %r16;
-	setp.ge.u32	%p3, %r55, %r13;
+	add.s32 	%r49, %r48, %r1;
+	setp.ge.u32	%p3, %r49, %r14;
 	@%p3 bra 	BB2_7;
 
-	setp.eq.s64	%p4, %rd32, 0;
-	mov.u64 	%rd32, 0;
+	setp.eq.s64	%p4, %rd34, 0;
+	mov.u64 	%rd34, 0;
 	@%p4 bra 	BB2_6;
 
-	cvta.to.global.u64 	%rd19, %rd1;
-	mul.wide.u32 	%rd20, %r55, 4;
-	add.s64 	%rd21, %rd19, %rd20;
-	ld.global.u32 	%r55, [%rd21];
-	mov.u64 	%rd32, %rd1;
+	mul.wide.u32 	%rd18, %r49, 4;
+	add.s64 	%rd19, %rd1, %rd18;
+	ld.u32 	%r49, [%rd19];
+	mov.u64 	%rd34, %rd1;
 
 BB2_6:
-	mul.wide.u32 	%rd22, %r55, 4;
-	add.s64 	%rd23, %rd2, %rd22;
-	ld.global.f32 	%f37, [%rd23];
+	mul.wide.u32 	%rd20, %r49, 4;
+	add.s64 	%rd21, %rd2, %rd20;
+	ld.f32 	%f37, [%rd21];
 	add.f32 	%f51, %f51, %f37;
 
 BB2_7:
-	shl.b32 	%r30, %r16, 1;
-	mov.u32 	%r31, %nctaid.x;
-	mad.lo.s32 	%r54, %r30, %r31, %r54;
-	setp.lt.u32	%p5, %r54, %r13;
+	shl.b32 	%r23, %r1, 1;
+	mad.lo.s32 	%r48, %r23, %r18, %r48;
+	setp.lt.u32	%p5, %r48, %r14;
 	@%p5 bra 	BB2_3;
 	bra.uni 	BB2_11;
 
 BB2_8:
-	mul.wide.u32 	%rd24, %r56, 4;
-	add.s64 	%rd25, %rd2, %rd24;
-	ld.global.f32 	%f38, [%rd25];
+	mul.wide.u32 	%rd22, %r48, 4;
+	add.s64 	%rd23, %rd2, %rd22;
+	ld.f32 	%f38, [%rd23];
 	add.f32 	%f51, %f51, %f38;
-	add.s32 	%r10, %r56, %r16;
-	setp.ge.u32	%p6, %r10, %r13;
+	add.s32 	%r11, %r48, %r1;
+	setp.ge.u32	%p6, %r11, %r14;
 	@%p6 bra 	BB2_10;
 
-	mul.wide.u32 	%rd26, %r10, 4;
-	add.s64 	%rd27, %rd2, %rd26;
-	ld.global.f32 	%f39, [%rd27];
+	mul.wide.u32 	%rd24, %r11, 4;
+	add.s64 	%rd25, %rd2, %rd24;
+	ld.f32 	%f39, [%rd25];
 	add.f32 	%f51, %f51, %f39;
 
 BB2_10:
-	mov.u32 	%r32, %nctaid.x;
-	shl.b32 	%r33, %r16, 1;
-	mad.lo.s32 	%r56, %r33, %r32, %r56;
-	setp.lt.u32	%p7, %r56, %r13;
+	add.s32 	%r48, %r48, %r4;
+	setp.lt.u32	%p7, %r48, %r14;
 	@%p7 bra 	BB2_8;
 
 BB2_11:
-	shl.b32 	%r35, %r17, 2;
-	mov.u32 	%r36, memory;
-	add.s32 	%r12, %r36, %r35;
-	st.shared.f32 	[%r12], %f51;
+	shl.b32 	%r26, %r17, 2;
+	mov.u32 	%r27, memory;
+	add.s32 	%r13, %r27, %r26;
+	st.shared.f32 	[%r13], %f51;
 	bar.sync 	0;
-	setp.lt.u32	%p8, %r16, 1024;
+	setp.lt.u32	%p8, %r1, 1024;
 	@%p8 bra 	BB2_15;
 
 	setp.gt.u32	%p9, %r17, 511;
 	@%p9 bra 	BB2_14;
 
-	ld.shared.f32 	%f40, [%r12+2048];
+	ld.shared.f32 	%f40, [%r13+2048];
 	add.f32 	%f51, %f51, %f40;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB2_14:
 	bar.sync 	0;
 
 BB2_15:
-	setp.lt.u32	%p10, %r16, 512;
+	setp.lt.u32	%p10, %r1, 512;
 	@%p10 bra 	BB2_19;
 
 	setp.gt.u32	%p11, %r17, 255;
 	@%p11 bra 	BB2_18;
 
-	ld.shared.f32 	%f41, [%r12+1024];
+	ld.shared.f32 	%f41, [%r13+1024];
 	add.f32 	%f51, %f51, %f41;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB2_18:
 	bar.sync 	0;
 
 BB2_19:
-	setp.lt.u32	%p12, %r16, 256;
+	setp.lt.u32	%p12, %r1, 256;
 	@%p12 bra 	BB2_23;
 
 	setp.gt.u32	%p13, %r17, 127;
 	@%p13 bra 	BB2_22;
 
-	ld.shared.f32 	%f42, [%r12+512];
+	ld.shared.f32 	%f42, [%r13+512];
 	add.f32 	%f51, %f51, %f42;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB2_22:
 	bar.sync 	0;
 
 BB2_23:
-	setp.lt.u32	%p14, %r16, 128;
+	setp.lt.u32	%p14, %r1, 128;
 	@%p14 bra 	BB2_27;
 
 	setp.gt.u32	%p15, %r17, 63;
 	@%p15 bra 	BB2_26;
 
-	ld.shared.f32 	%f43, [%r12+256];
+	ld.shared.f32 	%f43, [%r13+256];
 	add.f32 	%f51, %f51, %f43;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB2_26:
 	bar.sync 	0;
@@ -248,72 +255,105 @@ BB2_27:
 	setp.gt.u32	%p16, %r17, 31;
 	@%p16 bra 	BB2_40;
 
-	setp.lt.u32	%p17, %r16, 64;
+	setp.lt.u32	%p17, %r1, 64;
 	@%p17 bra 	BB2_30;
 
-	ld.volatile.shared.f32 	%f44, [%r12+128];
+	ld.volatile.shared.f32 	%f44, [%r13+128];
 	add.f32 	%f51, %f51, %f44;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB2_30:
-	setp.lt.u32	%p18, %r16, 32;
+	setp.lt.u32	%p18, %r1, 32;
 	@%p18 bra 	BB2_32;
 
-	ld.volatile.shared.f32 	%f45, [%r12+64];
+	ld.volatile.shared.f32 	%f45, [%r13+64];
 	add.f32 	%f51, %f51, %f45;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB2_32:
-	setp.lt.u32	%p19, %r16, 16;
+	setp.lt.u32	%p19, %r1, 16;
 	@%p19 bra 	BB2_34;
 
-	ld.volatile.shared.f32 	%f46, [%r12+32];
+	ld.volatile.shared.f32 	%f46, [%r13+32];
 	add.f32 	%f51, %f51, %f46;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB2_34:
-	setp.lt.u32	%p20, %r16, 8;
+	setp.lt.u32	%p20, %r1, 8;
 	@%p20 bra 	BB2_36;
 
-	ld.volatile.shared.f32 	%f47, [%r12+16];
+	ld.volatile.shared.f32 	%f47, [%r13+16];
 	add.f32 	%f51, %f51, %f47;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB2_36:
-	setp.lt.u32	%p21, %r16, 4;
+	setp.lt.u32	%p21, %r1, 4;
 	@%p21 bra 	BB2_38;
 
-	ld.volatile.shared.f32 	%f48, [%r12+8];
+	ld.volatile.shared.f32 	%f48, [%r13+8];
 	add.f32 	%f51, %f51, %f48;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB2_38:
-	setp.lt.u32	%p22, %r16, 2;
+	setp.lt.u32	%p22, %r1, 2;
 	@%p22 bra 	BB2_40;
 
-	ld.volatile.shared.f32 	%f49, [%r12+4];
+	ld.volatile.shared.f32 	%f49, [%r13+4];
 	add.f32 	%f50, %f51, %f49;
-	st.volatile.shared.f32 	[%r12], %f50;
+	st.volatile.shared.f32 	[%r13], %f50;
 
 BB2_40:
 	setp.ne.s32	%p23, %r17, 0;
-	@%p23 bra 	BB2_44;
+	@%p23 bra 	BB2_45;
 
 	ld.shared.f32 	%f32, [memory];
-	cvta.to.global.u64 	%rd28, %rd10;
-	ld.global.u64 	%rd29, [%rd28+16];
-	ld.global.u64 	%rd30, [%rd28+32];
-	cvta.to.global.u64 	%rd35, %rd30;
-	setp.ne.s64	%p24, %rd29, 0;
+	cvta.to.global.u64 	%rd26, %rd11;
+	add.s64 	%rd6, %rd26, 16;
+	ld.global.u64 	%rd27, [%rd26+16];
+	setp.eq.s64	%p24, %rd27, 0;
 	@%p24 bra 	BB2_43;
 
-	mul.wide.u32 	%rd31, %r14, 4;
-	add.s64 	%rd35, %rd35, %rd31;
+	mov.u32 	%r44, 0;
+	add.u64 	%rd28, %SP, 0;
+	add.u64 	%rd29, %SPL, 0;
+	st.local.u32 	[%rd29], %r44;
+	st.local.u32 	[%rd29+4], %r15;
+	mov.u64 	%rd30, $str;
+	cvta.global.u64 	%rd31, %rd30;
+	// Callseq Start 0
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd31;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd28;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r46, [retval0+0];
+	
+	//{
+	}// Callseq End 0
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd37, [%rd6+16];
+	bra.uni 	BB2_44;
 
 BB2_43:
-	st.global.f32 	[%rd35], %f32;
+	ld.global.u64 	%rd32, [%rd6+16];
+	mul.wide.u32 	%rd33, %r15, 4;
+	add.s64 	%rd37, %rd32, %rd33;
 
 BB2_44:
+	st.f32 	[%rd37], %f32;
+
+BB2_45:
 	ret;
 }
 
@@ -324,151 +364,151 @@ BB2_44:
 	.param .u32 reduce_sum_d_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot3[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<25>;
-	.reg .b32 	%r<57>;
+	.reg .b32 	%r<51>;
 	.reg .f64 	%fd<69>;
-	.reg .b64 	%rd<36>;
+	.reg .b64 	%rd<38>;
 
 
-	ld.param.u64 	%rd9, [reduce_sum_d_param_0];
-	ld.param.u64 	%rd10, [reduce_sum_d_param_1];
-	ld.param.u32 	%r13, [reduce_sum_d_param_2];
-	mov.u32 	%r14, %ctaid.x;
-	shl.b32 	%r15, %r14, 1;
-	mov.u32 	%r16, %ntid.x;
+	mov.u64 	%SPL, __local_depot3;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd10, [reduce_sum_d_param_0];
+	ld.param.u64 	%rd11, [reduce_sum_d_param_1];
+	ld.param.u32 	%r14, [reduce_sum_d_param_2];
+	mov.u32 	%r15, %ctaid.x;
+	shl.b32 	%r16, %r15, 1;
+	mov.u32 	%r1, %ntid.x;
 	mov.u32 	%r17, %tid.x;
-	mad.lo.s32 	%r56, %r15, %r16, %r17;
+	mad.lo.s32 	%r48, %r16, %r1, %r17;
 	mov.f64 	%fd51, 0d0000000000000000;
-	setp.ge.u32	%p1, %r56, %r13;
+	setp.ge.u32	%p1, %r48, %r14;
 	@%p1 bra 	BB3_11;
 
-	cvta.to.global.u64 	%rd11, %rd9;
-	ld.global.u64 	%rd1, [%rd11+16];
+	cvta.to.global.u64 	%rd12, %rd10;
+	ld.global.u64 	%rd1, [%rd12+16];
 	setp.eq.s64	%p2, %rd1, 0;
-	ld.global.u64 	%rd12, [%rd11+32];
-	cvta.to.global.u64 	%rd2, %rd12;
+	ld.global.u64 	%rd2, [%rd12+32];
+	mov.u32 	%r18, %nctaid.x;
+	mul.lo.s32 	%r19, %r1, %r18;
+	shl.b32 	%r4, %r19, 1;
 	mov.f64 	%fd51, 0d0000000000000000;
 	@%p2 bra 	BB3_8;
 
-	mad.lo.s32 	%r54, %r15, %r16, %r17;
-	mov.f64 	%fd51, 0d0000000000000000;
-	mov.u64 	%rd32, %rd1;
+	mov.u64 	%rd34, %rd1;
 
 BB3_3:
-	cvta.to.global.u64 	%rd13, %rd32;
-	mul.wide.u32 	%rd14, %r54, 4;
-	add.s64 	%rd15, %rd13, %rd14;
-	ld.global.u32 	%r27, [%rd15];
-	mul.wide.u32 	%rd16, %r27, 8;
-	add.s64 	%rd17, %rd2, %rd16;
-	ld.global.f64 	%fd36, [%rd17];
+	mul.wide.u32 	%rd13, %r48, 4;
+	add.s64 	%rd14, %rd34, %rd13;
+	ld.u32 	%r20, [%rd14];
+	mul.wide.u32 	%rd15, %r20, 8;
+	add.s64 	%rd16, %rd2, %rd15;
+	ld.f64 	%fd36, [%rd16];
 	add.f64 	%fd51, %fd51, %fd36;
-	add.s32 	%r55, %r54, %r16;
-	setp.ge.u32	%p3, %r55, %r13;
+	add.s32 	%r49, %r48, %r1;
+	setp.ge.u32	%p3, %r49, %r14;
 	@%p3 bra 	BB3_7;
 
-	setp.eq.s64	%p4, %rd32, 0;
-	mov.u64 	%rd32, 0;
+	setp.eq.s64	%p4, %rd34, 0;
+	mov.u64 	%rd34, 0;
 	@%p4 bra 	BB3_6;
 
-	cvta.to.global.u64 	%rd19, %rd1;
-	mul.wide.u32 	%rd20, %r55, 4;
-	add.s64 	%rd21, %rd19, %rd20;
-	ld.global.u32 	%r55, [%rd21];
-	mov.u64 	%rd32, %rd1;
+	mul.wide.u32 	%rd18, %r49, 4;
+	add.s64 	%rd19, %rd1, %rd18;
+	ld.u32 	%r49, [%rd19];
+	mov.u64 	%rd34, %rd1;
 
 BB3_6:
-	mul.wide.u32 	%rd22, %r55, 8;
-	add.s64 	%rd23, %rd2, %rd22;
-	ld.global.f64 	%fd37, [%rd23];
+	mul.wide.u32 	%rd20, %r49, 8;
+	add.s64 	%rd21, %rd2, %rd20;
+	ld.f64 	%fd37, [%rd21];
 	add.f64 	%fd51, %fd51, %fd37;
 
 BB3_7:
-	shl.b32 	%r30, %r16, 1;
-	mov.u32 	%r31, %nctaid.x;
-	mad.lo.s32 	%r54, %r30, %r31, %r54;
-	setp.lt.u32	%p5, %r54, %r13;
+	shl.b32 	%r23, %r1, 1;
+	mad.lo.s32 	%r48, %r23, %r18, %r48;
+	setp.lt.u32	%p5, %r48, %r14;
 	@%p5 bra 	BB3_3;
 	bra.uni 	BB3_11;
 
 BB3_8:
-	mul.wide.u32 	%rd24, %r56, 8;
-	add.s64 	%rd25, %rd2, %rd24;
-	ld.global.f64 	%fd38, [%rd25];
+	mul.wide.u32 	%rd22, %r48, 8;
+	add.s64 	%rd23, %rd2, %rd22;
+	ld.f64 	%fd38, [%rd23];
 	add.f64 	%fd51, %fd51, %fd38;
-	add.s32 	%r10, %r56, %r16;
-	setp.ge.u32	%p6, %r10, %r13;
+	add.s32 	%r11, %r48, %r1;
+	setp.ge.u32	%p6, %r11, %r14;
 	@%p6 bra 	BB3_10;
 
-	mul.wide.u32 	%rd26, %r10, 8;
-	add.s64 	%rd27, %rd2, %rd26;
-	ld.global.f64 	%fd39, [%rd27];
+	mul.wide.u32 	%rd24, %r11, 8;
+	add.s64 	%rd25, %rd2, %rd24;
+	ld.f64 	%fd39, [%rd25];
 	add.f64 	%fd51, %fd51, %fd39;
 
 BB3_10:
-	mov.u32 	%r32, %nctaid.x;
-	shl.b32 	%r33, %r16, 1;
-	mad.lo.s32 	%r56, %r33, %r32, %r56;
-	setp.lt.u32	%p7, %r56, %r13;
+	add.s32 	%r48, %r48, %r4;
+	setp.lt.u32	%p7, %r48, %r14;
 	@%p7 bra 	BB3_8;
 
 BB3_11:
-	shl.b32 	%r35, %r17, 3;
-	mov.u32 	%r36, memory;
-	add.s32 	%r12, %r36, %r35;
-	st.shared.f64 	[%r12], %fd51;
+	shl.b32 	%r26, %r17, 3;
+	mov.u32 	%r27, memory;
+	add.s32 	%r13, %r27, %r26;
+	st.shared.f64 	[%r13], %fd51;
 	bar.sync 	0;
-	setp.lt.u32	%p8, %r16, 1024;
+	setp.lt.u32	%p8, %r1, 1024;
 	@%p8 bra 	BB3_15;
 
 	setp.gt.u32	%p9, %r17, 511;
 	@%p9 bra 	BB3_14;
 
-	ld.shared.f64 	%fd40, [%r12+4096];
+	ld.shared.f64 	%fd40, [%r13+4096];
 	add.f64 	%fd51, %fd51, %fd40;
-	st.shared.f64 	[%r12], %fd51;
+	st.shared.f64 	[%r13], %fd51;
 
 BB3_14:
 	bar.sync 	0;
 
 BB3_15:
-	setp.lt.u32	%p10, %r16, 512;
+	setp.lt.u32	%p10, %r1, 512;
 	@%p10 bra 	BB3_19;
 
 	setp.gt.u32	%p11, %r17, 255;
 	@%p11 bra 	BB3_18;
 
-	ld.shared.f64 	%fd41, [%r12+2048];
+	ld.shared.f64 	%fd41, [%r13+2048];
 	add.f64 	%fd51, %fd51, %fd41;
-	st.shared.f64 	[%r12], %fd51;
+	st.shared.f64 	[%r13], %fd51;
 
 BB3_18:
 	bar.sync 	0;
 
 BB3_19:
-	setp.lt.u32	%p12, %r16, 256;
+	setp.lt.u32	%p12, %r1, 256;
 	@%p12 bra 	BB3_23;
 
 	setp.gt.u32	%p13, %r17, 127;
 	@%p13 bra 	BB3_22;
 
-	ld.shared.f64 	%fd42, [%r12+1024];
+	ld.shared.f64 	%fd42, [%r13+1024];
 	add.f64 	%fd51, %fd51, %fd42;
-	st.shared.f64 	[%r12], %fd51;
+	st.shared.f64 	[%r13], %fd51;
 
 BB3_22:
 	bar.sync 	0;
 
 BB3_23:
-	setp.lt.u32	%p14, %r16, 128;
+	setp.lt.u32	%p14, %r1, 128;
 	@%p14 bra 	BB3_27;
 
 	setp.gt.u32	%p15, %r17, 63;
 	@%p15 bra 	BB3_26;
 
-	ld.shared.f64 	%fd43, [%r12+512];
+	ld.shared.f64 	%fd43, [%r13+512];
 	add.f64 	%fd51, %fd51, %fd43;
-	st.shared.f64 	[%r12], %fd51;
+	st.shared.f64 	[%r13], %fd51;
 
 BB3_26:
 	bar.sync 	0;
@@ -477,72 +517,105 @@ BB3_27:
 	setp.gt.u32	%p16, %r17, 31;
 	@%p16 bra 	BB3_40;
 
-	setp.lt.u32	%p17, %r16, 64;
+	setp.lt.u32	%p17, %r1, 64;
 	@%p17 bra 	BB3_30;
 
-	ld.volatile.shared.f64 	%fd44, [%r12+256];
+	ld.volatile.shared.f64 	%fd44, [%r13+256];
 	add.f64 	%fd51, %fd51, %fd44;
-	st.volatile.shared.f64 	[%r12], %fd51;
+	st.volatile.shared.f64 	[%r13], %fd51;
 
 BB3_30:
-	setp.lt.u32	%p18, %r16, 32;
+	setp.lt.u32	%p18, %r1, 32;
 	@%p18 bra 	BB3_32;
 
-	ld.volatile.shared.f64 	%fd45, [%r12+128];
+	ld.volatile.shared.f64 	%fd45, [%r13+128];
 	add.f64 	%fd51, %fd51, %fd45;
-	st.volatile.shared.f64 	[%r12], %fd51;
+	st.volatile.shared.f64 	[%r13], %fd51;
 
 BB3_32:
-	setp.lt.u32	%p19, %r16, 16;
+	setp.lt.u32	%p19, %r1, 16;
 	@%p19 bra 	BB3_34;
 
-	ld.volatile.shared.f64 	%fd46, [%r12+64];
+	ld.volatile.shared.f64 	%fd46, [%r13+64];
 	add.f64 	%fd51, %fd51, %fd46;
-	st.volatile.shared.f64 	[%r12], %fd51;
+	st.volatile.shared.f64 	[%r13], %fd51;
 
 BB3_34:
-	setp.lt.u32	%p20, %r16, 8;
+	setp.lt.u32	%p20, %r1, 8;
 	@%p20 bra 	BB3_36;
 
-	ld.volatile.shared.f64 	%fd47, [%r12+32];
+	ld.volatile.shared.f64 	%fd47, [%r13+32];
 	add.f64 	%fd51, %fd51, %fd47;
-	st.volatile.shared.f64 	[%r12], %fd51;
+	st.volatile.shared.f64 	[%r13], %fd51;
 
 BB3_36:
-	setp.lt.u32	%p21, %r16, 4;
+	setp.lt.u32	%p21, %r1, 4;
 	@%p21 bra 	BB3_38;
 
-	ld.volatile.shared.f64 	%fd48, [%r12+16];
+	ld.volatile.shared.f64 	%fd48, [%r13+16];
 	add.f64 	%fd51, %fd51, %fd48;
-	st.volatile.shared.f64 	[%r12], %fd51;
+	st.volatile.shared.f64 	[%r13], %fd51;
 
 BB3_38:
-	setp.lt.u32	%p22, %r16, 2;
+	setp.lt.u32	%p22, %r1, 2;
 	@%p22 bra 	BB3_40;
 
-	ld.volatile.shared.f64 	%fd49, [%r12+8];
+	ld.volatile.shared.f64 	%fd49, [%r13+8];
 	add.f64 	%fd50, %fd51, %fd49;
-	st.volatile.shared.f64 	[%r12], %fd50;
+	st.volatile.shared.f64 	[%r13], %fd50;
 
 BB3_40:
 	setp.ne.s32	%p23, %r17, 0;
-	@%p23 bra 	BB3_44;
+	@%p23 bra 	BB3_45;
 
 	ld.shared.f64 	%fd32, [memory];
-	cvta.to.global.u64 	%rd28, %rd10;
-	ld.global.u64 	%rd29, [%rd28+16];
-	ld.global.u64 	%rd30, [%rd28+32];
-	cvta.to.global.u64 	%rd35, %rd30;
-	setp.ne.s64	%p24, %rd29, 0;
+	cvta.to.global.u64 	%rd26, %rd11;
+	add.s64 	%rd6, %rd26, 16;
+	ld.global.u64 	%rd27, [%rd26+16];
+	setp.eq.s64	%p24, %rd27, 0;
 	@%p24 bra 	BB3_43;
 
-	mul.wide.u32 	%rd31, %r14, 8;
-	add.s64 	%rd35, %rd35, %rd31;
+	mov.u32 	%r44, 0;
+	add.u64 	%rd28, %SP, 0;
+	add.u64 	%rd29, %SPL, 0;
+	st.local.u32 	[%rd29], %r44;
+	st.local.u32 	[%rd29+4], %r15;
+	mov.u64 	%rd30, $str;
+	cvta.global.u64 	%rd31, %rd30;
+	// Callseq Start 1
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd31;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd28;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r46, [retval0+0];
+	
+	//{
+	}// Callseq End 1
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd37, [%rd6+16];
+	bra.uni 	BB3_44;
 
 BB3_43:
-	st.global.f64 	[%rd35], %fd32;
+	ld.global.u64 	%rd32, [%rd6+16];
+	mul.wide.u32 	%rd33, %r15, 8;
+	add.s64 	%rd37, %rd32, %rd33;
 
 BB3_44:
+	st.f64 	[%rd37], %fd32;
+
+BB3_45:
 	ret;
 }
 
@@ -553,151 +626,151 @@ BB3_44:
 	.param .u32 reduce_max_f_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot4[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<25>;
 	.reg .f32 	%f<69>;
-	.reg .b32 	%r<57>;
-	.reg .b64 	%rd<36>;
-
-
-	ld.param.u64 	%rd9, [reduce_max_f_param_0];
-	ld.param.u64 	%rd10, [reduce_max_f_param_1];
-	ld.param.u32 	%r13, [reduce_max_f_param_2];
-	mov.u32 	%r14, %ctaid.x;
-	shl.b32 	%r15, %r14, 1;
-	mov.u32 	%r16, %ntid.x;
+	.reg .b32 	%r<51>;
+	.reg .b64 	%rd<38>;
+
+
+	mov.u64 	%SPL, __local_depot4;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd10, [reduce_max_f_param_0];
+	ld.param.u64 	%rd11, [reduce_max_f_param_1];
+	ld.param.u32 	%r14, [reduce_max_f_param_2];
+	mov.u32 	%r15, %ctaid.x;
+	shl.b32 	%r16, %r15, 1;
+	mov.u32 	%r1, %ntid.x;
 	mov.u32 	%r17, %tid.x;
-	mad.lo.s32 	%r56, %r15, %r16, %r17;
+	mad.lo.s32 	%r48, %r16, %r1, %r17;
 	mov.f32 	%f51, 0fFF800000;
-	setp.ge.u32	%p1, %r56, %r13;
+	setp.ge.u32	%p1, %r48, %r14;
 	@%p1 bra 	BB4_11;
 
-	cvta.to.global.u64 	%rd11, %rd9;
-	ld.global.u64 	%rd1, [%rd11+16];
+	cvta.to.global.u64 	%rd12, %rd10;
+	ld.global.u64 	%rd1, [%rd12+16];
 	setp.eq.s64	%p2, %rd1, 0;
-	ld.global.u64 	%rd12, [%rd11+32];
-	cvta.to.global.u64 	%rd2, %rd12;
+	ld.global.u64 	%rd2, [%rd12+32];
+	mov.u32 	%r18, %nctaid.x;
+	mul.lo.s32 	%r19, %r1, %r18;
+	shl.b32 	%r4, %r19, 1;
 	mov.f32 	%f51, 0fFF800000;
 	@%p2 bra 	BB4_8;
 
-	mad.lo.s32 	%r54, %r15, %r16, %r17;
-	mov.f32 	%f51, 0fFF800000;
-	mov.u64 	%rd32, %rd1;
+	mov.u64 	%rd34, %rd1;
 
 BB4_3:
-	cvta.to.global.u64 	%rd13, %rd32;
-	mul.wide.u32 	%rd14, %r54, 4;
-	add.s64 	%rd15, %rd13, %rd14;
-	ld.global.u32 	%r27, [%rd15];
-	mul.wide.u32 	%rd16, %r27, 4;
-	add.s64 	%rd17, %rd2, %rd16;
-	ld.global.f32 	%f36, [%rd17];
+	mul.wide.u32 	%rd13, %r48, 4;
+	add.s64 	%rd14, %rd34, %rd13;
+	ld.u32 	%r20, [%rd14];
+	mul.wide.u32 	%rd15, %r20, 4;
+	add.s64 	%rd16, %rd2, %rd15;
+	ld.f32 	%f36, [%rd16];
 	max.f32 	%f51, %f51, %f36;
-	add.s32 	%r55, %r54, %r16;
-	setp.ge.u32	%p3, %r55, %r13;
+	add.s32 	%r49, %r48, %r1;
+	setp.ge.u32	%p3, %r49, %r14;
 	@%p3 bra 	BB4_7;
 
-	setp.eq.s64	%p4, %rd32, 0;
-	mov.u64 	%rd32, 0;
+	setp.eq.s64	%p4, %rd34, 0;
+	mov.u64 	%rd34, 0;
 	@%p4 bra 	BB4_6;
 
-	cvta.to.global.u64 	%rd19, %rd1;
-	mul.wide.u32 	%rd20, %r55, 4;
-	add.s64 	%rd21, %rd19, %rd20;
-	ld.global.u32 	%r55, [%rd21];
-	mov.u64 	%rd32, %rd1;
+	mul.wide.u32 	%rd18, %r49, 4;
+	add.s64 	%rd19, %rd1, %rd18;
+	ld.u32 	%r49, [%rd19];
+	mov.u64 	%rd34, %rd1;
 
 BB4_6:
-	mul.wide.u32 	%rd22, %r55, 4;
-	add.s64 	%rd23, %rd2, %rd22;
-	ld.global.f32 	%f37, [%rd23];
+	mul.wide.u32 	%rd20, %r49, 4;
+	add.s64 	%rd21, %rd2, %rd20;
+	ld.f32 	%f37, [%rd21];
 	max.f32 	%f51, %f51, %f37;
 
 BB4_7:
-	shl.b32 	%r30, %r16, 1;
-	mov.u32 	%r31, %nctaid.x;
-	mad.lo.s32 	%r54, %r30, %r31, %r54;
-	setp.lt.u32	%p5, %r54, %r13;
+	shl.b32 	%r23, %r1, 1;
+	mad.lo.s32 	%r48, %r23, %r18, %r48;
+	setp.lt.u32	%p5, %r48, %r14;
 	@%p5 bra 	BB4_3;
 	bra.uni 	BB4_11;
 
 BB4_8:
-	mul.wide.u32 	%rd24, %r56, 4;
-	add.s64 	%rd25, %rd2, %rd24;
-	ld.global.f32 	%f38, [%rd25];
+	mul.wide.u32 	%rd22, %r48, 4;
+	add.s64 	%rd23, %rd2, %rd22;
+	ld.f32 	%f38, [%rd23];
 	max.f32 	%f51, %f51, %f38;
-	add.s32 	%r10, %r56, %r16;
-	setp.ge.u32	%p6, %r10, %r13;
+	add.s32 	%r11, %r48, %r1;
+	setp.ge.u32	%p6, %r11, %r14;
 	@%p6 bra 	BB4_10;
 
-	mul.wide.u32 	%rd26, %r10, 4;
-	add.s64 	%rd27, %rd2, %rd26;
-	ld.global.f32 	%f39, [%rd27];
+	mul.wide.u32 	%rd24, %r11, 4;
+	add.s64 	%rd25, %rd2, %rd24;
+	ld.f32 	%f39, [%rd25];
 	max.f32 	%f51, %f51, %f39;
 
 BB4_10:
-	mov.u32 	%r32, %nctaid.x;
-	shl.b32 	%r33, %r16, 1;
-	mad.lo.s32 	%r56, %r33, %r32, %r56;
-	setp.lt.u32	%p7, %r56, %r13;
+	add.s32 	%r48, %r48, %r4;
+	setp.lt.u32	%p7, %r48, %r14;
 	@%p7 bra 	BB4_8;
 
 BB4_11:
-	shl.b32 	%r35, %r17, 2;
-	mov.u32 	%r36, memory;
-	add.s32 	%r12, %r36, %r35;
-	st.shared.f32 	[%r12], %f51;
+	shl.b32 	%r26, %r17, 2;
+	mov.u32 	%r27, memory;
+	add.s32 	%r13, %r27, %r26;
+	st.shared.f32 	[%r13], %f51;
 	bar.sync 	0;
-	setp.lt.u32	%p8, %r16, 1024;
+	setp.lt.u32	%p8, %r1, 1024;
 	@%p8 bra 	BB4_15;
 
 	setp.gt.u32	%p9, %r17, 511;
 	@%p9 bra 	BB4_14;
 
-	ld.shared.f32 	%f40, [%r12+2048];
+	ld.shared.f32 	%f40, [%r13+2048];
 	max.f32 	%f51, %f51, %f40;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB4_14:
 	bar.sync 	0;
 
 BB4_15:
-	setp.lt.u32	%p10, %r16, 512;
+	setp.lt.u32	%p10, %r1, 512;
 	@%p10 bra 	BB4_19;
 
 	setp.gt.u32	%p11, %r17, 255;
 	@%p11 bra 	BB4_18;
 
-	ld.shared.f32 	%f41, [%r12+1024];
+	ld.shared.f32 	%f41, [%r13+1024];
 	max.f32 	%f51, %f51, %f41;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB4_18:
 	bar.sync 	0;
 
 BB4_19:
-	setp.lt.u32	%p12, %r16, 256;
+	setp.lt.u32	%p12, %r1, 256;
 	@%p12 bra 	BB4_23;
 
 	setp.gt.u32	%p13, %r17, 127;
 	@%p13 bra 	BB4_22;
 
-	ld.shared.f32 	%f42, [%r12+512];
+	ld.shared.f32 	%f42, [%r13+512];
 	max.f32 	%f51, %f51, %f42;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB4_22:
 	bar.sync 	0;
 
 BB4_23:
-	setp.lt.u32	%p14, %r16, 128;
+	setp.lt.u32	%p14, %r1, 128;
 	@%p14 bra 	BB4_27;
 
 	setp.gt.u32	%p15, %r17, 63;
 	@%p15 bra 	BB4_26;
 
-	ld.shared.f32 	%f43, [%r12+256];
+	ld.shared.f32 	%f43, [%r13+256];
 	max.f32 	%f51, %f51, %f43;
-	st.shared.f32 	[%r12], %f51;
+	st.shared.f32 	[%r13], %f51;
 
 BB4_26:
 	bar.sync 	0;
@@ -706,72 +779,105 @@ BB4_27:
 	setp.gt.u32	%p16, %r17, 31;
 	@%p16 bra 	BB4_40;
 
-	setp.lt.u32	%p17, %r16, 64;
+	setp.lt.u32	%p17, %r1, 64;
 	@%p17 bra 	BB4_30;
 
-	ld.volatile.shared.f32 	%f44, [%r12+128];
+	ld.volatile.shared.f32 	%f44, [%r13+128];
 	max.f32 	%f51, %f51, %f44;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB4_30:
-	setp.lt.u32	%p18, %r16, 32;
+	setp.lt.u32	%p18, %r1, 32;
 	@%p18 bra 	BB4_32;
 
-	ld.volatile.shared.f32 	%f45, [%r12+64];
+	ld.volatile.shared.f32 	%f45, [%r13+64];
 	max.f32 	%f51, %f51, %f45;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB4_32:
-	setp.lt.u32	%p19, %r16, 16;
+	setp.lt.u32	%p19, %r1, 16;
 	@%p19 bra 	BB4_34;
 
-	ld.volatile.shared.f32 	%f46, [%r12+32];
+	ld.volatile.shared.f32 	%f46, [%r13+32];
 	max.f32 	%f51, %f51, %f46;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB4_34:
-	setp.lt.u32	%p20, %r16, 8;
+	setp.lt.u32	%p20, %r1, 8;
 	@%p20 bra 	BB4_36;
 
-	ld.volatile.shared.f32 	%f47, [%r12+16];
+	ld.volatile.shared.f32 	%f47, [%r13+16];
 	max.f32 	%f51, %f51, %f47;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB4_36:
-	setp.lt.u32	%p21, %r16, 4;
+	setp.lt.u32	%p21, %r1, 4;
 	@%p21 bra 	BB4_38;
 
-	ld.volatile.shared.f32 	%f48, [%r12+8];
+	ld.volatile.shared.f32 	%f48, [%r13+8];
 	max.f32 	%f51, %f51, %f48;
-	st.volatile.shared.f32 	[%r12], %f51;
+	st.volatile.shared.f32 	[%r13], %f51;
 
 BB4_38:
-	setp.lt.u32	%p22, %r16, 2;
+	setp.lt.u32	%p22, %r1, 2;
 	@%p22 bra 	BB4_40;
 
-	ld.volatile.shared.f32 	%f49, [%r12+4];
+	ld.volatile.shared.f32 	%f49, [%r13+4];
 	max.f32 	%f50, %f51, %f49;
-	st.volatile.shared.f32 	[%r12], %f50;
+	st.volatile.shared.f32 	[%r13], %f50;
 
 BB4_40:
 	setp.ne.s32	%p23, %r17, 0;
-	@%p23 bra 	BB4_44;
+	@%p23 bra 	BB4_45;
 
 	ld.shared.f32 	%f32, [memory];
-	cvta.to.global.u64 	%rd28, %rd10;
-	ld.global.u64 	%rd29, [%rd28+16];
-	ld.global.u64 	%rd30, [%rd28+32];
-	cvta.to.global.u64 	%rd35, %rd30;
-	setp.ne.s64	%p24, %rd29, 0;
+	cvta.to.global.u64 	%rd26, %rd11;
+	add.s64 	%rd6, %rd26, 16;
+	ld.global.u64 	%rd27, [%rd26+16];
+	setp.eq.s64	%p24, %rd27, 0;
 	@%p24 bra 	BB4_43;
 
-	mul.wide.u32 	%rd31, %r14, 4;
-	add.s64 	%rd35, %rd35, %rd31;
+	mov.u32 	%r44, 0;
+	add.u64 	%rd28, %SP, 0;
+	add.u64 	%rd29, %SPL, 0;
+	st.local.u32 	[%rd29], %r44;
+	st.local.u32 	[%rd29+4], %r15;
+	mov.u64 	%rd30, $str;
+	cvta.global.u64 	%rd31, %rd30;
+	// Callseq Start 2
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd31;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd28;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r46, [retval0+0];
+	
+	//{
+	}// Callseq End 2
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd37, [%rd6+16];
+	bra.uni 	BB4_44;
 
 BB4_43:
-	st.global.f32 	[%rd35], %f32;
+	ld.global.u64 	%rd32, [%rd6+16];
+	mul.wide.u32 	%rd33, %r15, 4;
+	add.s64 	%rd37, %rd32, %rd33;
 
 BB4_44:
+	st.f32 	[%rd37], %f32;
+
+BB4_45:
 	ret;
 }
 
@@ -782,85 +888,87 @@ BB4_44:
 	.param .u32 reduce_max_d_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot5[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<23>;
-	.reg .b32 	%r<46>;
+	.reg .b32 	%r<49>;
 	.reg .f64 	%fd<60>;
-	.reg .b64 	%rd<34>;
+	.reg .b64 	%rd<36>;
 
 
-	ld.param.u64 	%rd10, [reduce_max_d_param_0];
-	ld.param.u64 	%rd11, [reduce_max_d_param_1];
+	mov.u64 	%SPL, __local_depot5;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd11, [reduce_max_d_param_0];
+	ld.param.u64 	%rd12, [reduce_max_d_param_1];
 	ld.param.u32 	%r10, [reduce_max_d_param_2];
-	mov.u32 	%r11, %tid.x;
-	mov.u32 	%r12, %ctaid.x;
-	shl.b32 	%r13, %r12, 1;
-	mov.u32 	%r14, %ntid.x;
-	mad.lo.s32 	%r43, %r13, %r14, %r11;
+	mov.u32 	%r11, %ctaid.x;
+	shl.b32 	%r12, %r11, 1;
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %tid.x;
+	mad.lo.s32 	%r46, %r12, %r13, %r14;
 	mov.f64 	%fd44, 0dFFF0000000000000;
-	setp.ge.u32	%p1, %r43, %r10;
+	setp.ge.u32	%p1, %r46, %r10;
 	@%p1 bra 	BB5_9;
 
-	cvta.to.global.u64 	%rd12, %rd10;
-	ld.global.u64 	%rd1, [%rd12+16];
-	ld.global.u64 	%rd13, [%rd12+32];
-	cvta.to.global.u64 	%rd2, %rd13;
+	cvta.to.global.u64 	%rd13, %rd11;
+	ld.global.u64 	%rd1, [%rd13+16];
+	ld.global.u64 	%rd2, [%rd13+32];
 	mov.f64 	%fd44, 0dFFF0000000000000;
-	mov.u64 	%rd30, %rd1;
+	mov.u64 	%rd32, %rd1;
 
 BB5_2:
 	setp.eq.s64	%p2, %rd1, 0;
-	mov.u32 	%r44, %r43;
+	mov.u32 	%r47, %r46;
 	@%p2 bra 	BB5_4;
 
-	cvta.to.global.u64 	%rd14, %rd1;
-	mul.wide.u32 	%rd15, %r43, 4;
-	add.s64 	%rd16, %rd14, %rd15;
-	ld.global.u32 	%r44, [%rd16];
-	mov.u64 	%rd30, %rd1;
+	mul.wide.u32 	%rd14, %r46, 4;
+	add.s64 	%rd15, %rd1, %rd14;
+	ld.u32 	%r47, [%rd15];
+	mov.u64 	%rd32, %rd1;
 
 BB5_4:
-	mul.wide.u32 	%rd17, %r44, 8;
-	add.s64 	%rd18, %rd2, %rd17;
-	ld.global.f64 	%fd31, [%rd18];
+	mul.wide.u32 	%rd16, %r47, 8;
+	add.s64 	%rd17, %rd2, %rd16;
+	ld.f64 	%fd31, [%rd17];
 	max.f64 	%fd44, %fd44, %fd31;
-	add.s32 	%r45, %r43, %r14;
-	setp.ge.u32	%p3, %r45, %r10;
+	add.s32 	%r48, %r46, %r13;
+	setp.ge.u32	%p3, %r48, %r10;
 	@%p3 bra 	BB5_8;
 
-	setp.eq.s64	%p4, %rd30, 0;
-	mov.u64 	%rd30, 0;
+	setp.eq.s64	%p4, %rd32, 0;
+	mov.u64 	%rd32, 0;
 	@%p4 bra 	BB5_7;
 
-	cvta.to.global.u64 	%rd20, %rd1;
-	add.s32 	%r19, %r43, %r14;
-	mul.wide.u32 	%rd21, %r19, 4;
-	add.s64 	%rd22, %rd20, %rd21;
-	ld.global.u32 	%r45, [%rd22];
-	mov.u64 	%rd30, %rd1;
+	add.s32 	%r19, %r46, %r13;
+	mul.wide.u32 	%rd19, %r19, 4;
+	add.s64 	%rd20, %rd1, %rd19;
+	ld.u32 	%r48, [%rd20];
+	mov.u64 	%rd32, %rd1;
 
 BB5_7:
-	mul.wide.u32 	%rd23, %r45, 8;
-	add.s64 	%rd24, %rd2, %rd23;
-	ld.global.f64 	%fd32, [%rd24];
+	mul.wide.u32 	%rd21, %r48, 8;
+	add.s64 	%rd22, %rd2, %rd21;
+	ld.f64 	%fd32, [%rd22];
 	max.f64 	%fd44, %fd44, %fd32;
 
 BB5_8:
-	shl.b32 	%r21, %r14, 1;
+	shl.b32 	%r21, %r13, 1;
 	mov.u32 	%r22, %nctaid.x;
-	mad.lo.s32 	%r43, %r21, %r22, %r43;
-	setp.lt.u32	%p5, %r43, %r10;
+	mad.lo.s32 	%r46, %r21, %r22, %r46;
+	setp.lt.u32	%p5, %r46, %r10;
 	@%p5 bra 	BB5_2;
 
 BB5_9:
-	shl.b32 	%r24, %r11, 3;
+	shl.b32 	%r24, %r14, 3;
 	mov.u32 	%r25, memory;
 	add.s32 	%r9, %r25, %r24;
 	st.shared.f64 	[%r9], %fd44;
 	bar.sync 	0;
-	setp.lt.u32	%p6, %r14, 1024;
+	setp.lt.u32	%p6, %r13, 1024;
 	@%p6 bra 	BB5_13;
 
-	setp.gt.u32	%p7, %r11, 511;
+	setp.gt.u32	%p7, %r14, 511;
 	@%p7 bra 	BB5_12;
 
 	ld.shared.f64 	%fd33, [%r9+4096];
@@ -871,10 +979,10 @@ BB5_12:
 	bar.sync 	0;
 
 BB5_13:
-	setp.lt.u32	%p8, %r14, 512;
+	setp.lt.u32	%p8, %r13, 512;
 	@%p8 bra 	BB5_17;
 
-	setp.gt.u32	%p9, %r11, 255;
+	setp.gt.u32	%p9, %r14, 255;
 	@%p9 bra 	BB5_16;
 
 	ld.shared.f64 	%fd34, [%r9+2048];
@@ -885,10 +993,10 @@ BB5_16:
 	bar.sync 	0;
 
 BB5_17:
-	setp.lt.u32	%p10, %r14, 256;
+	setp.lt.u32	%p10, %r13, 256;
 	@%p10 bra 	BB5_21;
 
-	setp.gt.u32	%p11, %r11, 127;
+	setp.gt.u32	%p11, %r14, 127;
 	@%p11 bra 	BB5_20;
 
 	ld.shared.f64 	%fd35, [%r9+1024];
@@ -899,10 +1007,10 @@ BB5_20:
 	bar.sync 	0;
 
 BB5_21:
-	setp.lt.u32	%p12, %r14, 128;
+	setp.lt.u32	%p12, %r13, 128;
 	@%p12 bra 	BB5_25;
 
-	setp.gt.u32	%p13, %r11, 63;
+	setp.gt.u32	%p13, %r14, 63;
 	@%p13 bra 	BB5_24;
 
 	ld.shared.f64 	%fd36, [%r9+512];
@@ -913,10 +1021,10 @@ BB5_24:
 	bar.sync 	0;
 
 BB5_25:
-	setp.gt.u32	%p14, %r11, 31;
+	setp.gt.u32	%p14, %r14, 31;
 	@%p14 bra 	BB5_38;
 
-	setp.lt.u32	%p15, %r14, 64;
+	setp.lt.u32	%p15, %r13, 64;
 	@%p15 bra 	BB5_28;
 
 	ld.volatile.shared.f64 	%fd37, [%r9+256];
@@ -924,7 +1032,7 @@ BB5_25:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB5_28:
-	setp.lt.u32	%p16, %r14, 32;
+	setp.lt.u32	%p16, %r13, 32;
 	@%p16 bra 	BB5_30;
 
 	ld.volatile.shared.f64 	%fd38, [%r9+128];
@@ -932,7 +1040,7 @@ BB5_28:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB5_30:
-	setp.lt.u32	%p17, %r14, 16;
+	setp.lt.u32	%p17, %r13, 16;
 	@%p17 bra 	BB5_32;
 
 	ld.volatile.shared.f64 	%fd39, [%r9+64];
@@ -940,7 +1048,7 @@ BB5_30:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB5_32:
-	setp.lt.u32	%p18, %r14, 8;
+	setp.lt.u32	%p18, %r13, 8;
 	@%p18 bra 	BB5_34;
 
 	ld.volatile.shared.f64 	%fd40, [%r9+32];
@@ -948,7 +1056,7 @@ BB5_32:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB5_34:
-	setp.lt.u32	%p19, %r14, 4;
+	setp.lt.u32	%p19, %r13, 4;
 	@%p19 bra 	BB5_36;
 
 	ld.volatile.shared.f64 	%fd41, [%r9+16];
@@ -956,7 +1064,7 @@ BB5_34:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB5_36:
-	setp.lt.u32	%p20, %r14, 2;
+	setp.lt.u32	%p20, %r13, 2;
 	@%p20 bra 	BB5_38;
 
 	ld.volatile.shared.f64 	%fd42, [%r9+8];
@@ -964,24 +1072,57 @@ BB5_36:
 	st.volatile.shared.f64 	[%r9], %fd43;
 
 BB5_38:
-	setp.ne.s32	%p21, %r11, 0;
-	@%p21 bra 	BB5_42;
+	setp.ne.s32	%p21, %r14, 0;
+	@%p21 bra 	BB5_43;
 
 	ld.shared.f64 	%fd28, [memory];
-	cvta.to.global.u64 	%rd25, %rd11;
-	ld.global.u64 	%rd26, [%rd25+16];
-	ld.global.u64 	%rd27, [%rd25+32];
-	cvta.to.global.u64 	%rd33, %rd27;
-	setp.ne.s64	%p22, %rd26, 0;
+	cvta.to.global.u64 	%rd23, %rd12;
+	add.s64 	%rd7, %rd23, 16;
+	ld.global.u64 	%rd24, [%rd23+16];
+	setp.eq.s64	%p22, %rd24, 0;
 	@%p22 bra 	BB5_41;
 
-	mul.wide.u32 	%rd28, %r12, 8;
-	add.s64 	%rd33, %rd33, %rd28;
+	mov.u32 	%r42, 0;
+	add.u64 	%rd25, %SP, 0;
+	add.u64 	%rd26, %SPL, 0;
+	st.local.u32 	[%rd26], %r42;
+	st.local.u32 	[%rd26+4], %r11;
+	mov.u64 	%rd27, $str;
+	cvta.global.u64 	%rd28, %rd27;
+	// Callseq Start 3
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd28;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd25;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r44, [retval0+0];
+	
+	//{
+	}// Callseq End 3
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd35, [%rd7+16];
+	bra.uni 	BB5_42;
 
 BB5_41:
-	st.global.f64 	[%rd33], %fd28;
+	ld.global.u64 	%rd29, [%rd7+16];
+	mul.wide.u32 	%rd30, %r11, 8;
+	add.s64 	%rd35, %rd29, %rd30;
 
 BB5_42:
+	st.f64 	[%rd35], %fd28;
+
+BB5_43:
 	ret;
 }
 
@@ -992,85 +1133,87 @@ BB5_42:
 	.param .u32 reduce_min_f_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot6[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<23>;
 	.reg .f32 	%f<60>;
-	.reg .b32 	%r<46>;
-	.reg .b64 	%rd<34>;
+	.reg .b32 	%r<49>;
+	.reg .b64 	%rd<36>;
 
 
-	ld.param.u64 	%rd10, [reduce_min_f_param_0];
-	ld.param.u64 	%rd11, [reduce_min_f_param_1];
+	mov.u64 	%SPL, __local_depot6;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd11, [reduce_min_f_param_0];
+	ld.param.u64 	%rd12, [reduce_min_f_param_1];
 	ld.param.u32 	%r10, [reduce_min_f_param_2];
-	mov.u32 	%r11, %tid.x;
-	mov.u32 	%r12, %ctaid.x;
-	shl.b32 	%r13, %r12, 1;
-	mov.u32 	%r14, %ntid.x;
-	mad.lo.s32 	%r43, %r13, %r14, %r11;
+	mov.u32 	%r11, %ctaid.x;
+	shl.b32 	%r12, %r11, 1;
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %tid.x;
+	mad.lo.s32 	%r46, %r12, %r13, %r14;
 	mov.f32 	%f44, 0f7F800000;
-	setp.ge.u32	%p1, %r43, %r10;
+	setp.ge.u32	%p1, %r46, %r10;
 	@%p1 bra 	BB6_9;
 
-	cvta.to.global.u64 	%rd12, %rd10;
-	ld.global.u64 	%rd1, [%rd12+16];
-	ld.global.u64 	%rd13, [%rd12+32];
-	cvta.to.global.u64 	%rd2, %rd13;
+	cvta.to.global.u64 	%rd13, %rd11;
+	ld.global.u64 	%rd1, [%rd13+16];
+	ld.global.u64 	%rd2, [%rd13+32];
 	mov.f32 	%f44, 0f7F800000;
-	mov.u64 	%rd30, %rd1;
+	mov.u64 	%rd32, %rd1;
 
 BB6_2:
 	setp.eq.s64	%p2, %rd1, 0;
-	mov.u32 	%r44, %r43;
+	mov.u32 	%r47, %r46;
 	@%p2 bra 	BB6_4;
 
-	cvta.to.global.u64 	%rd14, %rd1;
-	mul.wide.u32 	%rd15, %r43, 4;
-	add.s64 	%rd16, %rd14, %rd15;
-	ld.global.u32 	%r44, [%rd16];
-	mov.u64 	%rd30, %rd1;
+	mul.wide.u32 	%rd14, %r46, 4;
+	add.s64 	%rd15, %rd1, %rd14;
+	ld.u32 	%r47, [%rd15];
+	mov.u64 	%rd32, %rd1;
 
 BB6_4:
-	mul.wide.u32 	%rd17, %r44, 4;
-	add.s64 	%rd18, %rd2, %rd17;
-	ld.global.f32 	%f31, [%rd18];
+	mul.wide.u32 	%rd16, %r47, 4;
+	add.s64 	%rd17, %rd2, %rd16;
+	ld.f32 	%f31, [%rd17];
 	min.f32 	%f44, %f44, %f31;
-	add.s32 	%r45, %r43, %r14;
-	setp.ge.u32	%p3, %r45, %r10;
+	add.s32 	%r48, %r46, %r13;
+	setp.ge.u32	%p3, %r48, %r10;
 	@%p3 bra 	BB6_8;
 
-	setp.eq.s64	%p4, %rd30, 0;
-	mov.u64 	%rd30, 0;
+	setp.eq.s64	%p4, %rd32, 0;
+	mov.u64 	%rd32, 0;
 	@%p4 bra 	BB6_7;
 
-	cvta.to.global.u64 	%rd20, %rd1;
-	add.s32 	%r19, %r43, %r14;
-	mul.wide.u32 	%rd21, %r19, 4;
-	add.s64 	%rd22, %rd20, %rd21;
-	ld.global.u32 	%r45, [%rd22];
-	mov.u64 	%rd30, %rd1;
+	add.s32 	%r19, %r46, %r13;
+	mul.wide.u32 	%rd19, %r19, 4;
+	add.s64 	%rd20, %rd1, %rd19;
+	ld.u32 	%r48, [%rd20];
+	mov.u64 	%rd32, %rd1;
 
 BB6_7:
-	mul.wide.u32 	%rd23, %r45, 4;
-	add.s64 	%rd24, %rd2, %rd23;
-	ld.global.f32 	%f32, [%rd24];
+	mul.wide.u32 	%rd21, %r48, 4;
+	add.s64 	%rd22, %rd2, %rd21;
+	ld.f32 	%f32, [%rd22];
 	min.f32 	%f44, %f44, %f32;
 
 BB6_8:
-	shl.b32 	%r21, %r14, 1;
+	shl.b32 	%r21, %r13, 1;
 	mov.u32 	%r22, %nctaid.x;
-	mad.lo.s32 	%r43, %r21, %r22, %r43;
-	setp.lt.u32	%p5, %r43, %r10;
+	mad.lo.s32 	%r46, %r21, %r22, %r46;
+	setp.lt.u32	%p5, %r46, %r10;
 	@%p5 bra 	BB6_2;
 
 BB6_9:
-	shl.b32 	%r24, %r11, 2;
+	shl.b32 	%r24, %r14, 2;
 	mov.u32 	%r25, memory;
 	add.s32 	%r9, %r25, %r24;
 	st.shared.f32 	[%r9], %f44;
 	bar.sync 	0;
-	setp.lt.u32	%p6, %r14, 1024;
+	setp.lt.u32	%p6, %r13, 1024;
 	@%p6 bra 	BB6_13;
 
-	setp.gt.u32	%p7, %r11, 511;
+	setp.gt.u32	%p7, %r14, 511;
 	@%p7 bra 	BB6_12;
 
 	ld.shared.f32 	%f33, [%r9+2048];
@@ -1081,10 +1224,10 @@ BB6_12:
 	bar.sync 	0;
 
 BB6_13:
-	setp.lt.u32	%p8, %r14, 512;
+	setp.lt.u32	%p8, %r13, 512;
 	@%p8 bra 	BB6_17;
 
-	setp.gt.u32	%p9, %r11, 255;
+	setp.gt.u32	%p9, %r14, 255;
 	@%p9 bra 	BB6_16;
 
 	ld.shared.f32 	%f34, [%r9+1024];
@@ -1095,10 +1238,10 @@ BB6_16:
 	bar.sync 	0;
 
 BB6_17:
-	setp.lt.u32	%p10, %r14, 256;
+	setp.lt.u32	%p10, %r13, 256;
 	@%p10 bra 	BB6_21;
 
-	setp.gt.u32	%p11, %r11, 127;
+	setp.gt.u32	%p11, %r14, 127;
 	@%p11 bra 	BB6_20;
 
 	ld.shared.f32 	%f35, [%r9+512];
@@ -1109,10 +1252,10 @@ BB6_20:
 	bar.sync 	0;
 
 BB6_21:
-	setp.lt.u32	%p12, %r14, 128;
+	setp.lt.u32	%p12, %r13, 128;
 	@%p12 bra 	BB6_25;
 
-	setp.gt.u32	%p13, %r11, 63;
+	setp.gt.u32	%p13, %r14, 63;
 	@%p13 bra 	BB6_24;
 
 	ld.shared.f32 	%f36, [%r9+256];
@@ -1123,10 +1266,10 @@ BB6_24:
 	bar.sync 	0;
 
 BB6_25:
-	setp.gt.u32	%p14, %r11, 31;
+	setp.gt.u32	%p14, %r14, 31;
 	@%p14 bra 	BB6_38;
 
-	setp.lt.u32	%p15, %r14, 64;
+	setp.lt.u32	%p15, %r13, 64;
 	@%p15 bra 	BB6_28;
 
 	ld.volatile.shared.f32 	%f37, [%r9+128];
@@ -1134,7 +1277,7 @@ BB6_25:
 	st.volatile.shared.f32 	[%r9], %f44;
 
 BB6_28:
-	setp.lt.u32	%p16, %r14, 32;
+	setp.lt.u32	%p16, %r13, 32;
 	@%p16 bra 	BB6_30;
 
 	ld.volatile.shared.f32 	%f38, [%r9+64];
@@ -1142,7 +1285,7 @@ BB6_28:
 	st.volatile.shared.f32 	[%r9], %f44;
 
 BB6_30:
-	setp.lt.u32	%p17, %r14, 16;
+	setp.lt.u32	%p17, %r13, 16;
 	@%p17 bra 	BB6_32;
 
 	ld.volatile.shared.f32 	%f39, [%r9+32];
@@ -1150,7 +1293,7 @@ BB6_30:
 	st.volatile.shared.f32 	[%r9], %f44;
 
 BB6_32:
-	setp.lt.u32	%p18, %r14, 8;
+	setp.lt.u32	%p18, %r13, 8;
 	@%p18 bra 	BB6_34;
 
 	ld.volatile.shared.f32 	%f40, [%r9+16];
@@ -1158,7 +1301,7 @@ BB6_32:
 	st.volatile.shared.f32 	[%r9], %f44;
 
 BB6_34:
-	setp.lt.u32	%p19, %r14, 4;
+	setp.lt.u32	%p19, %r13, 4;
 	@%p19 bra 	BB6_36;
 
 	ld.volatile.shared.f32 	%f41, [%r9+8];
@@ -1166,7 +1309,7 @@ BB6_34:
 	st.volatile.shared.f32 	[%r9], %f44;
 
 BB6_36:
-	setp.lt.u32	%p20, %r14, 2;
+	setp.lt.u32	%p20, %r13, 2;
 	@%p20 bra 	BB6_38;
 
 	ld.volatile.shared.f32 	%f42, [%r9+4];
@@ -1174,24 +1317,57 @@ BB6_36:
 	st.volatile.shared.f32 	[%r9], %f43;
 
 BB6_38:
-	setp.ne.s32	%p21, %r11, 0;
-	@%p21 bra 	BB6_42;
+	setp.ne.s32	%p21, %r14, 0;
+	@%p21 bra 	BB6_43;
 
 	ld.shared.f32 	%f28, [memory];
-	cvta.to.global.u64 	%rd25, %rd11;
-	ld.global.u64 	%rd26, [%rd25+16];
-	ld.global.u64 	%rd27, [%rd25+32];
-	cvta.to.global.u64 	%rd33, %rd27;
-	setp.ne.s64	%p22, %rd26, 0;
+	cvta.to.global.u64 	%rd23, %rd12;
+	add.s64 	%rd7, %rd23, 16;
+	ld.global.u64 	%rd24, [%rd23+16];
+	setp.eq.s64	%p22, %rd24, 0;
 	@%p22 bra 	BB6_41;
 
-	mul.wide.u32 	%rd28, %r12, 4;
-	add.s64 	%rd33, %rd33, %rd28;
+	mov.u32 	%r42, 0;
+	add.u64 	%rd25, %SP, 0;
+	add.u64 	%rd26, %SPL, 0;
+	st.local.u32 	[%rd26], %r42;
+	st.local.u32 	[%rd26+4], %r11;
+	mov.u64 	%rd27, $str;
+	cvta.global.u64 	%rd28, %rd27;
+	// Callseq Start 4
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd28;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd25;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r44, [retval0+0];
+	
+	//{
+	}// Callseq End 4
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd35, [%rd7+16];
+	bra.uni 	BB6_42;
 
 BB6_41:
-	st.global.f32 	[%rd33], %f28;
+	ld.global.u64 	%rd29, [%rd7+16];
+	mul.wide.u32 	%rd30, %r11, 4;
+	add.s64 	%rd35, %rd29, %rd30;
 
 BB6_42:
+	st.f32 	[%rd35], %f28;
+
+BB6_43:
 	ret;
 }
 
@@ -1202,85 +1378,87 @@ BB6_42:
 	.param .u32 reduce_min_d_param_2
 )
 {
+	.local .align 8 .b8 	__local_depot7[8];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
 	.reg .pred 	%p<23>;
-	.reg .b32 	%r<46>;
+	.reg .b32 	%r<49>;
 	.reg .f64 	%fd<60>;
-	.reg .b64 	%rd<34>;
+	.reg .b64 	%rd<36>;
 
 
-	ld.param.u64 	%rd10, [reduce_min_d_param_0];
-	ld.param.u64 	%rd11, [reduce_min_d_param_1];
+	mov.u64 	%SPL, __local_depot7;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd11, [reduce_min_d_param_0];
+	ld.param.u64 	%rd12, [reduce_min_d_param_1];
 	ld.param.u32 	%r10, [reduce_min_d_param_2];
-	mov.u32 	%r11, %tid.x;
-	mov.u32 	%r12, %ctaid.x;
-	shl.b32 	%r13, %r12, 1;
-	mov.u32 	%r14, %ntid.x;
-	mad.lo.s32 	%r43, %r13, %r14, %r11;
+	mov.u32 	%r11, %ctaid.x;
+	shl.b32 	%r12, %r11, 1;
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %tid.x;
+	mad.lo.s32 	%r46, %r12, %r13, %r14;
 	mov.f64 	%fd44, 0d7FF0000000000000;
-	setp.ge.u32	%p1, %r43, %r10;
+	setp.ge.u32	%p1, %r46, %r10;
 	@%p1 bra 	BB7_9;
 
-	cvta.to.global.u64 	%rd12, %rd10;
-	ld.global.u64 	%rd1, [%rd12+16];
-	ld.global.u64 	%rd13, [%rd12+32];
-	cvta.to.global.u64 	%rd2, %rd13;
+	cvta.to.global.u64 	%rd13, %rd11;
+	ld.global.u64 	%rd1, [%rd13+16];
+	ld.global.u64 	%rd2, [%rd13+32];
 	mov.f64 	%fd44, 0d7FF0000000000000;
-	mov.u64 	%rd30, %rd1;
+	mov.u64 	%rd32, %rd1;
 
 BB7_2:
 	setp.eq.s64	%p2, %rd1, 0;
-	mov.u32 	%r44, %r43;
+	mov.u32 	%r47, %r46;
 	@%p2 bra 	BB7_4;
 
-	cvta.to.global.u64 	%rd14, %rd1;
-	mul.wide.u32 	%rd15, %r43, 4;
-	add.s64 	%rd16, %rd14, %rd15;
-	ld.global.u32 	%r44, [%rd16];
-	mov.u64 	%rd30, %rd1;
+	mul.wide.u32 	%rd14, %r46, 4;
+	add.s64 	%rd15, %rd1, %rd14;
+	ld.u32 	%r47, [%rd15];
+	mov.u64 	%rd32, %rd1;
 
 BB7_4:
-	mul.wide.u32 	%rd17, %r44, 8;
-	add.s64 	%rd18, %rd2, %rd17;
-	ld.global.f64 	%fd31, [%rd18];
+	mul.wide.u32 	%rd16, %r47, 8;
+	add.s64 	%rd17, %rd2, %rd16;
+	ld.f64 	%fd31, [%rd17];
 	min.f64 	%fd44, %fd44, %fd31;
-	add.s32 	%r45, %r43, %r14;
-	setp.ge.u32	%p3, %r45, %r10;
+	add.s32 	%r48, %r46, %r13;
+	setp.ge.u32	%p3, %r48, %r10;
 	@%p3 bra 	BB7_8;
 
-	setp.eq.s64	%p4, %rd30, 0;
-	mov.u64 	%rd30, 0;
+	setp.eq.s64	%p4, %rd32, 0;
+	mov.u64 	%rd32, 0;
 	@%p4 bra 	BB7_7;
 
-	cvta.to.global.u64 	%rd20, %rd1;
-	add.s32 	%r19, %r43, %r14;
-	mul.wide.u32 	%rd21, %r19, 4;
-	add.s64 	%rd22, %rd20, %rd21;
-	ld.global.u32 	%r45, [%rd22];
-	mov.u64 	%rd30, %rd1;
+	add.s32 	%r19, %r46, %r13;
+	mul.wide.u32 	%rd19, %r19, 4;
+	add.s64 	%rd20, %rd1, %rd19;
+	ld.u32 	%r48, [%rd20];
+	mov.u64 	%rd32, %rd1;
 
 BB7_7:
-	mul.wide.u32 	%rd23, %r45, 8;
-	add.s64 	%rd24, %rd2, %rd23;
-	ld.global.f64 	%fd32, [%rd24];
+	mul.wide.u32 	%rd21, %r48, 8;
+	add.s64 	%rd22, %rd2, %rd21;
+	ld.f64 	%fd32, [%rd22];
 	min.f64 	%fd44, %fd44, %fd32;
 
 BB7_8:
-	shl.b32 	%r21, %r14, 1;
+	shl.b32 	%r21, %r13, 1;
 	mov.u32 	%r22, %nctaid.x;
-	mad.lo.s32 	%r43, %r21, %r22, %r43;
-	setp.lt.u32	%p5, %r43, %r10;
+	mad.lo.s32 	%r46, %r21, %r22, %r46;
+	setp.lt.u32	%p5, %r46, %r10;
 	@%p5 bra 	BB7_2;
 
 BB7_9:
-	shl.b32 	%r24, %r11, 3;
+	shl.b32 	%r24, %r14, 3;
 	mov.u32 	%r25, memory;
 	add.s32 	%r9, %r25, %r24;
 	st.shared.f64 	[%r9], %fd44;
 	bar.sync 	0;
-	setp.lt.u32	%p6, %r14, 1024;
+	setp.lt.u32	%p6, %r13, 1024;
 	@%p6 bra 	BB7_13;
 
-	setp.gt.u32	%p7, %r11, 511;
+	setp.gt.u32	%p7, %r14, 511;
 	@%p7 bra 	BB7_12;
 
 	ld.shared.f64 	%fd33, [%r9+4096];
@@ -1291,10 +1469,10 @@ BB7_12:
 	bar.sync 	0;
 
 BB7_13:
-	setp.lt.u32	%p8, %r14, 512;
+	setp.lt.u32	%p8, %r13, 512;
 	@%p8 bra 	BB7_17;
 
-	setp.gt.u32	%p9, %r11, 255;
+	setp.gt.u32	%p9, %r14, 255;
 	@%p9 bra 	BB7_16;
 
 	ld.shared.f64 	%fd34, [%r9+2048];
@@ -1305,10 +1483,10 @@ BB7_16:
 	bar.sync 	0;
 
 BB7_17:
-	setp.lt.u32	%p10, %r14, 256;
+	setp.lt.u32	%p10, %r13, 256;
 	@%p10 bra 	BB7_21;
 
-	setp.gt.u32	%p11, %r11, 127;
+	setp.gt.u32	%p11, %r14, 127;
 	@%p11 bra 	BB7_20;
 
 	ld.shared.f64 	%fd35, [%r9+1024];
@@ -1319,10 +1497,10 @@ BB7_20:
 	bar.sync 	0;
 
 BB7_21:
-	setp.lt.u32	%p12, %r14, 128;
+	setp.lt.u32	%p12, %r13, 128;
 	@%p12 bra 	BB7_25;
 
-	setp.gt.u32	%p13, %r11, 63;
+	setp.gt.u32	%p13, %r14, 63;
 	@%p13 bra 	BB7_24;
 
 	ld.shared.f64 	%fd36, [%r9+512];
@@ -1333,10 +1511,10 @@ BB7_24:
 	bar.sync 	0;
 
 BB7_25:
-	setp.gt.u32	%p14, %r11, 31;
+	setp.gt.u32	%p14, %r14, 31;
 	@%p14 bra 	BB7_38;
 
-	setp.lt.u32	%p15, %r14, 64;
+	setp.lt.u32	%p15, %r13, 64;
 	@%p15 bra 	BB7_28;
 
 	ld.volatile.shared.f64 	%fd37, [%r9+256];
@@ -1344,7 +1522,7 @@ BB7_25:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB7_28:
-	setp.lt.u32	%p16, %r14, 32;
+	setp.lt.u32	%p16, %r13, 32;
 	@%p16 bra 	BB7_30;
 
 	ld.volatile.shared.f64 	%fd38, [%r9+128];
@@ -1352,7 +1530,7 @@ BB7_28:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB7_30:
-	setp.lt.u32	%p17, %r14, 16;
+	setp.lt.u32	%p17, %r13, 16;
 	@%p17 bra 	BB7_32;
 
 	ld.volatile.shared.f64 	%fd39, [%r9+64];
@@ -1360,7 +1538,7 @@ BB7_30:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB7_32:
-	setp.lt.u32	%p18, %r14, 8;
+	setp.lt.u32	%p18, %r13, 8;
 	@%p18 bra 	BB7_34;
 
 	ld.volatile.shared.f64 	%fd40, [%r9+32];
@@ -1368,7 +1546,7 @@ BB7_32:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB7_34:
-	setp.lt.u32	%p19, %r14, 4;
+	setp.lt.u32	%p19, %r13, 4;
 	@%p19 bra 	BB7_36;
 
 	ld.volatile.shared.f64 	%fd41, [%r9+16];
@@ -1376,7 +1554,7 @@ BB7_34:
 	st.volatile.shared.f64 	[%r9], %fd44;
 
 BB7_36:
-	setp.lt.u32	%p20, %r14, 2;
+	setp.lt.u32	%p20, %r13, 2;
 	@%p20 bra 	BB7_38;
 
 	ld.volatile.shared.f64 	%fd42, [%r9+8];
@@ -1384,24 +1562,57 @@ BB7_36:
 	st.volatile.shared.f64 	[%r9], %fd43;
 
 BB7_38:
-	setp.ne.s32	%p21, %r11, 0;
-	@%p21 bra 	BB7_42;
+	setp.ne.s32	%p21, %r14, 0;
+	@%p21 bra 	BB7_43;
 
 	ld.shared.f64 	%fd28, [memory];
-	cvta.to.global.u64 	%rd25, %rd11;
-	ld.global.u64 	%rd26, [%rd25+16];
-	ld.global.u64 	%rd27, [%rd25+32];
-	cvta.to.global.u64 	%rd33, %rd27;
-	setp.ne.s64	%p22, %rd26, 0;
+	cvta.to.global.u64 	%rd23, %rd12;
+	add.s64 	%rd7, %rd23, 16;
+	ld.global.u64 	%rd24, [%rd23+16];
+	setp.eq.s64	%p22, %rd24, 0;
 	@%p22 bra 	BB7_41;
 
-	mul.wide.u32 	%rd28, %r12, 8;
-	add.s64 	%rd33, %rd33, %rd28;
+	mov.u32 	%r42, 0;
+	add.u64 	%rd25, %SP, 0;
+	add.u64 	%rd26, %SPL, 0;
+	st.local.u32 	[%rd26], %r42;
+	st.local.u32 	[%rd26+4], %r11;
+	mov.u64 	%rd27, $str;
+	cvta.global.u64 	%rd28, %rd27;
+	// Callseq Start 5
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd28;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd25;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r44, [retval0+0];
+	
+	//{
+	}// Callseq End 5
+	// inline asm
+	trap;
+	// inline asm
+	ld.global.u64 	%rd35, [%rd7+16];
+	bra.uni 	BB7_42;
 
 BB7_41:
-	st.global.f64 	[%rd33], %fd28;
+	ld.global.u64 	%rd29, [%rd7+16];
+	mul.wide.u32 	%rd30, %r11, 8;
+	add.s64 	%rd35, %rd29, %rd30;
 
 BB7_42:
+	st.f64 	[%rd35], %fd28;
+
+BB7_43:
 	ret;
 }