You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/08/24 21:41:34 UTC
[4/5] systemml git commit: [SYSTEMML-1793] Support matrix range
indexing on GPU
http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 7778317..f6ba15a 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
//
// Generated by NVIDIA NVVM Compiler
//
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
@@ -10,7 +10,7 @@
.target sm_30
.address_size 64
- // .globl copy_u2l_dense
+ // .globl slice_sparse_dense
.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
@@ -27,6 +27,86 @@
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
+.visible .entry slice_sparse_dense(    // each thread handles one row: copies that row's entries whose column lies in [param_6, param_7] into the f64 output array (parameter roles below are read off their usage)
+ .param .u64 slice_sparse_dense_param_0,    // f64 array, read at the same entry index as param_2 (entry values)
+ .param .u64 slice_sparse_dense_param_1,    // 32-bit array, read at [row] and [row+1] to get a row's entry range (looks like CSR row pointers - TODO confirm)
+ .param .u64 slice_sparse_dense_param_2,    // 32-bit array holding the column index of each entry
+ .param .u64 slice_sparse_dense_param_3,    // f64 output array, written at outRow * width + (col - param_6)
+ .param .u32 slice_sparse_dense_param_4,    // added to the thread index to obtain the source row (first row of the range)
+ .param .u32 slice_sparse_dense_param_5,    // last source row processed (inclusive)
+ .param .u32 slice_sparse_dense_param_6,    // lowest column kept (inclusive)
+ .param .u32 slice_sparse_dense_param_7    // highest column kept (inclusive)
+)
+{
+ .reg .pred %p<7>;
+ .reg .b32 %r<26>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<23>;
+
+
+ ld.param.u64 %rd9, [slice_sparse_dense_param_0];
+ ld.param.u64 %rd10, [slice_sparse_dense_param_1];
+ ld.param.u64 %rd11, [slice_sparse_dense_param_2];
+ ld.param.u64 %rd12, [slice_sparse_dense_param_3];
+ ld.param.u32 %r14, [slice_sparse_dense_param_4];
+ ld.param.u32 %r15, [slice_sparse_dense_param_5];
+ ld.param.u32 %r12, [slice_sparse_dense_param_6];
+ ld.param.u32 %r13, [slice_sparse_dense_param_7];
+ mov.u32 %r16, %ntid.x;
+ mov.u32 %r17, %ctaid.x;
+ mov.u32 %r18, %tid.x;
+ mad.lo.s32 %r1, %r16, %r17, %r18;    // %r1 = ntid.x * ctaid.x + tid.x: global thread index, also used as the output row
+ add.s32 %r2, %r1, %r14;    // %r2 = source row = thread index + param_4
+ setp.gt.s32 %p1, %r2, %r15;
+ @%p1 bra BB0_6;    // nothing to do when the source row is past param_5
+
+ cvta.to.global.u64 %rd13, %rd10;
+ mul.wide.s32 %rd14, %r2, 4;
+ add.s64 %rd1, %rd13, %rd14;    // %rd1 = &param_1[source row]
+ ld.global.u32 %r25, [%rd1];    // %r25 = index of the row's first entry (loop counter)
+ ld.global.u32 %r24, [%rd1+4];    // %r24 = index one past the row's last entry
+ setp.ge.s32 %p2, %r25, %r24;
+ @%p2 bra BB0_6;    // row has no entries
+
+ cvta.to.global.u64 %rd2, %rd12;
+ cvta.to.global.u64 %rd15, %rd11;
+ mov.u32 %r19, 1;
+ sub.s32 %r20, %r19, %r12;
+ add.s32 %r21, %r20, %r13;    // %r21 = param_7 - param_6 + 1: number of columns per output row
+ mul.lo.s32 %r22, %r1, %r21;    // start of this thread's output row
+ sub.s32 %r5, %r22, %r12;    // pre-subtract param_6 so that (%r5 + column) is the output element index
+ cvta.to.global.u64 %rd16, %rd9;
+ mul.wide.s32 %rd17, %r25, 8;
+ add.s64 %rd22, %rd16, %rd17;    // %rd22 = &param_0[first entry], advanced by 8 bytes per iteration
+ mul.wide.s32 %rd18, %r25, 4;
+ add.s64 %rd21, %rd15, %rd18;    // %rd21 = &param_2[first entry], advanced by 4 bytes per iteration
+
+BB0_3:    // loop over the entries of this row
+ ld.global.u32 %r8, [%rd21];    // %r8 = column index of the current entry
+ setp.lt.s32 %p3, %r8, %r12;
+ setp.gt.s32 %p4, %r8, %r13;
+ or.pred %p5, %p3, %p4;
+ @%p5 bra BB0_5;    // skip entries whose column is outside [param_6, param_7]
+
+ ld.global.f64 %fd1, [%rd22];
+ add.s32 %r23, %r5, %r8;    // output element index = outRow * width + (column - param_6)
+ mul.wide.s32 %rd19, %r23, 8;
+ add.s64 %rd20, %rd2, %rd19;
+ st.global.f64 [%rd20], %fd1;
+ ld.global.u32 %r24, [%rd1+4];    // row end bound is re-read after the store (presumably because the compiler cannot rule out aliasing with the output - TODO confirm)
+
+BB0_5:
+ add.s64 %rd22, %rd22, 8;
+ add.s64 %rd21, %rd21, 4;
+ add.s32 %r25, %r25, 1;
+ setp.lt.s32 %p6, %r25, %r24;
+ @%p6 bra BB0_3;    // continue while entries remain in this row
+
+BB0_6:
+ ret;
+}
+
+ // .globl copy_u2l_dense
.visible .entry copy_u2l_dense(
.param .u64 copy_u2l_dense_param_0,
.param .u32 copy_u2l_dense_param_1,
@@ -52,10 +132,10 @@
setp.gt.s32 %p1, %r9, %r8;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB0_2;
- bra.uni BB0_1;
+ @!%p3 bra BB1_2;
+ bra.uni BB1_1;
-BB0_1:
+BB1_1:
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
@@ -64,7 +144,7 @@ BB0_1:
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
-BB0_2:
+BB1_2:
ret;
}
@@ -94,10 +174,10 @@ BB0_2:
setp.lt.s32 %p1, %r2, %r4;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB1_2;
- bra.uni BB1_1;
+ @!%p3 bra BB2_2;
+ bra.uni BB2_1;
-BB1_1:
+BB2_1:
rem.s32 %r8, %r1, %r3;
cvta.to.global.u64 %rd3, %rd1;
mad.lo.s32 %r9, %r2, %r3, %r8;
@@ -110,7 +190,7 @@ BB1_1:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd3;
-BB1_2:
+BB2_2:
ret;
}
@@ -142,10 +222,10 @@ BB1_2:
setp.lt.s32 %p1, %r2, %r4;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB2_4;
- bra.uni BB2_1;
+ @!%p3 bra BB3_4;
+ bra.uni BB3_1;
-BB2_1:
+BB3_1:
rem.s32 %r8, %r1, %r3;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r9, %r2, %r3, %r8;
@@ -155,20 +235,20 @@ BB2_1:
ld.global.f64 %fd4, [%rd7];
mov.f64 %fd5, 0d0000000000000000;
setp.leu.f64 %p4, %fd4, 0d0000000000000000;
- @%p4 bra BB2_3;
+ @%p4 bra BB3_3;
cvta.to.global.u64 %rd8, %rd3;
shl.b64 %rd9, %rd1, 3;
add.s64 %rd10, %rd8, %rd9;
ld.global.f64 %fd5, [%rd10];
-BB2_3:
+BB3_3:
cvta.to.global.u64 %rd11, %rd4;
shl.b64 %rd12, %rd1, 3;
add.s64 %rd13, %rd11, %rd12;
st.global.f64 [%rd13], %fd5;
-BB2_4:
+BB3_4:
ret;
}
@@ -202,10 +282,10 @@ BB2_4:
setp.lt.s32 %p1, %r2, %r5;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB3_2;
- bra.uni BB3_1;
+ @!%p3 bra BB4_2;
+ bra.uni BB4_1;
-BB3_1:
+BB4_1:
rem.s32 %r9, %r1, %r3;
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r10, %r2, %r3, %r9;
@@ -222,7 +302,7 @@ BB3_1:
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
-BB3_2:
+BB4_2:
ret;
}
@@ -261,10 +341,10 @@ BB3_2:
setp.lt.s32 %p1, %r1, %r5;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB4_4;
- bra.uni BB4_1;
+ @!%p3 bra BB5_4;
+ bra.uni BB5_1;
-BB4_1:
+BB5_1:
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
@@ -273,25 +353,25 @@ BB4_1:
ld.global.f64 %fd1, [%rd9];
add.s64 %rd2, %rd6, %rd8;
setp.eq.s32 %p4, %r4, 1;
- @%p4 bra BB4_3;
- bra.uni BB4_2;
+ @%p4 bra BB5_3;
+ bra.uni BB5_2;
-BB4_3:
+BB5_3:
mul.wide.s32 %rd12, %r2, 8;
add.s64 %rd13, %rd1, %rd12;
ld.global.f64 %fd5, [%rd13];
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
st.global.f64 [%rd2], %fd6;
- bra.uni BB4_4;
+ bra.uni BB5_4;
-BB4_2:
+BB5_2:
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd1, %rd10;
ld.global.f64 %fd3, [%rd11];
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
st.global.f64 [%rd2], %fd4;
-BB4_4:
+BB5_4:
ret;
}
@@ -325,10 +405,10 @@ BB4_4:
setp.lt.s32 %p1, %r2, %r5;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB5_2;
- bra.uni BB5_1;
+ @!%p3 bra BB6_2;
+ bra.uni BB6_1;
-BB5_1:
+BB6_1:
rem.s32 %r9, %r1, %r3;
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r10, %r2, %r3, %r9;
@@ -345,7 +425,7 @@ BB5_1:
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
-BB5_2:
+BB6_2:
ret;
}
@@ -387,10 +467,10 @@ BB5_2:
setp.lt.s32 %p1, %r8, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB6_6;
- bra.uni BB6_1;
+ @!%p3 bra BB7_6;
+ bra.uni BB7_1;
-BB6_1:
+BB7_1:
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
@@ -400,26 +480,26 @@ BB6_1:
setp.lt.f64 %p4, %fd8, %fd3;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
- @%p4 bra BB6_5;
- bra.uni BB6_2;
+ @%p4 bra BB7_5;
+ bra.uni BB7_2;
-BB6_5:
+BB7_5:
st.global.f64 [%rd1], %fd4;
- bra.uni BB6_6;
+ bra.uni BB7_6;
-BB6_2:
+BB7_2:
setp.lt.f64 %p5, %fd1, %fd2;
- @%p5 bra BB6_4;
- bra.uni BB6_3;
+ @%p5 bra BB7_4;
+ bra.uni BB7_3;
-BB6_4:
+BB7_4:
st.global.f64 [%rd1], %fd5;
- bra.uni BB6_6;
+ bra.uni BB7_6;
-BB6_3:
+BB7_3:
st.global.f64 [%rd1], %fd6;
-BB6_6:
+BB7_6:
ret;
}
@@ -435,9 +515,9 @@ BB6_6:
.param .u32 matrix_matrix_cellwise_op_param_7
)
{
- .reg .pred %p<73>;
- .reg .b32 %r<66>;
- .reg .f64 %fd<56>;
+ .reg .pred %p<77>;
+ .reg .b32 %r<65>;
+ .reg .f64 %fd<55>;
.reg .b64 %rd<19>;
@@ -458,93 +538,93 @@ BB6_6:
setp.lt.s32 %p2, %r1, %r14;
setp.gt.s32 %p3, %r10, -1;
and.pred %p4, %p2, %p3;
- @!%p4 bra BB7_77;
- bra.uni BB7_1;
+ @!%p4 bra BB8_73;
+ bra.uni BB8_1;
-BB7_1:
+BB8_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
- mov.u32 %r64, %r1;
- @%p5 bra BB7_5;
+ mov.u32 %r63, %r1;
+ @%p5 bra BB8_5;
setp.ne.s32 %p6, %r11, 2;
- mov.u32 %r65, %r3;
- @%p6 bra BB7_4;
+ mov.u32 %r64, %r3;
+ @%p6 bra BB8_4;
- mov.u32 %r65, %r2;
+ mov.u32 %r64, %r2;
-BB7_4:
- mov.u32 %r59, %r65;
- mov.u32 %r4, %r59;
- mov.u32 %r64, %r4;
+BB8_4:
+ mov.u32 %r58, %r64;
+ mov.u32 %r4, %r58;
+ mov.u32 %r63, %r4;
-BB7_5:
- mov.u32 %r5, %r64;
+BB8_5:
+ mov.u32 %r5, %r63;
setp.eq.s32 %p7, %r12, 1;
- mov.u32 %r62, %r1;
- @%p7 bra BB7_9;
+ mov.u32 %r61, %r1;
+ @%p7 bra BB8_9;
setp.ne.s32 %p8, %r12, 2;
- mov.u32 %r63, %r3;
- @%p8 bra BB7_8;
+ mov.u32 %r62, %r3;
+ @%p8 bra BB8_8;
- mov.u32 %r63, %r2;
+ mov.u32 %r62, %r2;
-BB7_8:
- mov.u32 %r62, %r63;
+BB8_8:
+ mov.u32 %r61, %r62;
-BB7_9:
+BB8_9:
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r5, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
- mul.wide.s32 %rd9, %r62, 8;
+ mul.wide.s32 %rd9, %r61, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
- mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r13, 8;
- @%p9 bra BB7_26;
+ @%p9 bra BB8_26;
setp.gt.s32 %p23, %r13, 3;
- @%p23 bra BB7_18;
+ @%p23 bra BB8_18;
setp.gt.s32 %p30, %r13, 1;
- @%p30 bra BB7_15;
+ @%p30 bra BB8_15;
setp.eq.s32 %p33, %r13, 0;
- @%p33 bra BB7_75;
- bra.uni BB7_13;
+ @%p33 bra BB8_71;
+ bra.uni BB8_13;
-BB7_75:
- add.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_71:
+ add.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_26:
+BB8_26:
setp.gt.s32 %p10, %r13, 13;
- @%p10 bra BB7_35;
+ @%p10 bra BB8_35;
setp.gt.s32 %p17, %r13, 10;
- @%p17 bra BB7_31;
+ @%p17 bra BB8_31;
setp.eq.s32 %p21, %r13, 9;
- @%p21 bra BB7_55;
- bra.uni BB7_29;
+ @%p21 bra BB8_53;
+ bra.uni BB8_29;
-BB7_55:
- setp.eq.f64 %p48, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
- bra.uni BB7_76;
+BB8_53:
+ setp.eq.f64 %p50, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
+ bra.uni BB8_72;
-BB7_18:
+BB8_18:
setp.gt.s32 %p24, %r13, 5;
- @%p24 bra BB7_22;
+ @%p24 bra BB8_22;
setp.eq.s32 %p28, %r13, 4;
- @%p28 bra BB7_58;
- bra.uni BB7_20;
+ @%p28 bra BB8_56;
+ bra.uni BB8_20;
-BB7_58:
+BB8_56:
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
@@ -557,7 +637,7 @@ BB7_58:
add.s32 %r32, %r31, -1012;
mov.b64 %rd15, %fd2;
shl.b64 %rd1, %rd15, %r32;
- setp.eq.s64 %p53, %rd1, -9223372036854775808;
+ setp.eq.s64 %p55, %rd1, -9223372036854775808;
abs.f64 %fd19, %fd1;
// Callseq Start 0
{
@@ -574,342 +654,340 @@ BB7_58:
param0,
param1
);
- ld.param.f64 %fd54, [retval0+0];
+ ld.param.f64 %fd53, [retval0+0];
//{
}// Callseq End 0
- setp.lt.s32 %p54, %r8, 0;
- and.pred %p1, %p54, %p53;
- @!%p1 bra BB7_60;
- bra.uni BB7_59;
+ setp.lt.s32 %p56, %r8, 0;
+ and.pred %p1, %p56, %p55;
+ @!%p1 bra BB8_58;
+ bra.uni BB8_57;
-BB7_59:
+BB8_57:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r33}, %fd54;
+ mov.b64 {%temp, %r33}, %fd53;
}
xor.b32 %r34, %r33, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r35, %temp}, %fd54;
+ mov.b64 {%r35, %temp}, %fd53;
}
- mov.b64 %fd54, {%r35, %r34};
+ mov.b64 %fd53, {%r35, %r34};
-BB7_60:
- mov.f64 %fd53, %fd54;
- setp.eq.f64 %p55, %fd1, 0d0000000000000000;
- @%p55 bra BB7_63;
- bra.uni BB7_61;
+BB8_58:
+ mov.f64 %fd52, %fd53;
+ setp.eq.f64 %p57, %fd1, 0d0000000000000000;
+ @%p57 bra BB8_61;
+ bra.uni BB8_59;
-BB7_63:
- selp.b32 %r36, %r8, 0, %p53;
+BB8_61:
+ selp.b32 %r36, %r8, 0, %p55;
or.b32 %r37, %r36, 2146435072;
- setp.lt.s32 %p59, %r9, 0;
- selp.b32 %r38, %r37, %r36, %p59;
+ setp.lt.s32 %p61, %r9, 0;
+ selp.b32 %r38, %r37, %r36, %p61;
mov.u32 %r39, 0;
- mov.b64 %fd53, {%r39, %r38};
- bra.uni BB7_64;
+ mov.b64 %fd52, {%r39, %r38};
+ bra.uni BB8_62;
-BB7_35:
+BB8_35:
setp.gt.s32 %p11, %r13, 15;
- @%p11 bra BB7_39;
+ @%p11 bra BB8_39;
setp.eq.s32 %p15, %r13, 14;
- @%p15 bra BB7_52;
- bra.uni BB7_37;
+ @%p15 bra BB8_50;
+ bra.uni BB8_37;
-BB7_52:
+BB8_50:
cvt.rni.s64.f64 %rd11, %fd1;
cvt.rni.s64.f64 %rd12, %fd2;
cvt.u32.u64 %r25, %rd11;
cvt.u32.u64 %r26, %rd12;
or.b32 %r27, %r26, %r25;
- setp.eq.s32 %p45, %r27, 0;
- selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
- bra.uni BB7_76;
+ setp.eq.s32 %p47, %r27, 0;
+ selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
+ bra.uni BB8_72;
-BB7_15:
+BB8_15:
setp.eq.s32 %p31, %r13, 2;
- @%p31 bra BB7_74;
- bra.uni BB7_16;
+ @%p31 bra BB8_70;
+ bra.uni BB8_16;
-BB7_74:
- mul.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_70:
+ mul.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_31:
+BB8_31:
setp.eq.s32 %p18, %r13, 11;
- @%p18 bra BB7_54;
+ @%p18 bra BB8_52;
setp.eq.s32 %p19, %r13, 12;
- @%p19 bra BB7_53;
- bra.uni BB7_33;
+ @%p19 bra BB8_51;
+ bra.uni BB8_33;
-BB7_53:
- max.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_51:
+ max.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_22:
+BB8_22:
setp.eq.s32 %p25, %r13, 6;
- @%p25 bra BB7_57;
+ @%p25 bra BB8_55;
setp.eq.s32 %p26, %r13, 7;
- @%p26 bra BB7_56;
- bra.uni BB7_24;
+ @%p26 bra BB8_54;
+ bra.uni BB8_24;
-BB7_56:
- setp.gt.f64 %p50, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
- bra.uni BB7_76;
+BB8_54:
+ setp.gt.f64 %p52, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
+ bra.uni BB8_72;
-BB7_39:
+BB8_39:
setp.eq.s32 %p12, %r13, 16;
- @%p12 bra BB7_51;
+ @%p12 bra BB8_49;
setp.eq.s32 %p13, %r13, 17;
- @%p13 bra BB7_46;
- bra.uni BB7_41;
+ @%p13 bra BB8_45;
+ bra.uni BB8_41;
-BB7_46:
- setp.eq.f64 %p38, %fd2, 0d0000000000000000;
- setp.eq.f64 %p39, %fd2, 0d8000000000000000;
- or.pred %p40, %p38, %p39;
- mov.f64 %fd55, 0d7FF8000000000000;
- @%p40 bra BB7_76;
+BB8_45:
+ setp.eq.f64 %p39, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p40, %fd2, 0d8000000000000000;
+ or.pred %p41, %p39, %p40;
+ mov.f64 %fd54, 0d7FF8000000000000;
+ @%p41 bra BB8_72;
- div.rn.f64 %fd55, %fd1, %fd2;
- abs.f64 %fd39, %fd55;
- setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000;
- @%p41 bra BB7_76;
+ div.rn.f64 %fd54, %fd1, %fd2;
+ abs.f64 %fd39, %fd54;
+ setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000;
+ @%p42 bra BB8_72;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r22}, %fd55;
+ mov.b64 {%r22, %temp}, %fd54;
}
- and.b32 %r23, %r22, 2147483647;
- setp.ne.s32 %p42, %r23, 2146435072;
- @%p42 bra BB7_50;
-
{
.reg .b32 %temp;
- mov.b64 {%r24, %temp}, %fd55;
+ mov.b64 {%temp, %r23}, %fd54;
}
- setp.eq.s32 %p43, %r24, 0;
- @%p43 bra BB7_76;
+ and.b32 %r24, %r23, 2147483647;
+ setp.ne.s32 %p43, %r24, 2146435072;
+ setp.ne.s32 %p44, %r22, 0;
+ or.pred %p45, %p43, %p44;
+ @!%p45 bra BB8_72;
+ bra.uni BB8_48;
-BB7_50:
- cvt.rmi.f64.f64 %fd40, %fd55;
+BB8_48:
+ cvt.rmi.f64.f64 %fd40, %fd54;
mul.f64 %fd41, %fd2, %fd40;
- sub.f64 %fd55, %fd1, %fd41;
- bra.uni BB7_76;
+ sub.f64 %fd54, %fd1, %fd41;
+ bra.uni BB8_72;
-BB7_13:
+BB8_13:
setp.eq.s32 %p34, %r13, 1;
- @%p34 bra BB7_14;
- bra.uni BB7_76;
+ @%p34 bra BB8_14;
+ bra.uni BB8_72;
-BB7_14:
- sub.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_14:
+ sub.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_29:
+BB8_29:
setp.eq.s32 %p22, %r13, 10;
- @%p22 bra BB7_30;
- bra.uni BB7_76;
+ @%p22 bra BB8_30;
+ bra.uni BB8_72;
-BB7_30:
- setp.neu.f64 %p47, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
- bra.uni BB7_76;
+BB8_30:
+ setp.neu.f64 %p49, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
+ bra.uni BB8_72;
-BB7_20:
+BB8_20:
setp.eq.s32 %p29, %r13, 5;
- @%p29 bra BB7_21;
- bra.uni BB7_76;
+ @%p29 bra BB8_21;
+ bra.uni BB8_72;
-BB7_21:
- setp.lt.f64 %p52, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
- bra.uni BB7_76;
+BB8_21:
+ setp.lt.f64 %p54, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
+ bra.uni BB8_72;
-BB7_37:
+BB8_37:
setp.eq.s32 %p16, %r13, 15;
- @%p16 bra BB7_38;
- bra.uni BB7_76;
+ @%p16 bra BB8_38;
+ bra.uni BB8_72;
-BB7_38:
+BB8_38:
mul.f64 %fd43, %fd1, %fd2;
mov.f64 %fd44, 0d3FF0000000000000;
- sub.f64 %fd55, %fd44, %fd43;
- bra.uni BB7_76;
+ sub.f64 %fd54, %fd44, %fd43;
+ bra.uni BB8_72;
-BB7_16:
+BB8_16:
setp.eq.s32 %p32, %r13, 3;
- @%p32 bra BB7_17;
- bra.uni BB7_76;
+ @%p32 bra BB8_17;
+ bra.uni BB8_72;
-BB7_17:
- div.rn.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_17:
+ div.rn.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_54:
- min.f64 %fd55, %fd1, %fd2;
- bra.uni BB7_76;
+BB8_52:
+ min.f64 %fd54, %fd1, %fd2;
+ bra.uni BB8_72;
-BB7_33:
+BB8_33:
setp.eq.s32 %p20, %r13, 13;
- @%p20 bra BB7_34;
- bra.uni BB7_76;
+ @%p20 bra BB8_34;
+ bra.uni BB8_72;
-BB7_34:
+BB8_34:
cvt.rni.s64.f64 %rd13, %fd1;
cvt.rni.s64.f64 %rd14, %fd2;
cvt.u32.u64 %r28, %rd13;
cvt.u32.u64 %r29, %rd14;
and.b32 %r30, %r29, %r28;
- setp.eq.s32 %p46, %r30, 0;
- selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
- bra.uni BB7_76;
+ setp.eq.s32 %p48, %r30, 0;
+ selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
+ bra.uni BB8_72;
-BB7_57:
- setp.le.f64 %p51, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
- bra.uni BB7_76;
+BB8_55:
+ setp.le.f64 %p53, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
+ bra.uni BB8_72;
-BB7_24:
+BB8_24:
setp.eq.s32 %p27, %r13, 8;
- @%p27 bra BB7_25;
- bra.uni BB7_76;
+ @%p27 bra BB8_25;
+ bra.uni BB8_72;
-BB7_25:
- setp.ge.f64 %p49, %fd1, %fd2;
- selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
- bra.uni BB7_76;
+BB8_25:
+ setp.ge.f64 %p51, %fd1, %fd2;
+ selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
+ bra.uni BB8_72;
-BB7_51:
- setp.neu.f64 %p44, %fd1, 0d0000000000000000;
+BB8_49:
+ setp.neu.f64 %p46, %fd1, 0d0000000000000000;
sub.f64 %fd42, %fd1, %fd2;
- selp.f64 %fd55, %fd42, 0d0000000000000000, %p44;
- bra.uni BB7_76;
+ selp.f64 %fd54, %fd42, 0d0000000000000000, %p46;
+ bra.uni BB8_72;
-BB7_41:
+BB8_41:
setp.ne.s32 %p14, %r13, 18;
- @%p14 bra BB7_76;
+ @%p14 bra BB8_72;
- div.rn.f64 %fd55, %fd1, %fd2;
- abs.f64 %fd37, %fd55;
+ div.rn.f64 %fd54, %fd1, %fd2;
+ abs.f64 %fd37, %fd54;
setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000;
- @%p35 bra BB7_76;
+ @%p35 bra BB8_72;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r19}, %fd55;
+ mov.b64 {%r19, %temp}, %fd54;
}
- and.b32 %r20, %r19, 2147483647;
- setp.ne.s32 %p36, %r20, 2146435072;
- @%p36 bra BB7_45;
-
{
.reg .b32 %temp;
- mov.b64 {%r21, %temp}, %fd55;
+ mov.b64 {%temp, %r20}, %fd54;
}
- setp.eq.s32 %p37, %r21, 0;
- @%p37 bra BB7_76;
+ and.b32 %r21, %r20, 2147483647;
+ setp.ne.s32 %p36, %r21, 2146435072;
+ setp.ne.s32 %p37, %r19, 0;
+ or.pred %p38, %p36, %p37;
+ @!%p38 bra BB8_72;
+ bra.uni BB8_44;
-BB7_45:
- cvt.rmi.f64.f64 %fd55, %fd55;
- bra.uni BB7_76;
+BB8_44:
+ cvt.rmi.f64.f64 %fd54, %fd54;
+ bra.uni BB8_72;
-BB7_61:
- setp.gt.s32 %p56, %r8, -1;
- @%p56 bra BB7_64;
+BB8_59:
+ setp.gt.s32 %p58, %r8, -1;
+ @%p58 bra BB8_62;
cvt.rzi.f64.f64 %fd45, %fd2;
- setp.neu.f64 %p57, %fd45, %fd2;
- selp.f64 %fd53, 0dFFF8000000000000, %fd53, %p57;
+ setp.neu.f64 %p59, %fd45, %fd2;
+ selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59;
-BB7_64:
- mov.f64 %fd25, %fd53;
+BB8_62:
+ mov.f64 %fd25, %fd52;
add.f64 %fd26, %fd1, %fd2;
{
.reg .b32 %temp;
mov.b64 {%temp, %r40}, %fd26;
}
and.b32 %r41, %r40, 2146435072;
- setp.ne.s32 %p60, %r41, 2146435072;
- mov.f64 %fd52, %fd25;
- @%p60 bra BB7_73;
-
- setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000;
- mov.f64 %fd52, %fd26;
- @%p61 bra BB7_73;
+ setp.ne.s32 %p62, %r41, 2146435072;
+ mov.f64 %fd51, %fd25;
+ @%p62 bra BB8_69;
- abs.f64 %fd46, %fd2;
- setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000;
+ setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000;
mov.f64 %fd51, %fd26;
- mov.f64 %fd52, %fd51;
- @%p62 bra BB7_73;
+ @%p63 bra BB8_69;
- and.b32 %r42, %r9, 2147483647;
- setp.ne.s32 %p63, %r42, 2146435072;
- @%p63 bra BB7_69;
+ abs.f64 %fd46, %fd2;
+ setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000;
+ mov.f64 %fd50, %fd26;
+ mov.f64 %fd51, %fd50;
+ @%p64 bra BB8_69;
{
.reg .b32 %temp;
- mov.b64 {%r43, %temp}, %fd2;
+ mov.b64 {%r42, %temp}, %fd2;
}
- setp.eq.s32 %p64, %r43, 0;
- @%p64 bra BB7_72;
+ and.b32 %r43, %r9, 2147483647;
+ setp.eq.s32 %p65, %r43, 2146435072;
+ setp.eq.s32 %p66, %r42, 0;
+ and.pred %p67, %p65, %p66;
+ @%p67 bra BB8_68;
+ bra.uni BB8_66;
-BB7_69:
- and.b32 %r44, %r8, 2147483647;
- setp.ne.s32 %p65, %r44, 2146435072;
- mov.f64 %fd49, %fd25;
- mov.f64 %fd52, %fd49;
- @%p65 bra BB7_73;
+BB8_68:
+ setp.gt.f64 %p71, %fd19, 0d3FF0000000000000;
+ selp.b32 %r51, 2146435072, 0, %p71;
+ xor.b32 %r52, %r51, 2146435072;
+ setp.lt.s32 %p72, %r9, 0;
+ selp.b32 %r53, %r52, %r51, %p72;
+ setp.eq.f64 %p73, %fd1, 0dBFF0000000000000;
+ selp.b32 %r54, 1072693248, %r53, %p73;
+ mov.u32 %r55, 0;
+ mov.b64 %fd51, {%r55, %r54};
+ bra.uni BB8_69;
+BB8_66:
{
.reg .b32 %temp;
- mov.b64 {%r45, %temp}, %fd1;
+ mov.b64 {%r44, %temp}, %fd1;
}
- setp.ne.s32 %p66, %r45, 0;
- mov.f64 %fd52, %fd25;
- @%p66 bra BB7_73;
+ and.b32 %r45, %r8, 2147483647;
+ setp.eq.s32 %p68, %r45, 2146435072;
+ setp.eq.s32 %p69, %r44, 0;
+ and.pred %p70, %p68, %p69;
+ mov.f64 %fd51, %fd25;
+ @!%p70 bra BB8_69;
+ bra.uni BB8_67;
+BB8_67:
shr.s32 %r46, %r9, 31;
and.b32 %r47, %r46, -2146435072;
- add.s32 %r48, %r47, 2146435072;
- or.b32 %r49, %r48, -2147483648;
- selp.b32 %r50, %r49, %r48, %p1;
- mov.u32 %r51, 0;
- mov.b64 %fd52, {%r51, %r50};
- bra.uni BB7_73;
-
-BB7_72:
- setp.gt.f64 %p67, %fd19, 0d3FF0000000000000;
- selp.b32 %r52, 2146435072, 0, %p67;
- xor.b32 %r53, %r52, 2146435072;
- setp.lt.s32 %p68, %r9, 0;
- selp.b32 %r54, %r53, %r52, %p68;
- setp.eq.f64 %p69, %fd1, 0dBFF0000000000000;
- selp.b32 %r55, 1072693248, %r54, %p69;
- mov.u32 %r56, 0;
- mov.b64 %fd52, {%r56, %r55};
-
-BB7_73:
- setp.eq.f64 %p70, %fd2, 0d0000000000000000;
- setp.eq.f64 %p71, %fd1, 0d3FF0000000000000;
- or.pred %p72, %p71, %p70;
- selp.f64 %fd55, 0d3FF0000000000000, %fd52, %p72;
+ selp.b32 %r48, -1048576, 2146435072, %p1;
+ add.s32 %r49, %r48, %r47;
+ mov.u32 %r50, 0;
+ mov.b64 %fd51, {%r50, %r49};
-BB7_76:
+BB8_69:
+ setp.eq.f64 %p74, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p75, %fd1, 0d3FF0000000000000;
+ or.pred %p76, %p75, %p74;
+ selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76;
+
+BB8_72:
cvta.to.global.u64 %rd16, %rd4;
mul.wide.s32 %rd17, %r3, 8;
add.s64 %rd18, %rd16, %rd17;
- st.global.f64 [%rd18], %fd55;
+ st.global.f64 [%rd18], %fd54;
bar.sync 0;
-BB7_77:
+BB8_73:
ret;
}
@@ -923,9 +1001,9 @@ BB7_77:
.param .u32 matrix_scalar_op_param_5
)
{
- .reg .pred %p<133>;
- .reg .b32 %r<88>;
- .reg .f64 %fd<109>;
+ .reg .pred %p<141>;
+ .reg .b32 %r<86>;
+ .reg .f64 %fd<107>;
.reg .b64 %rd<20>;
@@ -940,7 +1018,7 @@ BB7_77:
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r9, %r10, %r11;
setp.ge.s32 %p3, %r1, %r8;
- @%p3 bra BB8_138;
+ @%p3 bra BB9_130;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
@@ -949,86 +1027,86 @@ BB7_77:
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
- @%p4 bra BB8_70;
+ @%p4 bra BB9_66;
- mov.f64 %fd99, 0d7FEFFFFFFFFFFFFF;
+ mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 8;
- @%p5 bra BB8_19;
+ @%p5 bra BB9_19;
setp.gt.s32 %p19, %r6, 3;
- @%p19 bra BB8_11;
+ @%p19 bra BB9_11;
setp.gt.s32 %p26, %r6, 1;
- @%p26 bra BB8_8;
+ @%p26 bra BB9_8;
setp.eq.s32 %p29, %r6, 0;
- @%p29 bra BB8_68;
- bra.uni BB8_6;
+ @%p29 bra BB9_64;
+ bra.uni BB9_6;
-BB8_68:
- add.f64 %fd99, %fd1, %fd68;
- bra.uni BB8_69;
+BB9_64:
+ add.f64 %fd98, %fd1, %fd68;
+ bra.uni BB9_65;
-BB8_70:
- mov.f64 %fd108, 0d7FEFFFFFFFFFFFFF;
- setp.gt.s32 %p69, %r6, 8;
- @%p69 bra BB8_87;
+BB9_66:
+ mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF;
+ setp.gt.s32 %p73, %r6, 8;
+ @%p73 bra BB9_83;
- setp.gt.s32 %p83, %r6, 3;
- @%p83 bra BB8_79;
+ setp.gt.s32 %p87, %r6, 3;
+ @%p87 bra BB9_75;
- setp.gt.s32 %p90, %r6, 1;
- @%p90 bra BB8_76;
+ setp.gt.s32 %p94, %r6, 1;
+ @%p94 bra BB9_72;
- setp.eq.s32 %p93, %r6, 0;
- @%p93 bra BB8_136;
- bra.uni BB8_74;
+ setp.eq.s32 %p97, %r6, 0;
+ @%p97 bra BB9_128;
+ bra.uni BB9_70;
-BB8_136:
- add.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
+BB9_128:
+ add.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
-BB8_19:
+BB9_19:
setp.gt.s32 %p6, %r6, 13;
- @%p6 bra BB8_28;
+ @%p6 bra BB9_28;
setp.gt.s32 %p13, %r6, 10;
- @%p13 bra BB8_24;
+ @%p13 bra BB9_24;
setp.eq.s32 %p17, %r6, 9;
- @%p17 bra BB8_48;
- bra.uni BB8_22;
+ @%p17 bra BB9_46;
+ bra.uni BB9_22;
-BB8_48:
- setp.eq.f64 %p44, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
- bra.uni BB8_69;
+BB9_46:
+ setp.eq.f64 %p46, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
+ bra.uni BB9_65;
-BB8_87:
- setp.gt.s32 %p70, %r6, 13;
- @%p70 bra BB8_96;
+BB9_83:
+ setp.gt.s32 %p74, %r6, 13;
+ @%p74 bra BB9_92;
- setp.gt.s32 %p77, %r6, 10;
- @%p77 bra BB8_92;
+ setp.gt.s32 %p81, %r6, 10;
+ @%p81 bra BB9_88;
- setp.eq.s32 %p81, %r6, 9;
- @%p81 bra BB8_116;
- bra.uni BB8_90;
+ setp.eq.s32 %p85, %r6, 9;
+ @%p85 bra BB9_110;
+ bra.uni BB9_86;
-BB8_116:
- setp.eq.f64 %p108, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
- bra.uni BB8_137;
+BB9_110:
+ setp.eq.f64 %p114, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
+ bra.uni BB9_129;
-BB8_11:
+BB9_11:
setp.gt.s32 %p20, %r6, 5;
- @%p20 bra BB8_15;
+ @%p20 bra BB9_15;
setp.eq.s32 %p24, %r6, 4;
- @%p24 bra BB8_51;
- bra.uni BB8_13;
+ @%p24 bra BB9_49;
+ bra.uni BB9_13;
-BB8_51:
+BB9_49:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd68;
@@ -1041,7 +1119,7 @@ BB8_51:
add.s32 %r25, %r24, -1012;
mov.b64 %rd14, %fd1;
shl.b64 %rd2, %rd14, %r25;
- setp.eq.s64 %p49, %rd2, -9223372036854775808;
+ setp.eq.s64 %p51, %rd2, -9223372036854775808;
abs.f64 %fd18, %fd68;
// Callseq Start 1
{
@@ -1058,69 +1136,69 @@ BB8_51:
param0,
param1
);
- ld.param.f64 %fd98, [retval0+0];
+ ld.param.f64 %fd97, [retval0+0];
//{
}// Callseq End 1
- setp.lt.s32 %p50, %r2, 0;
- and.pred %p1, %p50, %p49;
- @!%p1 bra BB8_53;
- bra.uni BB8_52;
+ setp.lt.s32 %p52, %r2, 0;
+ and.pred %p1, %p52, %p51;
+ @!%p1 bra BB9_51;
+ bra.uni BB9_50;
-BB8_52:
+BB9_50:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r26}, %fd98;
+ mov.b64 {%temp, %r26}, %fd97;
}
xor.b32 %r27, %r26, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r28, %temp}, %fd98;
+ mov.b64 {%r28, %temp}, %fd97;
}
- mov.b64 %fd98, {%r28, %r27};
+ mov.b64 %fd97, {%r28, %r27};
-BB8_53:
- mov.f64 %fd97, %fd98;
- setp.eq.f64 %p51, %fd68, 0d0000000000000000;
- @%p51 bra BB8_56;
- bra.uni BB8_54;
+BB9_51:
+ mov.f64 %fd96, %fd97;
+ setp.eq.f64 %p53, %fd68, 0d0000000000000000;
+ @%p53 bra BB9_54;
+ bra.uni BB9_52;
-BB8_56:
- selp.b32 %r29, %r2, 0, %p49;
+BB9_54:
+ selp.b32 %r29, %r2, 0, %p51;
or.b32 %r30, %r29, 2146435072;
- setp.lt.s32 %p55, %r3, 0;
- selp.b32 %r31, %r30, %r29, %p55;
+ setp.lt.s32 %p57, %r3, 0;
+ selp.b32 %r31, %r30, %r29, %p57;
mov.u32 %r32, 0;
- mov.b64 %fd97, {%r32, %r31};
- bra.uni BB8_57;
+ mov.b64 %fd96, {%r32, %r31};
+ bra.uni BB9_55;
-BB8_28:
+BB9_28:
setp.gt.s32 %p7, %r6, 15;
- @%p7 bra BB8_32;
+ @%p7 bra BB9_32;
setp.eq.s32 %p11, %r6, 14;
- @%p11 bra BB8_45;
- bra.uni BB8_30;
+ @%p11 bra BB9_43;
+ bra.uni BB9_30;
-BB8_45:
+BB9_43:
cvt.rni.s64.f64 %rd10, %fd68;
cvt.rni.s64.f64 %rd11, %fd1;
cvt.u32.u64 %r18, %rd10;
cvt.u32.u64 %r19, %rd11;
or.b32 %r20, %r19, %r18;
- setp.eq.s32 %p41, %r20, 0;
- selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
- bra.uni BB8_69;
+ setp.eq.s32 %p43, %r20, 0;
+ selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
+ bra.uni BB9_65;
-BB8_79:
- setp.gt.s32 %p84, %r6, 5;
- @%p84 bra BB8_83;
+BB9_75:
+ setp.gt.s32 %p88, %r6, 5;
+ @%p88 bra BB9_79;
- setp.eq.s32 %p88, %r6, 4;
- @%p88 bra BB8_119;
- bra.uni BB8_81;
+ setp.eq.s32 %p92, %r6, 4;
+ @%p92 bra BB9_113;
+ bra.uni BB9_77;
-BB8_119:
+BB9_113:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
@@ -1129,11 +1207,11 @@ BB8_119:
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd68;
}
- bfe.u32 %r62, %r5, 20, 11;
- add.s32 %r63, %r62, -1012;
+ bfe.u32 %r61, %r5, 20, 11;
+ add.s32 %r62, %r61, -1012;
mov.b64 %rd19, %fd68;
- shl.b64 %rd3, %rd19, %r63;
- setp.eq.s64 %p113, %rd3, -9223372036854775808;
+ shl.b64 %rd3, %rd19, %r62;
+ setp.eq.s64 %p119, %rd3, -9223372036854775808;
abs.f64 %fd51, %fd1;
// Callseq Start 2
{
@@ -1150,616 +1228,612 @@ BB8_119:
param0,
param1
);
- ld.param.f64 %fd107, [retval0+0];
+ ld.param.f64 %fd105, [retval0+0];
//{
}// Callseq End 2
- setp.lt.s32 %p114, %r4, 0;
- and.pred %p2, %p114, %p113;
- @!%p2 bra BB8_121;
- bra.uni BB8_120;
+ setp.lt.s32 %p120, %r4, 0;
+ and.pred %p2, %p120, %p119;
+ @!%p2 bra BB9_115;
+ bra.uni BB9_114;
-BB8_120:
+BB9_114:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r64}, %fd107;
+ mov.b64 {%temp, %r63}, %fd105;
}
- xor.b32 %r65, %r64, -2147483648;
+ xor.b32 %r64, %r63, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r66, %temp}, %fd107;
+ mov.b64 {%r65, %temp}, %fd105;
}
- mov.b64 %fd107, {%r66, %r65};
-
-BB8_121:
- mov.f64 %fd106, %fd107;
- setp.eq.f64 %p115, %fd1, 0d0000000000000000;
- @%p115 bra BB8_124;
- bra.uni BB8_122;
-
-BB8_124:
- selp.b32 %r67, %r4, 0, %p113;
- or.b32 %r68, %r67, 2146435072;
- setp.lt.s32 %p119, %r5, 0;
- selp.b32 %r69, %r68, %r67, %p119;
- mov.u32 %r70, 0;
- mov.b64 %fd106, {%r70, %r69};
- bra.uni BB8_125;
-
-BB8_96:
- setp.gt.s32 %p71, %r6, 15;
- @%p71 bra BB8_100;
-
- setp.eq.s32 %p75, %r6, 14;
- @%p75 bra BB8_113;
- bra.uni BB8_98;
-
-BB8_113:
+ mov.b64 %fd105, {%r65, %r64};
+
+BB9_115:
+ mov.f64 %fd104, %fd105;
+ setp.eq.f64 %p121, %fd1, 0d0000000000000000;
+ @%p121 bra BB9_118;
+ bra.uni BB9_116;
+
+BB9_118:
+ selp.b32 %r66, %r4, 0, %p119;
+ or.b32 %r67, %r66, 2146435072;
+ setp.lt.s32 %p125, %r5, 0;
+ selp.b32 %r68, %r67, %r66, %p125;
+ mov.u32 %r69, 0;
+ mov.b64 %fd104, {%r69, %r68};
+ bra.uni BB9_119;
+
+BB9_92:
+ setp.gt.s32 %p75, %r6, 15;
+ @%p75 bra BB9_96;
+
+ setp.eq.s32 %p79, %r6, 14;
+ @%p79 bra BB9_107;
+ bra.uni BB9_94;
+
+BB9_107:
cvt.rni.s64.f64 %rd15, %fd1;
cvt.rni.s64.f64 %rd16, %fd68;
- cvt.u32.u64 %r56, %rd15;
- cvt.u32.u64 %r57, %rd16;
- or.b32 %r58, %r57, %r56;
- setp.eq.s32 %p105, %r58, 0;
- selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
- bra.uni BB8_137;
-
-BB8_8:
+ cvt.u32.u64 %r55, %rd15;
+ cvt.u32.u64 %r56, %rd16;
+ or.b32 %r57, %r56, %r55;
+ setp.eq.s32 %p111, %r57, 0;
+ selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
+ bra.uni BB9_129;
+
+BB9_8:
setp.eq.s32 %p27, %r6, 2;
- @%p27 bra BB8_67;
- bra.uni BB8_9;
+ @%p27 bra BB9_63;
+ bra.uni BB9_9;
-BB8_67:
- mul.f64 %fd99, %fd1, %fd68;
- bra.uni BB8_69;
+BB9_63:
+ mul.f64 %fd98, %fd1, %fd68;
+ bra.uni BB9_65;
-BB8_24:
+BB9_24:
setp.eq.s32 %p14, %r6, 11;
- @%p14 bra BB8_47;
+ @%p14 bra BB9_45;
setp.eq.s32 %p15, %r6, 12;
- @%p15 bra BB8_46;
- bra.uni BB8_26;
+ @%p15 bra BB9_44;
+ bra.uni BB9_26;
-BB8_46:
- max.f64 %fd99, %fd68, %fd1;
- bra.uni BB8_69;
+BB9_44:
+ max.f64 %fd98, %fd68, %fd1;
+ bra.uni BB9_65;
-BB8_15:
+BB9_15:
setp.eq.s32 %p21, %r6, 6;
- @%p21 bra BB8_50;
+ @%p21 bra BB9_48;
setp.eq.s32 %p22, %r6, 7;
- @%p22 bra BB8_49;
- bra.uni BB8_17;
+ @%p22 bra BB9_47;
+ bra.uni BB9_17;
-BB8_49:
- setp.lt.f64 %p46, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
- bra.uni BB8_69;
+BB9_47:
+ setp.lt.f64 %p48, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
+ bra.uni BB9_65;
-BB8_32:
+BB9_32:
setp.eq.s32 %p8, %r6, 16;
- @%p8 bra BB8_44;
+ @%p8 bra BB9_42;
setp.eq.s32 %p9, %r6, 17;
- @%p9 bra BB8_39;
- bra.uni BB8_34;
+ @%p9 bra BB9_38;
+ bra.uni BB9_34;
-BB8_39:
- setp.eq.f64 %p34, %fd1, 0d0000000000000000;
- setp.eq.f64 %p35, %fd1, 0d8000000000000000;
- or.pred %p36, %p34, %p35;
- mov.f64 %fd99, 0d7FF8000000000000;
- @%p36 bra BB8_69;
+BB9_38:
+ setp.eq.f64 %p35, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p36, %fd1, 0d8000000000000000;
+ or.pred %p37, %p35, %p36;
+ mov.f64 %fd98, 0d7FF8000000000000;
+ @%p37 bra BB9_65;
- div.rn.f64 %fd99, %fd68, %fd1;
- abs.f64 %fd72, %fd99;
- setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000;
- @%p37 bra BB8_69;
+ div.rn.f64 %fd98, %fd68, %fd1;
+ abs.f64 %fd72, %fd98;
+ setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000;
+ @%p38 bra BB9_65;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r15}, %fd99;
+ mov.b64 {%r15, %temp}, %fd98;
}
- and.b32 %r16, %r15, 2147483647;
- setp.ne.s32 %p38, %r16, 2146435072;
- @%p38 bra BB8_43;
-
{
.reg .b32 %temp;
- mov.b64 {%r17, %temp}, %fd99;
+ mov.b64 {%temp, %r16}, %fd98;
}
- setp.eq.s32 %p39, %r17, 0;
- @%p39 bra BB8_69;
-
-BB8_43:
- cvt.rmi.f64.f64 %fd73, %fd99;
+ and.b32 %r17, %r16, 2147483647;
+ setp.ne.s32 %p39, %r17, 2146435072;
+ setp.ne.s32 %p40, %r15, 0;
+ or.pred %p41, %p39, %p40;
+ @!%p41 bra BB9_65;
+ bra.uni BB9_41;
+
+BB9_41:
+ cvt.rmi.f64.f64 %fd73, %fd98;
mul.f64 %fd74, %fd1, %fd73;
- sub.f64 %fd99, %fd68, %fd74;
- bra.uni BB8_69;
-
-BB8_76:
- setp.eq.s32 %p91, %r6, 2;
- @%p91 bra BB8_135;
- bra.uni BB8_77;
-
-BB8_135:
- mul.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
-
-BB8_92:
- setp.eq.s32 %p78, %r6, 11;
- @%p78 bra BB8_115;
-
- setp.eq.s32 %p79, %r6, 12;
- @%p79 bra BB8_114;
- bra.uni BB8_94;
-
-BB8_114:
- max.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
-
-BB8_83:
- setp.eq.s32 %p85, %r6, 6;
- @%p85 bra BB8_118;
-
- setp.eq.s32 %p86, %r6, 7;
- @%p86 bra BB8_117;
- bra.uni BB8_85;
-
-BB8_117:
- setp.gt.f64 %p110, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
- bra.uni BB8_137;
-
-BB8_100:
- setp.eq.s32 %p72, %r6, 16;
- @%p72 bra BB8_112;
-
- setp.eq.s32 %p73, %r6, 17;
- @%p73 bra BB8_107;
- bra.uni BB8_102;
-
-BB8_107:
- setp.eq.f64 %p98, %fd68, 0d0000000000000000;
- setp.eq.f64 %p99, %fd68, 0d8000000000000000;
- or.pred %p100, %p98, %p99;
- mov.f64 %fd108, 0d7FF8000000000000;
- @%p100 bra BB8_137;
-
- div.rn.f64 %fd108, %fd1, %fd68;
- abs.f64 %fd83, %fd108;
- setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000;
- @%p101 bra BB8_137;
+ sub.f64 %fd98, %fd68, %fd74;
+ bra.uni BB9_65;
+
+BB9_72:
+ setp.eq.s32 %p95, %r6, 2;
+ @%p95 bra BB9_127;
+ bra.uni BB9_73;
+
+BB9_127:
+ mul.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
+
+BB9_88:
+ setp.eq.s32 %p82, %r6, 11;
+ @%p82 bra BB9_109;
+
+ setp.eq.s32 %p83, %r6, 12;
+ @%p83 bra BB9_108;
+ bra.uni BB9_90;
+
+BB9_108:
+ max.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
+
+BB9_79:
+ setp.eq.s32 %p89, %r6, 6;
+ @%p89 bra BB9_112;
+
+ setp.eq.s32 %p90, %r6, 7;
+ @%p90 bra BB9_111;
+ bra.uni BB9_81;
+
+BB9_111:
+ setp.gt.f64 %p116, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
+ bra.uni BB9_129;
+
+BB9_96:
+ setp.eq.s32 %p76, %r6, 16;
+ @%p76 bra BB9_106;
+
+ setp.eq.s32 %p77, %r6, 17;
+ @%p77 bra BB9_102;
+ bra.uni BB9_98;
+
+BB9_102:
+ setp.eq.f64 %p103, %fd68, 0d0000000000000000;
+ setp.eq.f64 %p104, %fd68, 0d8000000000000000;
+ or.pred %p105, %p103, %p104;
+ mov.f64 %fd106, 0d7FF8000000000000;
+ @%p105 bra BB9_129;
+
+ div.rn.f64 %fd106, %fd1, %fd68;
+ abs.f64 %fd83, %fd106;
+ setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000;
+ @%p106 bra BB9_129;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r53}, %fd108;
+ mov.b64 {%r52, %temp}, %fd106;
}
- and.b32 %r54, %r53, 2147483647;
- setp.ne.s32 %p102, %r54, 2146435072;
- @%p102 bra BB8_111;
-
{
.reg .b32 %temp;
- mov.b64 {%r55, %temp}, %fd108;
+ mov.b64 {%temp, %r53}, %fd106;
}
- setp.eq.s32 %p103, %r55, 0;
- @%p103 bra BB8_137;
-
-BB8_111:
- cvt.rmi.f64.f64 %fd84, %fd108;
+ and.b32 %r54, %r53, 2147483647;
+ setp.ne.s32 %p107, %r54, 2146435072;
+ setp.ne.s32 %p108, %r52, 0;
+ or.pred %p109, %p107, %p108;
+ @!%p109 bra BB9_129;
+ bra.uni BB9_105;
+
+BB9_105:
+ cvt.rmi.f64.f64 %fd84, %fd106;
mul.f64 %fd85, %fd84, %fd68;
- sub.f64 %fd108, %fd1, %fd85;
- bra.uni BB8_137;
+ sub.f64 %fd106, %fd1, %fd85;
+ bra.uni BB9_129;
-BB8_6:
+BB9_6:
setp.eq.s32 %p30, %r6, 1;
- @%p30 bra BB8_7;
- bra.uni BB8_69;
+ @%p30 bra BB9_7;
+ bra.uni BB9_65;
-BB8_7:
- sub.f64 %fd99, %fd68, %fd1;
- bra.uni BB8_69;
+BB9_7:
+ sub.f64 %fd98, %fd68, %fd1;
+ bra.uni BB9_65;
-BB8_22:
+BB9_22:
setp.eq.s32 %p18, %r6, 10;
- @%p18 bra BB8_23;
- bra.uni BB8_69;
+ @%p18 bra BB9_23;
+ bra.uni BB9_65;
-BB8_23:
- setp.neu.f64 %p43, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
- bra.uni BB8_69;
+BB9_23:
+ setp.neu.f64 %p45, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
+ bra.uni BB9_65;
-BB8_13:
+BB9_13:
setp.eq.s32 %p25, %r6, 5;
- @%p25 bra BB8_14;
- bra.uni BB8_69;
+ @%p25 bra BB9_14;
+ bra.uni BB9_65;
-BB8_14:
- setp.gt.f64 %p48, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
- bra.uni BB8_69;
+BB9_14:
+ setp.gt.f64 %p50, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
+ bra.uni BB9_65;
-BB8_30:
+BB9_30:
setp.eq.s32 %p12, %r6, 15;
- @%p12 bra BB8_31;
- bra.uni BB8_69;
+ @%p12 bra BB9_31;
+ bra.uni BB9_65;
-BB8_31:
+BB9_31:
mul.f64 %fd76, %fd1, %fd68;
mov.f64 %fd77, 0d3FF0000000000000;
- sub.f64 %fd99, %fd77, %fd76;
- bra.uni BB8_69;
+ sub.f64 %fd98, %fd77, %fd76;
+ bra.uni BB9_65;
-BB8_9:
+BB9_9:
setp.eq.s32 %p28, %r6, 3;
- @%p28 bra BB8_10;
- bra.uni BB8_69;
+ @%p28 bra BB9_10;
+ bra.uni BB9_65;
-BB8_10:
- div.rn.f64 %fd99, %fd68, %fd1;
- bra.uni BB8_69;
+BB9_10:
+ div.rn.f64 %fd98, %fd68, %fd1;
+ bra.uni BB9_65;
-BB8_47:
- min.f64 %fd99, %fd68, %fd1;
- bra.uni BB8_69;
+BB9_45:
+ min.f64 %fd98, %fd68, %fd1;
+ bra.uni BB9_65;
-BB8_26:
+BB9_26:
setp.eq.s32 %p16, %r6, 13;
- @%p16 bra BB8_27;
- bra.uni BB8_69;
+ @%p16 bra BB9_27;
+ bra.uni BB9_65;
-BB8_27:
+BB9_27:
cvt.rni.s64.f64 %rd12, %fd68;
cvt.rni.s64.f64 %rd13, %fd1;
cvt.u32.u64 %r21, %rd12;
cvt.u32.u64 %r22, %rd13;
and.b32 %r23, %r22, %r21;
- setp.eq.s32 %p42, %r23, 0;
- selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
- bra.uni BB8_69;
+ setp.eq.s32 %p44, %r23, 0;
+ selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
+ bra.uni BB9_65;
-BB8_50:
- setp.ge.f64 %p47, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
- bra.uni BB8_69;
+BB9_48:
+ setp.ge.f64 %p49, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
+ bra.uni BB9_65;
-BB8_17:
+BB9_17:
setp.eq.s32 %p23, %r6, 8;
- @%p23 bra BB8_18;
- bra.uni BB8_69;
+ @%p23 bra BB9_18;
+ bra.uni BB9_65;
-BB8_18:
- setp.le.f64 %p45, %fd1, %fd68;
- selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
- bra.uni BB8_69;
+BB9_18:
+ setp.le.f64 %p47, %fd1, %fd68;
+ selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
+ bra.uni BB9_65;
-BB8_44:
- setp.neu.f64 %p40, %fd68, 0d0000000000000000;
+BB9_42:
+ setp.neu.f64 %p42, %fd68, 0d0000000000000000;
sub.f64 %fd75, %fd68, %fd1;
- selp.f64 %fd99, %fd75, 0d0000000000000000, %p40;
- bra.uni BB8_69;
+ selp.f64 %fd98, %fd75, 0d0000000000000000, %p42;
+ bra.uni BB9_65;
-BB8_34:
+BB9_34:
setp.ne.s32 %p10, %r6, 18;
- @%p10 bra BB8_69;
+ @%p10 bra BB9_65;
- div.rn.f64 %fd99, %fd68, %fd1;
- abs.f64 %fd70, %fd99;
+ div.rn.f64 %fd98, %fd68, %fd1;
+ abs.f64 %fd70, %fd98;
setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000;
- @%p31 bra BB8_69;
+ @%p31 bra BB9_65;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r12}, %fd99;
+ mov.b64 {%r12, %temp}, %fd98;
}
- and.b32 %r13, %r12, 2147483647;
- setp.ne.s32 %p32, %r13, 2146435072;
- @%p32 bra BB8_38;
-
{
.reg .b32 %temp;
- mov.b64 {%r14, %temp}, %fd99;
+ mov.b64 {%temp, %r13}, %fd98;
}
- setp.eq.s32 %p33, %r14, 0;
- @%p33 bra BB8_69;
-
-BB8_38:
- cvt.rmi.f64.f64 %fd99, %fd99;
- bra.uni BB8_69;
-
-BB8_74:
- setp.eq.s32 %p94, %r6, 1;
- @%p94 bra BB8_75;
- bra.uni BB8_137;
-
-BB8_75:
- sub.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
-
-BB8_90:
- setp.eq.s32 %p82, %r6, 10;
- @%p82 bra BB8_91;
- bra.uni BB8_137;
-
-BB8_91:
- setp.neu.f64 %p107, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
- bra.uni BB8_137;
-
-BB8_81:
- setp.eq.s32 %p89, %r6, 5;
- @%p89 bra BB8_82;
- bra.uni BB8_137;
-
-BB8_82:
- setp.lt.f64 %p112, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
- bra.uni BB8_137;
-
-BB8_98:
- setp.eq.s32 %p76, %r6, 15;
- @%p76 bra BB8_99;
- bra.uni BB8_137;
-
-BB8_99:
+ and.b32 %r14, %r13, 2147483647;
+ setp.ne.s32 %p32, %r14, 2146435072;
+ setp.ne.s32 %p33, %r12, 0;
+ or.pred %p34, %p32, %p33;
+ @!%p34 bra BB9_65;
+ bra.uni BB9_37;
+
+BB9_37:
+ cvt.rmi.f64.f64 %fd98, %fd98;
+ bra.uni BB9_65;
+
+BB9_70:
+ setp.eq.s32 %p98, %r6, 1;
+ @%p98 bra BB9_71;
+ bra.uni BB9_129;
+
+BB9_71:
+ sub.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
+
+BB9_86:
+ setp.eq.s32 %p86, %r6, 10;
+ @%p86 bra BB9_87;
+ bra.uni BB9_129;
+
+BB9_87:
+ setp.neu.f64 %p113, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
+ bra.uni BB9_129;
+
+BB9_77:
+ setp.eq.s32 %p93, %r6, 5;
+ @%p93 bra BB9_78;
+ bra.uni BB9_129;
+
+BB9_78:
+ setp.lt.f64 %p118, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
+ bra.uni BB9_129;
+
+BB9_94:
+ setp.eq.s32 %p80, %r6, 15;
+ @%p80 bra BB9_95;
+ bra.uni BB9_129;
+
+BB9_95:
mul.f64 %fd87, %fd1, %fd68;
mov.f64 %fd88, 0d3FF0000000000000;
- sub.f64 %fd108, %fd88, %fd87;
- bra.uni BB8_137;
+ sub.f64 %fd106, %fd88, %fd87;
+ bra.uni BB9_129;
-BB8_77:
- setp.eq.s32 %p92, %r6, 3;
- @%p92 bra BB8_78;
- bra.uni BB8_137;
+BB9_73:
+ setp.eq.s32 %p96, %r6, 3;
+ @%p96 bra BB9_74;
+ bra.uni BB9_129;
-BB8_78:
- div.rn.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
+BB9_74:
+ div.rn.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
-BB8_115:
- min.f64 %fd108, %fd1, %fd68;
- bra.uni BB8_137;
+BB9_109:
+ min.f64 %fd106, %fd1, %fd68;
+ bra.uni BB9_129;
-BB8_94:
- setp.eq.s32 %p80, %r6, 13;
- @%p80 bra BB8_95;
- bra.uni BB8_137;
+BB9_90:
+ setp.eq.s32 %p84, %r6, 13;
+ @%p84 bra BB9_91;
+ bra.uni BB9_129;
-BB8_95:
+BB9_91:
cvt.rni.s64.f64 %rd17, %fd1;
cvt.rni.s64.f64 %rd18, %fd68;
- cvt.u32.u64 %r59, %rd17;
- cvt.u32.u64 %r60, %rd18;
- and.b32 %r61, %r60, %r59;
- setp.eq.s32 %p106, %r61, 0;
- selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
- bra.uni BB8_137;
-
-BB8_118:
- setp.le.f64 %p111, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
- bra.uni BB8_137;
-
-BB8_85:
- setp.eq.s32 %p87, %r6, 8;
- @%p87 bra BB8_86;
- bra.uni BB8_137;
-
-BB8_86:
- setp.ge.f64 %p109, %fd1, %fd68;
- selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
- bra.uni BB8_137;
-
-BB8_112:
- setp.neu.f64 %p104, %fd1, 0d0000000000000000;
+ cvt.u32.u64 %r58, %rd17;
+ cvt.u32.u64 %r59, %rd18;
+ and.b32 %r60, %r59, %r58;
+ setp.eq.s32 %p112, %r60, 0;
+ selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
+ bra.uni BB9_129;
+
+BB9_112:
+ setp.le.f64 %p117, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
+ bra.uni BB9_129;
+
+BB9_81:
+ setp.eq.s32 %p91, %r6, 8;
+ @%p91 bra BB9_82;
+ bra.uni BB9_129;
+
+BB9_82:
+ setp.ge.f64 %p115, %fd1, %fd68;
+ selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
+ bra.uni BB9_129;
+
+BB9_106:
+ setp.neu.f64 %p110, %fd1, 0d0000000000000000;
sub.f64 %fd86, %fd1, %fd68;
- selp.f64 %fd108, %fd86, 0d0000000000000000, %p104;
- bra.uni BB8_137;
+ selp.f64 %fd106, %fd86, 0d0000000000000000, %p110;
+ bra.uni BB9_129;
-BB8_102:
- setp.ne.s32 %p74, %r6, 18;
- @%p74 bra BB8_137;
+BB9_98:
+ setp.ne.s32 %p78, %r6, 18;
+ @%p78 bra BB9_129;
- div.rn.f64 %fd108, %fd1, %fd68;
- abs.f64 %fd81, %fd108;
- setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000;
- @%p95 bra BB8_137;
+ div.rn.f64 %fd106, %fd1, %fd68;
+ abs.f64 %fd81, %fd106;
+ setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000;
+ @%p99 bra BB9_129;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r50}, %fd108;
+ mov.b64 {%r49, %temp}, %fd106;
}
- and.b32 %r51, %r50, 2147483647;
- setp.ne.s32 %p96, %r51, 2146435072;
- @%p96 bra BB8_106;
-
{
.reg .b32 %temp;
- mov.b64 {%r52, %temp}, %fd108;
+ mov.b64 {%temp, %r50}, %fd106;
}
- setp.eq.s32 %p97, %r52, 0;
- @%p97 bra BB8_137;
+ and.b32 %r51, %r50, 2147483647;
+ setp.ne.s32 %p100, %r51, 2146435072;
+ setp.ne.s32 %p101, %r49, 0;
+ or.pred %p102, %p100, %p101;
+ @!%p102 bra BB9_129;
+ bra.uni BB9_101;
-BB8_106:
- cvt.rmi.f64.f64 %fd108, %fd108;
- bra.uni BB8_137;
+BB9_101:
+ cvt.rmi.f64.f64 %fd106, %fd106;
+ bra.uni BB9_129;
-BB8_54:
- setp.gt.s32 %p52, %r2, -1;
- @%p52 bra BB8_57;
+BB9_52:
+ setp.gt.s32 %p54, %r2, -1;
+ @%p54 bra BB9_55;
cvt.rzi.f64.f64 %fd78, %fd1;
- setp.neu.f64 %p53, %fd78, %fd1;
- selp.f64 %fd97, 0dFFF8000000000000, %fd97, %p53;
+ setp.neu.f64 %p55, %fd78, %fd1;
+ selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55;
-BB8_57:
- mov.f64 %fd24, %fd97;
+BB9_55:
+ mov.f64 %fd24, %fd96;
add.f64 %fd25, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd25;
}
and.b32 %r34, %r33, 2146435072;
- setp.ne.s32 %p56, %r34, 2146435072;
- mov.f64 %fd96, %fd24;
- @%p56 bra BB8_66;
-
- setp.gtu.f64 %p57, %fd18, 0d7FF0000000000000;
- mov.f64 %fd96, %fd25;
- @%p57 bra BB8_66;
+ setp.ne.s32 %p58, %r34, 2146435072;
+ mov.f64 %fd95, %fd24;
+ @%p58 bra BB9_62;
- abs.f64 %fd79, %fd1;
- setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000;
+ setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000;
mov.f64 %fd95, %fd25;
- mov.f64 %fd96, %fd95;
- @%p58 bra BB8_66;
-
- and.b32 %r35, %r3, 2147483647;
- setp.ne.s32 %p59, %r35, 2146435072;
- @%p59 bra BB8_62;
+ @%p59 bra BB9_62;
- {
- .reg .b32 %temp;
- mov.b64 {%r36, %temp}, %fd1;
- }
- setp.eq.s32 %p60, %r36, 0;
- @%p60 bra BB8_65;
-
-BB8_62:
- and.b32 %r37, %r2, 2147483647;
- setp.ne.s32 %p61, %r37, 2146435072;
- mov.f64 %fd93, %fd24;
- mov.f64 %fd96, %fd93;
- @%p61 bra BB8_66;
+ abs.f64 %fd79, %fd1;
+ setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000;
+ mov.f64 %fd94, %fd25;
+ mov.f64 %fd95, %fd94;
+ @%p60 bra BB9_62;
{
.reg .b32 %temp;
- mov.b64 {%r38, %temp}, %fd68;
+ mov.b64 {%r35, %temp}, %fd1;
}
- setp.ne.s32 %p62, %r38, 0;
- mov.f64 %fd96, %fd24;
- @%p62 bra BB8_66;
-
- shr.s32 %r39, %r3, 31;
- and.b32 %r40, %r39, -2146435072;
- add.s32 %r41, %r40, 2146435072;
- or.b32 %r42, %r41, -2147483648;
- selp.b32 %r43, %r42, %r41, %p1;
- mov.u32 %r44, 0;
- mov.b64 %fd96, {%r44, %r43};
- bra.uni BB8_66;
-
-BB8_122:
- setp.gt.s32 %p116, %r4, -1;
- @%p116 bra BB8_125;
+ and.b32 %r36, %r3, 2147483647;
+ setp.eq.s32 %p61, %r36, 2146435072;
+ setp.eq.s32 %p62, %r35, 0;
+ and.pred %p63, %p61, %p62;
+ @%p63 bra BB9_61;
+ bra.uni BB9_59;
+
+BB9_61:
+ setp.gt.f64 %p67, %fd18, 0d3FF0000000000000;
+ selp.b32 %r44, 2146435072, 0, %p67;
+ xor.b32 %r45, %r44, 2146435072;
+ setp.lt.s32 %p68, %r3, 0;
+ selp.b32 %r46, %r45, %r44, %p68;
+ setp.eq.f64 %p69, %fd68, 0dBFF0000000000000;
+ selp.b32 %r47, 1072693248, %r46, %p69;
+ mov.u32 %r48, 0;
+ mov.b64 %fd95, {%r48, %r47};
+ bra.uni BB9_62;
+
+BB9_116:
+ setp.gt.s32 %p122, %r4, -1;
+ @%p122 bra BB9_119;
cvt.rzi.f64.f64 %fd89, %fd68;
- setp.neu.f64 %p117, %fd89, %fd68;
- selp.f64 %fd106, 0dFFF8000000000000, %fd106, %p117;
+ setp.neu.f64 %p123, %fd89, %fd68;
+ selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123;
-BB8_125:
- mov.f64 %fd57, %fd106;
+BB9_119:
+ mov.f64 %fd57, %fd104;
add.f64 %fd58, %fd1, %fd68;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r71}, %fd58;
+ mov.b64 {%temp, %r70}, %fd58;
}
- and.b32 %r72, %r71, 2146435072;
- setp.ne.s32 %p120, %r72, 2146435072;
- mov.f64 %fd105, %fd57;
- @%p120 bra BB8_134;
+ and.b32 %r71, %r70, 2146435072;
+ setp.ne.s32 %p126, %r71, 2146435072;
+ mov.f64 %fd103, %fd57;
+ @%p126 bra BB9_126;
- setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000;
- mov.f64 %fd105, %fd58;
- @%p121 bra BB8_134;
+ setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000;
+ mov.f64 %fd103, %fd58;
+ @%p127 bra BB9_126;
abs.f64 %fd90, %fd68;
- setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000;
- mov.f64 %fd104, %fd58;
- mov.f64 %fd105, %fd104;
- @%p122 bra BB8_134;
+ setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000;
+ mov.f64 %fd102, %fd58;
+ mov.f64 %fd103, %fd102;
+ @%p128 bra BB9_126;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r72, %temp}, %fd68;
+ }
and.b32 %r73, %r5, 2147483647;
- setp.ne.s32 %p123, %r73, 2146435072;
- @%p123 bra BB8_130;
-
+ setp.eq.s32 %p129, %r73, 2146435072;
+ setp.eq.s32 %p130, %r72, 0;
+ and.pred %p131, %p129, %p130;
+ @%p131 bra BB9_125;
+ bra.uni BB9_123;
+
+BB9_125:
+ setp.gt.f64 %p135, %fd51, 0d3FF0000000000000;
+ selp.b32 %r81, 2146435072, 0, %p135;
+ xor.b32 %r82, %r81, 2146435072;
+ setp.lt.s32 %p136, %r5, 0;
+ selp.b32 %r83, %r82, %r81, %p136;
+ setp.eq.f64 %p137, %fd1, 0dBFF0000000000000;
+ selp.b32 %r84, 1072693248, %r83, %p137;
+ mov.u32 %r85, 0;
+ mov.b64 %fd103, {%r85, %r84};
+ bra.uni BB9_126;
+
+BB9_59:
{
.reg .b32 %temp;
- mov.b64 {%r74, %temp}, %fd68;
+ mov.b64 {%r37, %temp}, %fd68;
}
- setp.eq.s32 %p124, %r74, 0;
- @%p124 bra BB8_133;
+ and.b32 %r38, %r2, 2147483647;
+ setp.eq.s32 %p64, %r38, 2146435072;
+ setp.eq.s32 %p65, %r37, 0;
+ and.pred %p66, %p64, %p65;
+ mov.f64 %fd95, %fd24;
+ @!%p66 bra BB9_62;
+ bra.uni BB9_60;
+
+BB9_60:
+ shr.s32 %r39, %r3, 31;
+ and.b32 %r40, %r39, -2146435072;
+ selp.b32 %r41, -1048576, 2146435072, %p1;
+ add.s32 %r42, %r41, %r40;
+ mov.u32 %r43, 0;
+ mov.b64 %fd95, {%r43, %r42};
+
+BB9_62:
+ setp.eq.f64 %p70, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p71, %fd68, 0d3FF0000000000000;
+ or.pred %p72, %p71, %p70;
+ selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72;
-BB8_130:
- and.b32 %r75, %r4, 2147483647;
- setp.ne.s32 %p125, %r75, 2146435072;
- mov.f64 %fd102, %fd57;
- mov.f64 %fd105, %fd102;
- @%p125 bra BB8_134;
+BB9_65:
+ st.global.f64 [%rd1], %fd98;
+ bra.uni BB9_130;
+BB9_123:
{
.reg .b32 %temp;
- mov.b64 {%r76, %temp}, %fd1;
+ mov.b64 {%r74, %temp}, %fd1;
}
- setp.ne.s32 %p126, %r76, 0;
- mov.f64 %fd105, %fd57;
- @%p126 bra BB8_134;
-
- shr.s32 %r77, %r5, 31;
- and.b32 %r78, %r77, -2146435072;
- add.s32 %r79, %r78, 2146435072;
- or.b32 %r80, %r79, -2147483648;
- selp.b32 %r81, %r80, %r79, %p2;
- mov.u32 %r82, 0;
- mov.b64 %fd105, {%r82, %r81};
- bra.uni BB8_134;
-
-BB8_65:
- setp.gt.f64 %p63, %fd18, 0d3FF0000000000000;
- selp.b32 %r45, 2146435072, 0, %p63;
- xor.b32 %r46, %r45, 2146435072;
- setp.lt.s32 %p64, %r3, 0;
- selp.b32 %r47, %r46, %r45, %p64;
- setp.eq.f64 %p65, %fd68, 0dBFF0000000000000;
- selp.b32 %r48, 1072693248, %r47, %p65;
- mov.u32 %r49, 0;
- mov.b64 %fd96, {%r49, %r48};
-
-BB8_66:
- setp.eq.f64 %p66, %fd1, 0d0000000000000000;
- setp.eq.f64 %p67, %fd68, 0d3FF0000000000000;
- or.pred %p68, %p67, %p66;
- selp.f64 %fd99, 0d3FF0000000000000, %fd96, %p68;
-
-BB8_69:
- st.global.f64 [%rd1], %fd99;
- bra.uni BB8_138;
-
-BB8_133:
- setp.gt.f64 %p127, %fd51, 0d3FF0000000000000;
- selp.b32 %r83, 2146435072, 0, %p127;
- xor.b32 %r84, %r83, 2146435072;
- setp.lt.s32 %p128, %r5, 0;
- selp.b32 %r85, %r84, %r83, %p128;
- setp.eq.f64 %p129, %fd1, 0dBFF0000000000000;
- selp.b32 %r86, 1072693248, %r85, %p129;
- mov.u32 %r87, 0;
- mov.b64 %fd105, {%r87, %r86};
-
-BB8_134:
- setp.eq.f64 %p130, %fd68, 0d0000000000000000;
- setp.eq.f64 %p131, %fd1, 0d3FF0000000000000;
- or.pred %p132, %p131, %p130;
- selp.f64 %fd108, 0d3FF0000000000000, %fd105, %p132;
-
-BB8_137:
- st.global.f64 [%rd1], %fd108;
-
-BB8_138:
+ and.b32 %r75, %r4, 2147483647;
+ setp.eq.s32 %p132, %r75, 2146435072;
+ setp.eq.s32 %p133, %r74, 0;
+ and.pred %p134, %p132, %p133;
+ mov.f64 %fd103, %fd57;
+ @!%p134 bra BB9_126;
+ bra.uni BB9_124;
+
+BB9_124:
+ shr.s32 %r76, %r5, 31;
+ and.b32 %r77, %r76, -2146435072;
+ selp.b32 %r78, -1048576, 2146435072, %p2;
+ add.s32 %r79, %r78, %r77;
+ mov.u32 %r80, 0;
+ mov.b64 %fd103, {%r80, %r79};
+
+BB9_126:
+ setp.eq.f64 %p138, %fd68, 0d0000000000000000;
+ setp.eq.f64 %p139, %fd1, 0d3FF0000000000000;
+ or.pred %p140, %p139, %p138;
+ selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140;
+
+BB9_129:
+ st.global.f64 [%rd1], %fd106;
+
+BB9_130:
bar.sync 0;
ret;
}
@@ -1785,14 +1859,14 @@ BB8_138:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB9_2;
+ @%p1 bra BB10_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;
-BB9_2:
+BB10_2:
ret;
}
@@ -1832,10 +1906,10 @@ BB9_2:
setp.lt.s32 %p1, %r1, %r7;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB10_2;
- bra.uni BB10_1;
+ @!%p3 bra BB11_2;
+ bra.uni BB11_1;
-BB10_1:
+BB11_1:
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r13, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r13, 8;
@@ -1846,14 +1920,14 @@ BB10_1:
add.s64 %rd9, %rd1, %rd8;
st.global.f64 [%rd9], %fd1;
-BB10_2:
+BB11_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
- @!%p6 bra BB10_4;
- bra.uni BB10_3;
+ @!%p6 bra BB11_4;
+ bra.uni BB11_3;
-BB10_3:
+BB11_3:
cvta.to.global.u64 %rd10, %rd3;
mad.lo.s32 %r15, %r1, %r6, %r2;
mul.wide.s32 %rd11, %r15, 8;
@@ -1865,7 +1939,7 @@ BB10_3:
add.s64 %rd14, %rd1, %rd13;
st.global.f64 [%rd14], %fd2;
-BB10_4:
+BB11_4:
ret;
}
@@ -1904,10 +1978,10 @@ BB10_4:
setp.lt.s32 %p1, %r1, %r3;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB11_2;
- bra.uni BB11_1;
+ @!%p3 bra BB12_2;
+ bra.uni BB12_1;
-BB11_1:
+BB12_1:
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r12, 8;
@@ -1916,14 +1990,14 @@ BB11_1:
add.s64 %rd8, %rd1, %rd6;
st.global.f64 [%rd8], %fd1;
-BB11_2:
+BB12_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
- @!%p6 bra BB11_4;
- bra.uni BB11_3;
+ @!%p6 bra BB12_4;
+ bra.uni BB12_3;
-BB11_3:
+BB12_3:
cvta.to.global.u64 %rd9, %rd3;
mad.lo.s32 %r13, %r1, %r6, %r2;
mul.wide.s32 %rd10, %r13, 8;
@@ -1935,7 +2009,7 @@ BB11_3:
add.s64 %rd13, %rd1, %rd12;
st.global.f64 [%rd13], %fd2;
-BB11_4:
+BB12_4:
ret;
}
@@ -1963,9 +2037,9 @@ BB11_4:
mov.f64 %fd76, 0d0000000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB12_4;
+ @%p1 bra BB13_4;
-BB12_1:
+BB13_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -1974,23 +2048,23 @@ BB12_1:
add.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB12_3;
+ @%p2 bra BB13_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
add.f64 %fd78, %fd78, %fd31;
-BB12_3:
+BB13_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB12_1;
+ @%p3 bra BB13_1;
-BB12_4:
+BB13_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -1998,130 +2072,130 @@ BB12_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB12_8;
+ @%p4 bra BB13_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB12_7;
+ @%p5 bra BB13_7;
ld.shared.f64 %fd32, [%rd1+4096];
add.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB12_7:
+BB13_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB12_8:
+BB13_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB12_12;
+ @%p6 bra BB13_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB12_11;
+ @%p7 bra BB13_11;
ld.shared.f64 %fd33, [%rd1+2048];
add.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB12_11:
+BB13_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB12_12:
+BB13_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB12_16;
+ @%p8 bra BB13_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB12_15;
+ @%p9 bra BB13_15;
ld.shared.f64 %fd34, [%rd1+1024];
add.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB12_15:
+BB13_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB12_16:
+BB13_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB12_20;
+ @%p10 bra BB13_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB12_19;
+ @%p11 bra BB13_19;
ld.shared.f64 %fd35, [%rd1+512];
add.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB12_19:
+BB13_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB12_20:
+BB13_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB12_33;
+ @%p12 bra BB13_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB12_23;
+ @%p13 bra BB13_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
add.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB12_23:
+BB13_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB12_25;
+ @%p14 bra BB13_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
add.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB12_25:
+BB13_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB12_27;
+ @%p15 bra BB13_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
add.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB12_27:
+BB13_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB12_29;
+ @%p16 bra BB13_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
add.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB12_29:
+BB13_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB12_31;
+ @%p17 bra BB13_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
add.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB12_31:
+BB13_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB12_33;
+ @%p18 bra BB13_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
add.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_33:
+BB13_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB12_35;
+ @%p19 bra BB13_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -2129,7 +2203,7 @@ BB12_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB12_35:
+BB13_35:
ret;
}
@@ -2153,17 +2227,17 @@ BB12_35:
ld.param.u32 %r4, [reduce_row_sum_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB13_35;
+ @%p1 bra BB14_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d0000000000000000;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB13_4;
+ @%p2 bra BB14_4;
cvta.to.global.u64 %rd3, %rd1;
-BB13_3:
+BB14_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2173,9 +2247,9 @@ BB13_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB13_3;
+ @%p3 bra BB14_3;
-BB13_4:
+BB14_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -2185,130 +2259,130 @@ BB13_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB13_8;
+ @%p4 bra BB14_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB13_7;
+ @%p5 bra BB14_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB13_7:
+BB14_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB13_8:
+BB14_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB13_12;
+ @%p6 bra BB14_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB13_11;
+ @%p7 bra BB14_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB13_11:
+BB14_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB13_12:
+BB14_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB13_16;
+ @%p8 bra BB14_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB13_15;
+ @%p9 bra BB14_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB13_15:
+BB14_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB13_16:
+BB14_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB13_20;
+ @%p10 bra BB14_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB13_19;
+ @%p11 bra BB14_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB13_19:
+BB14_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB13_20:
+BB14_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB13_33;
+ @%p12 bra BB14_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB13_23;
+ @%p13 bra BB14_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB13_23:
+BB14_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB13_25;
+ @%p14 bra BB14_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB13_25:
+BB14_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB13_27;
+ @%p15 bra BB14_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB13_27:
+BB14_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB13_29;
+ @%p16 bra BB14_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB13_29:
+BB14_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB13_31;
+ @%p17 bra BB14_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB13_31:
+BB14_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB13_33;
+ @%p18 bra BB14_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB13_33:
+BB14_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB13_35;
+ @%p19 bra BB14_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -2316,7 +2390,7 @@ BB13_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB13_35:
+BB14_35:
ret;
}
@@ -2343,18 +2417,18 @@ BB13_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB14_5;
+ @%p1 bra BB15_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d0000000000000000;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB14_4;
+ @%p2 bra BB15_4;
mov.u32 %r10, %r1;
-BB14_3:
+BB15_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -2364,15 +2438,15 @@ BB14_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB14_3;
+ @%p3 bra BB15_3;
-BB14_4:
+BB15_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB14_5:
+BB15_5:
ret;
}
@@ -2400,9 +2474,9 @@ BB14_5:
mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB15_4;
+ @%p1 bra BB16_4;
-BB15_1:
+BB16_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -2411,23 +2485,23 @@ BB15_1:
max.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB15_3;
+ @%p2 bra BB16_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
max.f64 %fd78, %fd78, %fd31;
-BB15_3:
+BB16_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB15_1;
+ @%p3 bra BB16_1;
-BB15_4:
+BB16_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -2435,130 +2509,130 @@ BB15_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB15_8;
+ @%p4 bra BB16_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB15_7;
+ @%p5 bra BB16_7;
ld.shared.f64 %fd32, [%rd1+4096];
max.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB15_7:
+BB16_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB15_8:
+BB16_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB15_12;
+ @%p6 bra BB16_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB15_11;
+ @%p7 bra BB16_11;
ld.shared.f64 %fd33, [%rd1+2048];
max.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB15_11:
+BB16_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB15_12:
+BB16_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB15_16;
+ @%p8 bra BB16_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB15_15;
+ @%p9 bra BB16_15;
ld.shared.f64 %fd34, [%rd1+1024];
max.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB15_15:
+BB16_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB15_16:
+BB16_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB15_20;
+ @%p10 bra BB16_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB15_19;
+ @%p11 bra BB16_19;
ld.shared.f64 %fd35, [%rd1+512];
max.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB15_19:
+BB16_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB15_20:
+BB16_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB15_33;
+ @%p12 bra BB16_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB15_23;
+ @%p13 bra BB16_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
max.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB15_23:
+BB16_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB15_25;
+ @%p14 bra BB16_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
max.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB15_25:
+BB16_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB15_27;
+ @%p15 bra BB16_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
max.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB15_27:
+BB16_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB15_29;
+ @%p16 bra BB16_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
max.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB15_29:
+BB16_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB15_31;
+ @%p17 bra BB16_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
max.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB15_31:
+BB16_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB15_33;
+ @%p18 bra BB16_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
max.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB15_33:
+BB16_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB15_35;
+ @%p19 bra BB16_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -2566,7 +2640,7 @@ BB15_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB15_35:
+BB16_35:
ret;
}
@@ -2590,17 +2664,17 @@ BB15_35:
ld.param.u32 %r4, [reduce_row_max_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB16_35;
+ @%p1 bra BB17_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB16_4;
+ @%p2 bra BB17_4;
cvta.to.global.u64 %rd3, %rd1;
-BB16_3:
+BB17_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2610,9 +2684,9 @@ BB16_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB16_3;
+ @%p3 bra BB17_3;
-BB16_4:
+BB17_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -2622,130 +2696,130 @@ BB16_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB16_8;
+ @%p4 bra BB17_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB16_7;
+ @%p5 bra BB17_7;
ld.shared.f64 %fd29, [%rd8+4096];
max.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB16_7:
+BB17_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB16_8:
+BB17_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB16_12;
+ @%p6 bra BB17_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB16_11;
+ @%p7 bra BB17_11;
ld.shared.f64 %fd30, [%rd8+2048];
max.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB16_11:
+BB17_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB16_12:
+BB17_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB16_16;
+ @%p8 bra BB17_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB16_15;
+ @%p9 bra BB17_15;
ld.shared.f64 %fd31, [%rd8+1024];
max.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB16_15:
+BB17_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB16_16:
+BB17_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB16_20;
+ @%p10 bra BB17_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB16_19;
+ @%p11 bra BB17_19;
ld.shared.f64 %fd32, [%rd8+512];
max.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB16_19:
+BB17_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB16_20:
+BB17_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB16_33;
+ @%p12 bra BB17_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB16_23;
+ @%p13 bra BB17_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
max.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB16_23:
+BB17_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB16_25;
+ @%p14 bra BB17_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
max.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB16_25:
+BB17_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB16_27;
+ @%p15 bra BB17_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
max.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB16_27:
+BB17_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB16_29;
+ @%p16 bra BB17_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
max.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB16_29:
+BB17_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB16_31;
+ @%p17 bra BB17_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
max.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB16_31:
+BB17_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB16_33;
+ @%p18 bra BB17_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
max.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB16_33:
+BB17_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB16_35;
+ @%p19 bra BB17_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -2753,7 +2827,7 @@ BB16_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB16_35:
+BB17_35:
ret;
}
@@ -2780,18 +2854,18 @@ BB16_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB17_5;
+ @%p1 bra BB18_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB17_4;
+ @%p2 bra BB18_4;
mov.u32 %r10, %r1;
-BB17_3:
+BB18_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -2801,15 +2875,15 @@ BB17_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB17_3;
+ @%p3 bra BB18_3;
-BB17_4:
+BB18_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB17_5:
+BB18_5:
ret;
}
@@ -2837,9 +2911,9 @@ BB17_5:
mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB18_4;
+ @%p1 bra BB19_4;
-BB18_1:
+BB19_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -2848,23 +2922,23 @@ BB18_1:
min.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB18_3;
+ @%p2 bra BB19_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
min.f64 %fd78, %fd78, %fd31;
-BB18_3:
+BB19_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB18_1;
+ @%p3 bra BB19_1;
-BB18_4:
+BB19_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -2872,130 +2946,130 @@ BB18_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB18_8;
+ @%p4 bra BB19_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB18_7;
+ @%p5 bra BB19_7;
ld.shared.f64 %fd32, [%rd1+4096];
min.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB18_7:
+BB19_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB18_8:
+BB19_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB18_12;
+ @%p6 bra BB19_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB18_11;
+ @%p7 bra BB19_11;
ld.shared.f64 %fd33, [%rd1+2048];
min.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB18_11:
+BB19_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB18_12:
+BB19_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB18_16;
+ @%p8 bra BB19_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB18_15;
+ @%p9 bra BB19_15;
ld.shared.f64 %fd34, [%rd1+1024];
min.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB18_15:
+BB19_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB18_16:
+BB19_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB18_20;
+ @%p10 bra BB19_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB18_19;
+ @%p11 bra BB19_19;
ld.shared.f64 %fd35, [%rd1+512];
min.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB18_19:
+BB19_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB18_20:
+BB19_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB18_33;
+ @%p12 bra BB19_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB18_23;
+ @%p13 bra BB19_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
min.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB18_23:
+BB19_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB18_25;
+ @%p14 bra BB19_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
min.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB18_25:
+BB19_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB18_27;
+ @%p15 bra BB19_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
min.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB18_27:
+BB19_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB18_29;
+ @%p16 bra BB19_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
min.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB18_29:
+BB19_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB18_31;
+ @%p17 bra BB19_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
min.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB18_31:
+BB19_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB18_33;
+ @%p18 bra BB19_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
min.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB18_33:
+BB19_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB18_35;
+ @%p19 bra BB19_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -3003,7 +3077,7 @@ BB18_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB18_35:
+BB19_35:
ret;
}
@@ -3027,17 +3101,17 @@ BB18_35:
ld.param.u32 %r4, [reduce_row_min_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB19_35;
+ @%p1 bra BB20_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB19_4;
+ @%p2 bra BB20_4;
cvta.to.global.u64 %rd3, %rd1;
-BB19_3:
+BB20_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -3047,9 +3121,9 @@ BB19_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB19_3;
+ @%p3 bra BB20_3;
-BB19_4:
+BB20_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -3059,130 +3133,130 @@ BB19_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB19_8;
+ @%p4 bra BB20_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB19_7;
+ @%p5 bra BB20_7;
ld.shared.f64 %fd29, [%rd8+4096];
min.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB19_7:
+BB20_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB19_8:
+BB20_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB19_12;
+ @%p6 bra BB20_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB19_11;
+ @%p7 bra BB20_11;
ld.shared.f64 %fd30, [%rd8+2048];
min.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB19_11:
+BB20_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB19_12:
+BB20_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB19_16;
+ @%p8 bra BB20_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB19_15;
+ @%p9 bra BB20_15;
ld.shared.f64 %fd31, [%rd8+1024];
min.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB19_15:
+BB20_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB19_16:
+BB20_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB19_20;
+ @%p10 bra BB20_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB19_19;
+ @%p11 bra BB20_19;
ld.shared.f64 %fd32, [%rd8+512];
min.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB19_19:
+BB20_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB19_20:
+BB20_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB19_33;
+ @%p12 bra BB20_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB19_23;
+ @%p13 bra BB20_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
min.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB19_23:
+BB20_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB19_25;
+ @%p14 bra BB20_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
min.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB19_25:
+BB20_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB19_27;
+ @%p15 bra BB20_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
min.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB19_27:
+BB20_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB19_29;
+ @%p16 bra BB20_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
min.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB19_29:
+BB20_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB19_31;
+ @%p17 bra BB20_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
min.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB19_31:
+BB20_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB19_33;
+ @%p18 bra BB20_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
min.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB19_33:
+BB20_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB19_35;
+ @%p19 bra BB20_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -3190,7 +3264,7 @@ BB19_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB19_35:
+BB20_35:
ret;
}
@@ -3217,18 +3291,18 @@ BB19_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB20_5;
+ @%p1 bra BB21_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB20_4;
+ @%p2 bra BB21_4;
mov.u32 %r10, %r1;
-BB20_3:
+BB21_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -3238,15 +3312,15 @@ BB20_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB20_3;
+ @%p3 bra BB21_3;
-BB20_4:
+BB21_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB20_5:
+BB21_5:
ret;
}
@@ -3274,9 +3348,9 @@ BB20_5:
mov.f64 %fd76, 0d3FF0000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB21_4;
+ @%p1 bra BB22_4;
-BB21_1:
+BB22_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -3285,23 +3359,23 @@ BB21_1:
mul.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB21_3;
+ @%p2 bra BB22_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
mul.f64 %fd78, %fd78, %fd31;
-BB21_3:
+BB22_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB21_1;
+ @%p3 bra BB22_1;
-BB21_4:
+BB22_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -3309,130 +3383,130 @@ BB21_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB21_8;
+ @%p4 bra BB22_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB21_7;
+ @%p5 bra BB22_7;
ld.shared.f64 %fd32, [%rd1+4096];
mul.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB21_7:
+BB22_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB21_8:
+BB22_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB21_12;
+ @%p6 bra BB22_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB21_11;
+ @%p7 bra BB22_11;
ld.shared.f64 %fd33, [%rd1+2048];
mul.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB21_11:
+BB22_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB21_12:
+BB22_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB21_16;
+ @%p8 bra BB22_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB21_15;
+ @%p9 bra BB22_15;
ld.shared.f64 %fd34, [%rd1+1024];
mul.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB21_15:
+BB22_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB21_16:
+BB22_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB21_20;
+ @%p10 bra BB22_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB21_19;
+ @%p11 bra BB22_19;
ld.shared.f64 %fd35, [%rd1+512];
mul.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB21_19:
+BB22_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB21_20:
+BB22_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB21_33;
+ @%p12 bra BB22_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB21_23;
+ @%p13 bra BB22_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
mul.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB21_23:
+BB22_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB21_25;
+ @%p14 bra BB22_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
mul.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB21_25:
+BB22_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB21_27;
+ @%p15 bra BB22_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
mul.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB21_27:
+BB22_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB21_29;
+ @%p16 bra BB22_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
mul.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB21_29:
+BB22_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB21_31;
+ @%p17 bra BB22_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
mul.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB21_31:
+BB22_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB21_33;
+ @%p18 bra BB22_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
mul.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB21_33:
+BB22_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB21_35;
+ @%p19 bra BB22_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -3440,7 +3514,7 @@ BB21_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB21_35:
+BB22_35:
ret;
}
@@ -3464,17 +3538,17 @@ BB21_35:
ld.param.u32 %r4, [reduce_row_mean_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB22_35;
+ @%p1 bra BB23_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd74, 0d0000000000000000;
mov.f64 %fd75, %fd74;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB22_4;
+ @%p2 bra BB23_4;
cvta.to.global.u64 %rd3, %rd1;
-BB22_3:
+BB23_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -3484,9 +3558,9 @@ BB22_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd74, %fd75;
- @%p3 bra BB22_3;
+ @%p3 bra BB23_3;
-BB22_4:
+BB23_4:
mov.f64 %fd72, %fd74;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -3496,130 +3570,130 @@ BB22_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB22_8;
+ @%p4 bra BB23_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd73, %fd72;
- @%p5 bra BB22_7;
+ @%p5 bra BB23_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd73, %fd72, %fd29;
st.shared.f64 [%rd8], %fd73;
-BB22_7:
+BB23_7:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB22_8:
+BB23_8:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB22_12;
+ @%p6 bra BB23_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd71, %fd70;
- @%p7 bra BB22_11;
+ @%p7 bra BB23_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd71, %fd70, %fd30;
st.shared.f64 [%rd8], %fd71;
-BB22_11:
+BB23_11:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB22_12:
+BB23_12:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB22_16;
+ @%p8 bra BB23_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd69, %fd68;
- @%p9 bra BB22_15;
+ @%p9 bra BB23
<TRUNCATED>