You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/02/08 19:15:32 UTC
[2/3] incubator-systemml git commit: [SYSTEMML-1039] Added variance,
row/col variance
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index dfff5dd..93f3879 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -10,7 +10,7 @@
.target sm_30
.address_size 64
- // .globl getBoolean
+ // .globl _Z6reduceI5SumOpEvPdS1_jT_d
.func (.param .b64 func_retval0) __internal_accurate_pow
(
.param .b64 __internal_accurate_pow_param_0,
@@ -19,307 +19,6 @@
;
.extern .shared .align 8 .b8 sdata[];
-.visible .func (.param .b64 func_retval0) getBoolean(
- .param .b32 getBoolean_param_0
-)
-{
- .reg .pred %p<2>;
- .reg .b32 %r<2>;
- .reg .f64 %fd<2>;
-
-
- ld.param.u32 %r1, [getBoolean_param_0];
- setp.eq.s32 %p1, %r1, 0;
- selp.f64 %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
- st.param.f64 [func_retval0+0], %fd1;
- ret;
-}
-
- // .globl binaryOp
-.visible .func (.param .b64 func_retval0) binaryOp(
- .param .b64 binaryOp_param_0,
- .param .b64 binaryOp_param_1,
- .param .b32 binaryOp_param_2
-)
-{
- .reg .pred %p<41>;
- .reg .b32 %r<30>;
- .reg .f64 %fd<40>;
- .reg .b64 %rd<3>;
-
-
- ld.param.f64 %fd26, [binaryOp_param_0];
- ld.param.f64 %fd27, [binaryOp_param_1];
- ld.param.u32 %r3, [binaryOp_param_2];
- setp.eq.s32 %p2, %r3, 0;
- @%p2 bra BB1_40;
-
- setp.eq.s32 %p3, %r3, 1;
- @%p3 bra BB1_39;
- bra.uni BB1_2;
-
-BB1_39:
- sub.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-
-BB1_40:
- add.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-
-BB1_2:
- setp.eq.s32 %p4, %r3, 2;
- @%p4 bra BB1_38;
- bra.uni BB1_3;
-
-BB1_38:
- mul.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-
-BB1_3:
- setp.eq.s32 %p5, %r3, 3;
- @%p5 bra BB1_37;
- bra.uni BB1_4;
-
-BB1_37:
- div.rn.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-
-BB1_4:
- setp.eq.s32 %p6, %r3, 4;
- @%p6 bra BB1_21;
- bra.uni BB1_5;
-
-BB1_21:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r1}, %fd26;
- }
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r2}, %fd27;
- }
- bfe.u32 %r4, %r2, 20, 11;
- add.s32 %r5, %r4, -1012;
- mov.b64 %rd2, %fd27;
- shl.b64 %rd1, %rd2, %r5;
- setp.eq.s64 %p21, %rd1, -9223372036854775808;
- abs.f64 %fd9, %fd26;
- // Callseq Start 0
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.f64 [param0+0], %fd9;
- .param .b64 param1;
- st.param.f64 [param1+0], %fd27;
- .param .b64 retval0;
- call.uni (retval0),
- __internal_accurate_pow,
- (
- param0,
- param1
- );
- ld.param.f64 %fd38, [retval0+0];
-
- //{
- }// Callseq End 0
- setp.lt.s32 %p22, %r1, 0;
- and.pred %p1, %p22, %p21;
- @!%p1 bra BB1_23;
- bra.uni BB1_22;
-
-BB1_22:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r6}, %fd38;
- }
- xor.b32 %r7, %r6, -2147483648;
- {
- .reg .b32 %temp;
- mov.b64 {%r8, %temp}, %fd38;
- }
- mov.b64 %fd38, {%r8, %r7};
-
-BB1_23:
- mov.f64 %fd37, %fd38;
- setp.eq.f64 %p23, %fd26, 0d0000000000000000;
- @%p23 bra BB1_26;
- bra.uni BB1_24;
-
-BB1_26:
- selp.b32 %r9, %r1, 0, %p21;
- or.b32 %r10, %r9, 2146435072;
- setp.lt.s32 %p27, %r2, 0;
- selp.b32 %r11, %r10, %r9, %p27;
- mov.u32 %r12, 0;
- mov.b64 %fd37, {%r12, %r11};
- bra.uni BB1_27;
-
-BB1_5:
- setp.eq.s32 %p7, %r3, 5;
- @%p7 bra BB1_20;
- bra.uni BB1_6;
-
-BB1_20:
- setp.lt.f64 %p20, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20;
- bra.uni BB1_41;
-
-BB1_6:
- setp.eq.s32 %p8, %r3, 6;
- @%p8 bra BB1_19;
- bra.uni BB1_7;
-
-BB1_19:
- setp.le.f64 %p19, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19;
- bra.uni BB1_41;
-
-BB1_24:
- setp.gt.s32 %p24, %r1, -1;
- @%p24 bra BB1_27;
-
- cvt.rzi.f64.f64 %fd29, %fd27;
- setp.neu.f64 %p25, %fd29, %fd27;
- selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p25;
-
-BB1_27:
- mov.f64 %fd15, %fd37;
- add.f64 %fd16, %fd26, %fd27;
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r13}, %fd16;
- }
- and.b32 %r14, %r13, 2146435072;
- setp.ne.s32 %p28, %r14, 2146435072;
- mov.f64 %fd36, %fd15;
- @%p28 bra BB1_36;
-
- setp.gtu.f64 %p29, %fd9, 0d7FF0000000000000;
- mov.f64 %fd36, %fd16;
- @%p29 bra BB1_36;
-
- abs.f64 %fd30, %fd27;
- setp.gtu.f64 %p30, %fd30, 0d7FF0000000000000;
- mov.f64 %fd35, %fd16;
- mov.f64 %fd36, %fd35;
- @%p30 bra BB1_36;
-
- and.b32 %r15, %r2, 2147483647;
- setp.ne.s32 %p31, %r15, 2146435072;
- @%p31 bra BB1_32;
-
- {
- .reg .b32 %temp;
- mov.b64 {%r16, %temp}, %fd27;
- }
- setp.eq.s32 %p32, %r16, 0;
- @%p32 bra BB1_35;
-
-BB1_32:
- and.b32 %r17, %r1, 2147483647;
- setp.ne.s32 %p33, %r17, 2146435072;
- mov.f64 %fd33, %fd15;
- mov.f64 %fd36, %fd33;
- @%p33 bra BB1_36;
-
- {
- .reg .b32 %temp;
- mov.b64 {%r18, %temp}, %fd26;
- }
- setp.ne.s32 %p34, %r18, 0;
- mov.f64 %fd36, %fd15;
- @%p34 bra BB1_36;
-
- shr.s32 %r19, %r2, 31;
- and.b32 %r20, %r19, -2146435072;
- add.s32 %r21, %r20, 2146435072;
- or.b32 %r22, %r21, -2147483648;
- selp.b32 %r23, %r22, %r21, %p1;
- mov.u32 %r24, 0;
- mov.b64 %fd36, {%r24, %r23};
- bra.uni BB1_36;
-
-BB1_7:
- setp.eq.s32 %p9, %r3, 7;
- @%p9 bra BB1_18;
- bra.uni BB1_8;
-
-BB1_18:
- setp.gt.f64 %p18, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18;
- bra.uni BB1_41;
-
-BB1_8:
- setp.eq.s32 %p10, %r3, 8;
- @%p10 bra BB1_17;
- bra.uni BB1_9;
-
-BB1_17:
- setp.ge.f64 %p17, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17;
- bra.uni BB1_41;
-
-BB1_9:
- setp.eq.s32 %p11, %r3, 9;
- @%p11 bra BB1_16;
- bra.uni BB1_10;
-
-BB1_16:
- setp.eq.f64 %p16, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16;
- bra.uni BB1_41;
-
-BB1_10:
- setp.eq.s32 %p12, %r3, 10;
- @%p12 bra BB1_15;
- bra.uni BB1_11;
-
-BB1_15:
- setp.neu.f64 %p15, %fd26, %fd27;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15;
- bra.uni BB1_41;
-
-BB1_35:
- setp.gt.f64 %p35, %fd9, 0d3FF0000000000000;
- selp.b32 %r25, 2146435072, 0, %p35;
- xor.b32 %r26, %r25, 2146435072;
- setp.lt.s32 %p36, %r2, 0;
- selp.b32 %r27, %r26, %r25, %p36;
- setp.eq.f64 %p37, %fd26, 0dBFF0000000000000;
- selp.b32 %r28, 1072693248, %r27, %p37;
- mov.u32 %r29, 0;
- mov.b64 %fd36, {%r29, %r28};
-
-BB1_36:
- setp.eq.f64 %p38, %fd27, 0d0000000000000000;
- setp.eq.f64 %p39, %fd26, 0d3FF0000000000000;
- or.pred %p40, %p39, %p38;
- selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p40;
-
-BB1_41:
- st.param.f64 [func_retval0+0], %fd39;
- ret;
-
-BB1_11:
- setp.eq.s32 %p13, %r3, 11;
- @%p13 bra BB1_14;
- bra.uni BB1_12;
-
-BB1_14:
- min.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-
-BB1_12:
- mov.f64 %fd39, 0dC08F380000000000;
- setp.ne.s32 %p14, %r3, 12;
- @%p14 bra BB1_41;
-
- max.f64 %fd39, %fd26, %fd27;
- bra.uni BB1_41;
-}
-
- // .globl _Z6reduceI5SumOpEvPdS1_jT_d
.visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0,
.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1,
@@ -344,11 +43,11 @@ BB1_12:
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB2_5;
+ @%p1 bra BB0_5;
mov.f64 %fd77, %fd76;
-BB2_2:
+BB0_2:
mov.f64 %fd1, %fd77;
mul.wide.u32 %rd4, %r32, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -356,23 +55,23 @@ BB2_2:
add.f64 %fd78, %fd1, %fd29;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB2_4;
+ @%p2 bra BB0_4;
mul.wide.u32 %rd6, %r3, 8;
add.s64 %rd7, %rd2, %rd6;
ld.f64 %fd30, [%rd7];
add.f64 %fd78, %fd78, %fd30;
-BB2_4:
+BB0_4:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB2_2;
+ @%p3 bra BB0_2;
-BB2_5:
+BB0_5:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd8, %r6, 8;
mov.u64 %rd9, sdata;
@@ -380,137 +79,137 @@ BB2_5:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB2_9;
+ @%p4 bra BB0_9;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB2_8;
+ @%p5 bra BB0_8;
ld.shared.f64 %fd31, [%rd1+4096];
add.f64 %fd75, %fd74, %fd31;
st.shared.f64 [%rd1], %fd75;
-BB2_8:
+BB0_8:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB2_9:
+BB0_9:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB2_13;
+ @%p6 bra BB0_13;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB2_12;
+ @%p7 bra BB0_12;
ld.shared.f64 %fd32, [%rd1+2048];
add.f64 %fd73, %fd72, %fd32;
st.shared.f64 [%rd1], %fd73;
-BB2_12:
+BB0_12:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB2_13:
+BB0_13:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB2_17;
+ @%p8 bra BB0_17;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB2_16;
+ @%p9 bra BB0_16;
ld.shared.f64 %fd33, [%rd1+1024];
add.f64 %fd71, %fd70, %fd33;
st.shared.f64 [%rd1], %fd71;
-BB2_16:
+BB0_16:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB2_17:
+BB0_17:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB2_21;
+ @%p10 bra BB0_21;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB2_20;
+ @%p11 bra BB0_20;
ld.shared.f64 %fd34, [%rd1+512];
add.f64 %fd69, %fd68, %fd34;
st.shared.f64 [%rd1], %fd69;
-BB2_20:
+BB0_20:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB2_21:
+BB0_21:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB2_34;
+ @%p12 bra BB0_34;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB2_24;
+ @%p13 bra BB0_24;
ld.volatile.shared.f64 %fd35, [%rd1+256];
add.f64 %fd67, %fd67, %fd35;
st.volatile.shared.f64 [%rd1], %fd67;
-BB2_24:
+BB0_24:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB2_26;
+ @%p14 bra BB0_26;
ld.volatile.shared.f64 %fd36, [%rd1+128];
add.f64 %fd66, %fd66, %fd36;
st.volatile.shared.f64 [%rd1], %fd66;
-BB2_26:
+BB0_26:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB2_28;
+ @%p15 bra BB0_28;
ld.volatile.shared.f64 %fd37, [%rd1+64];
add.f64 %fd65, %fd65, %fd37;
st.volatile.shared.f64 [%rd1], %fd65;
-BB2_28:
+BB0_28:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB2_30;
+ @%p16 bra BB0_30;
ld.volatile.shared.f64 %fd38, [%rd1+32];
add.f64 %fd64, %fd64, %fd38;
st.volatile.shared.f64 [%rd1], %fd64;
-BB2_30:
+BB0_30:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB2_32;
+ @%p17 bra BB0_32;
ld.volatile.shared.f64 %fd39, [%rd1+16];
add.f64 %fd63, %fd63, %fd39;
st.volatile.shared.f64 [%rd1], %fd63;
-BB2_32:
+BB0_32:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB2_34;
+ @%p18 bra BB0_34;
ld.volatile.shared.f64 %fd40, [%rd1+8];
add.f64 %fd41, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd41;
-BB2_34:
+BB0_34:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB2_36;
+ @%p19 bra BB0_36;
ld.shared.f64 %fd42, [sdata];
mul.wide.u32 %rd10, %r7, 8;
add.s64 %rd11, %rd3, %rd10;
st.f64 [%rd11], %fd42;
-BB2_36:
+BB0_36:
ret;
}
@@ -538,14 +237,14 @@ BB2_36:
ld.param.f64 %fd40, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
mov.u32 %r1, %ctaid.x;
setp.ge.u32 %p1, %r1, %r7;
- @%p1 bra BB3_34;
+ @%p1 bra BB1_34;
mov.u32 %r28, %tid.x;
mul.lo.s32 %r3, %r1, %r6;
setp.ge.u32 %p2, %r28, %r6;
- @%p2 bra BB3_3;
+ @%p2 bra BB1_3;
-BB3_2:
+BB1_2:
add.s32 %r8, %r28, %r3;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -554,9 +253,9 @@ BB3_2:
mov.u32 %r9, %ntid.x;
add.s32 %r28, %r9, %r28;
setp.lt.u32 %p3, %r28, %r6;
- @%p3 bra BB3_2;
+ @%p3 bra BB1_2;
-BB3_3:
+BB1_3:
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
@@ -565,121 +264,121 @@ BB3_3:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB3_7;
+ @%p4 bra BB1_7;
setp.gt.u32 %p5, %r10, 511;
- @%p5 bra BB3_6;
+ @%p5 bra BB1_6;
ld.shared.f64 %fd28, [%rd1+4096];
add.f64 %fd40, %fd40, %fd28;
st.shared.f64 [%rd1], %fd40;
-BB3_6:
+BB1_6:
bar.sync 0;
-BB3_7:
+BB1_7:
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB3_11;
+ @%p6 bra BB1_11;
setp.gt.u32 %p7, %r10, 255;
- @%p7 bra BB3_10;
+ @%p7 bra BB1_10;
ld.shared.f64 %fd29, [%rd1+2048];
add.f64 %fd40, %fd40, %fd29;
st.shared.f64 [%rd1], %fd40;
-BB3_10:
+BB1_10:
bar.sync 0;
-BB3_11:
+BB1_11:
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB3_15;
+ @%p8 bra BB1_15;
setp.gt.u32 %p9, %r10, 127;
- @%p9 bra BB3_14;
+ @%p9 bra BB1_14;
ld.shared.f64 %fd30, [%rd1+1024];
add.f64 %fd40, %fd40, %fd30;
st.shared.f64 [%rd1], %fd40;
-BB3_14:
+BB1_14:
bar.sync 0;
-BB3_15:
+BB1_15:
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB3_19;
+ @%p10 bra BB1_19;
setp.gt.u32 %p11, %r10, 63;
- @%p11 bra BB3_18;
+ @%p11 bra BB1_18;
ld.shared.f64 %fd31, [%rd1+512];
add.f64 %fd40, %fd40, %fd31;
st.shared.f64 [%rd1], %fd40;
-BB3_18:
+BB1_18:
bar.sync 0;
-BB3_19:
+BB1_19:
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB3_32;
+ @%p12 bra BB1_32;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB3_22;
+ @%p13 bra BB1_22;
ld.volatile.shared.f64 %fd32, [%rd1+256];
add.f64 %fd40, %fd40, %fd32;
st.volatile.shared.f64 [%rd1], %fd40;
-BB3_22:
+BB1_22:
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB3_24;
+ @%p14 bra BB1_24;
ld.volatile.shared.f64 %fd33, [%rd1+128];
add.f64 %fd40, %fd40, %fd33;
st.volatile.shared.f64 [%rd1], %fd40;
-BB3_24:
+BB1_24:
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB3_26;
+ @%p15 bra BB1_26;
ld.volatile.shared.f64 %fd34, [%rd1+64];
add.f64 %fd40, %fd40, %fd34;
st.volatile.shared.f64 [%rd1], %fd40;
-BB3_26:
+BB1_26:
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB3_28;
+ @%p16 bra BB1_28;
ld.volatile.shared.f64 %fd35, [%rd1+32];
add.f64 %fd40, %fd40, %fd35;
st.volatile.shared.f64 [%rd1], %fd40;
-BB3_28:
+BB1_28:
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB3_30;
+ @%p17 bra BB1_30;
ld.volatile.shared.f64 %fd36, [%rd1+16];
add.f64 %fd40, %fd40, %fd36;
st.volatile.shared.f64 [%rd1], %fd40;
-BB3_30:
+BB1_30:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB3_32;
+ @%p18 bra BB1_32;
ld.volatile.shared.f64 %fd37, [%rd1+8];
add.f64 %fd38, %fd40, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
-BB3_32:
+BB1_32:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB3_34;
+ @%p19 bra BB1_34;
ld.shared.f64 %fd39, [sdata];
mul.wide.u32 %rd8, %r1, 8;
add.s64 %rd9, %rd3, %rd8;
st.f64 [%rd9], %fd39;
-BB3_34:
+BB1_34:
ret;
}
@@ -710,15 +409,15 @@ BB3_34:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r8, %r7, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB4_5;
+ @%p1 bra BB2_5;
mul.lo.s32 %r2, %r6, %r5;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB4_4;
+ @%p2 bra BB2_4;
mov.u32 %r10, %r1;
-BB4_3:
+BB2_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd3, %r3, 8;
add.s64 %rd4, %rd1, %rd3;
@@ -727,14 +426,14 @@ BB4_3:
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
- @%p3 bra BB4_3;
+ @%p3 bra BB2_3;
-BB4_4:
+BB2_4:
mul.wide.u32 %rd5, %r1, 8;
add.s64 %rd6, %rd2, %rd5;
st.f64 [%rd6], %fd6;
-BB4_5:
+BB2_5:
ret;
}
@@ -763,11 +462,11 @@ BB4_5:
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB5_5;
+ @%p1 bra BB3_5;
mov.f64 %fd77, %fd76;
-BB5_2:
+BB3_2:
mov.f64 %fd1, %fd77;
mul.wide.u32 %rd4, %r32, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -775,23 +474,23 @@ BB5_2:
max.f64 %fd78, %fd1, %fd29;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB5_4;
+ @%p2 bra BB3_4;
mul.wide.u32 %rd6, %r3, 8;
add.s64 %rd7, %rd2, %rd6;
ld.f64 %fd30, [%rd7];
max.f64 %fd78, %fd78, %fd30;
-BB5_4:
+BB3_4:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB5_2;
+ @%p3 bra BB3_2;
-BB5_5:
+BB3_5:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd8, %r6, 8;
mov.u64 %rd9, sdata;
@@ -799,137 +498,137 @@ BB5_5:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB5_9;
+ @%p4 bra BB3_9;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB5_8;
+ @%p5 bra BB3_8;
ld.shared.f64 %fd31, [%rd1+4096];
max.f64 %fd75, %fd74, %fd31;
st.shared.f64 [%rd1], %fd75;
-BB5_8:
+BB3_8:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB5_9:
+BB3_9:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB5_13;
+ @%p6 bra BB3_13;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB5_12;
+ @%p7 bra BB3_12;
ld.shared.f64 %fd32, [%rd1+2048];
max.f64 %fd73, %fd72, %fd32;
st.shared.f64 [%rd1], %fd73;
-BB5_12:
+BB3_12:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB5_13:
+BB3_13:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB5_17;
+ @%p8 bra BB3_17;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB5_16;
+ @%p9 bra BB3_16;
ld.shared.f64 %fd33, [%rd1+1024];
max.f64 %fd71, %fd70, %fd33;
st.shared.f64 [%rd1], %fd71;
-BB5_16:
+BB3_16:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB5_17:
+BB3_17:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB5_21;
+ @%p10 bra BB3_21;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB5_20;
+ @%p11 bra BB3_20;
ld.shared.f64 %fd34, [%rd1+512];
max.f64 %fd69, %fd68, %fd34;
st.shared.f64 [%rd1], %fd69;
-BB5_20:
+BB3_20:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB5_21:
+BB3_21:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB5_34;
+ @%p12 bra BB3_34;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB5_24;
+ @%p13 bra BB3_24;
ld.volatile.shared.f64 %fd35, [%rd1+256];
max.f64 %fd67, %fd67, %fd35;
st.volatile.shared.f64 [%rd1], %fd67;
-BB5_24:
+BB3_24:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB5_26;
+ @%p14 bra BB3_26;
ld.volatile.shared.f64 %fd36, [%rd1+128];
max.f64 %fd66, %fd66, %fd36;
st.volatile.shared.f64 [%rd1], %fd66;
-BB5_26:
+BB3_26:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB5_28;
+ @%p15 bra BB3_28;
ld.volatile.shared.f64 %fd37, [%rd1+64];
max.f64 %fd65, %fd65, %fd37;
st.volatile.shared.f64 [%rd1], %fd65;
-BB5_28:
+BB3_28:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB5_30;
+ @%p16 bra BB3_30;
ld.volatile.shared.f64 %fd38, [%rd1+32];
max.f64 %fd64, %fd64, %fd38;
st.volatile.shared.f64 [%rd1], %fd64;
-BB5_30:
+BB3_30:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB5_32;
+ @%p17 bra BB3_32;
ld.volatile.shared.f64 %fd39, [%rd1+16];
max.f64 %fd63, %fd63, %fd39;
st.volatile.shared.f64 [%rd1], %fd63;
-BB5_32:
+BB3_32:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB5_34;
+ @%p18 bra BB3_34;
ld.volatile.shared.f64 %fd40, [%rd1+8];
max.f64 %fd41, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd41;
-BB5_34:
+BB3_34:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB5_36;
+ @%p19 bra BB3_36;
ld.shared.f64 %fd42, [sdata];
mul.wide.u32 %rd10, %r7, 8;
add.s64 %rd11, %rd3, %rd10;
st.f64 [%rd11], %fd42;
-BB5_36:
+BB3_36:
ret;
}
@@ -957,14 +656,14 @@ BB5_36:
ld.param.f64 %fd40, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
mov.u32 %r1, %ctaid.x;
setp.ge.u32 %p1, %r1, %r7;
- @%p1 bra BB6_34;
+ @%p1 bra BB4_34;
mov.u32 %r28, %tid.x;
mul.lo.s32 %r3, %r1, %r6;
setp.ge.u32 %p2, %r28, %r6;
- @%p2 bra BB6_3;
+ @%p2 bra BB4_3;
-BB6_2:
+BB4_2:
add.s32 %r8, %r28, %r3;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -973,9 +672,9 @@ BB6_2:
mov.u32 %r9, %ntid.x;
add.s32 %r28, %r9, %r28;
setp.lt.u32 %p3, %r28, %r6;
- @%p3 bra BB6_2;
+ @%p3 bra BB4_2;
-BB6_3:
+BB4_3:
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
@@ -984,121 +683,121 @@ BB6_3:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB6_7;
+ @%p4 bra BB4_7;
setp.gt.u32 %p5, %r10, 511;
- @%p5 bra BB6_6;
+ @%p5 bra BB4_6;
ld.shared.f64 %fd28, [%rd1+4096];
max.f64 %fd40, %fd40, %fd28;
st.shared.f64 [%rd1], %fd40;
-BB6_6:
+BB4_6:
bar.sync 0;
-BB6_7:
+BB4_7:
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB6_11;
+ @%p6 bra BB4_11;
setp.gt.u32 %p7, %r10, 255;
- @%p7 bra BB6_10;
+ @%p7 bra BB4_10;
ld.shared.f64 %fd29, [%rd1+2048];
max.f64 %fd40, %fd40, %fd29;
st.shared.f64 [%rd1], %fd40;
-BB6_10:
+BB4_10:
bar.sync 0;
-BB6_11:
+BB4_11:
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB6_15;
+ @%p8 bra BB4_15;
setp.gt.u32 %p9, %r10, 127;
- @%p9 bra BB6_14;
+ @%p9 bra BB4_14;
ld.shared.f64 %fd30, [%rd1+1024];
max.f64 %fd40, %fd40, %fd30;
st.shared.f64 [%rd1], %fd40;
-BB6_14:
+BB4_14:
bar.sync 0;
-BB6_15:
+BB4_15:
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB6_19;
+ @%p10 bra BB4_19;
setp.gt.u32 %p11, %r10, 63;
- @%p11 bra BB6_18;
+ @%p11 bra BB4_18;
ld.shared.f64 %fd31, [%rd1+512];
max.f64 %fd40, %fd40, %fd31;
st.shared.f64 [%rd1], %fd40;
-BB6_18:
+BB4_18:
bar.sync 0;
-BB6_19:
+BB4_19:
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB6_32;
+ @%p12 bra BB4_32;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB6_22;
+ @%p13 bra BB4_22;
ld.volatile.shared.f64 %fd32, [%rd1+256];
max.f64 %fd40, %fd40, %fd32;
st.volatile.shared.f64 [%rd1], %fd40;
-BB6_22:
+BB4_22:
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB6_24;
+ @%p14 bra BB4_24;
ld.volatile.shared.f64 %fd33, [%rd1+128];
max.f64 %fd40, %fd40, %fd33;
st.volatile.shared.f64 [%rd1], %fd40;
-BB6_24:
+BB4_24:
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB6_26;
+ @%p15 bra BB4_26;
ld.volatile.shared.f64 %fd34, [%rd1+64];
max.f64 %fd40, %fd40, %fd34;
st.volatile.shared.f64 [%rd1], %fd40;
-BB6_26:
+BB4_26:
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB6_28;
+ @%p16 bra BB4_28;
ld.volatile.shared.f64 %fd35, [%rd1+32];
max.f64 %fd40, %fd40, %fd35;
st.volatile.shared.f64 [%rd1], %fd40;
-BB6_28:
+BB4_28:
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB6_30;
+ @%p17 bra BB4_30;
ld.volatile.shared.f64 %fd36, [%rd1+16];
max.f64 %fd40, %fd40, %fd36;
st.volatile.shared.f64 [%rd1], %fd40;
-BB6_30:
+BB4_30:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB6_32;
+ @%p18 bra BB4_32;
ld.volatile.shared.f64 %fd37, [%rd1+8];
max.f64 %fd38, %fd40, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
-BB6_32:
+BB4_32:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB6_34;
+ @%p19 bra BB4_34;
ld.shared.f64 %fd39, [sdata];
mul.wide.u32 %rd8, %r1, 8;
add.s64 %rd9, %rd3, %rd8;
st.f64 [%rd9], %fd39;
-BB6_34:
+BB4_34:
ret;
}
@@ -1129,15 +828,15 @@ BB6_34:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r8, %r7, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB7_5;
+ @%p1 bra BB5_5;
mul.lo.s32 %r2, %r6, %r5;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB7_4;
+ @%p2 bra BB5_4;
mov.u32 %r10, %r1;
-BB7_3:
+BB5_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd3, %r3, 8;
add.s64 %rd4, %rd1, %rd3;
@@ -1146,14 +845,14 @@ BB7_3:
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
- @%p3 bra BB7_3;
+ @%p3 bra BB5_3;
-BB7_4:
+BB5_4:
mul.wide.u32 %rd5, %r1, 8;
add.s64 %rd6, %rd2, %rd5;
st.f64 [%rd6], %fd6;
-BB7_5:
+BB5_5:
ret;
}
@@ -1182,11 +881,11 @@ BB7_5:
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB8_5;
+ @%p1 bra BB6_5;
mov.f64 %fd77, %fd76;
-BB8_2:
+BB6_2:
mov.f64 %fd1, %fd77;
mul.wide.u32 %rd4, %r32, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -1194,23 +893,23 @@ BB8_2:
min.f64 %fd78, %fd1, %fd29;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB8_4;
+ @%p2 bra BB6_4;
mul.wide.u32 %rd6, %r3, 8;
add.s64 %rd7, %rd2, %rd6;
ld.f64 %fd30, [%rd7];
min.f64 %fd78, %fd78, %fd30;
-BB8_4:
+BB6_4:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB8_2;
+ @%p3 bra BB6_2;
-BB8_5:
+BB6_5:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd8, %r6, 8;
mov.u64 %rd9, sdata;
@@ -1218,137 +917,137 @@ BB8_5:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB8_9;
+ @%p4 bra BB6_9;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB8_8;
+ @%p5 bra BB6_8;
ld.shared.f64 %fd31, [%rd1+4096];
min.f64 %fd75, %fd74, %fd31;
st.shared.f64 [%rd1], %fd75;
-BB8_8:
+BB6_8:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB8_9:
+BB6_9:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB8_13;
+ @%p6 bra BB6_13;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB8_12;
+ @%p7 bra BB6_12;
ld.shared.f64 %fd32, [%rd1+2048];
min.f64 %fd73, %fd72, %fd32;
st.shared.f64 [%rd1], %fd73;
-BB8_12:
+BB6_12:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB8_13:
+BB6_13:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB8_17;
+ @%p8 bra BB6_17;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB8_16;
+ @%p9 bra BB6_16;
ld.shared.f64 %fd33, [%rd1+1024];
min.f64 %fd71, %fd70, %fd33;
st.shared.f64 [%rd1], %fd71;
-BB8_16:
+BB6_16:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB8_17:
+BB6_17:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB8_21;
+ @%p10 bra BB6_21;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB8_20;
+ @%p11 bra BB6_20;
ld.shared.f64 %fd34, [%rd1+512];
min.f64 %fd69, %fd68, %fd34;
st.shared.f64 [%rd1], %fd69;
-BB8_20:
+BB6_20:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB8_21:
+BB6_21:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB8_34;
+ @%p12 bra BB6_34;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB8_24;
+ @%p13 bra BB6_24;
ld.volatile.shared.f64 %fd35, [%rd1+256];
min.f64 %fd67, %fd67, %fd35;
st.volatile.shared.f64 [%rd1], %fd67;
-BB8_24:
+BB6_24:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB8_26;
+ @%p14 bra BB6_26;
ld.volatile.shared.f64 %fd36, [%rd1+128];
min.f64 %fd66, %fd66, %fd36;
st.volatile.shared.f64 [%rd1], %fd66;
-BB8_26:
+BB6_26:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB8_28;
+ @%p15 bra BB6_28;
ld.volatile.shared.f64 %fd37, [%rd1+64];
min.f64 %fd65, %fd65, %fd37;
st.volatile.shared.f64 [%rd1], %fd65;
-BB8_28:
+BB6_28:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB8_30;
+ @%p16 bra BB6_30;
ld.volatile.shared.f64 %fd38, [%rd1+32];
min.f64 %fd64, %fd64, %fd38;
st.volatile.shared.f64 [%rd1], %fd64;
-BB8_30:
+BB6_30:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB8_32;
+ @%p17 bra BB6_32;
ld.volatile.shared.f64 %fd39, [%rd1+16];
min.f64 %fd63, %fd63, %fd39;
st.volatile.shared.f64 [%rd1], %fd63;
-BB8_32:
+BB6_32:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB8_34;
+ @%p18 bra BB6_34;
ld.volatile.shared.f64 %fd40, [%rd1+8];
min.f64 %fd41, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd41;
-BB8_34:
+BB6_34:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB8_36;
+ @%p19 bra BB6_36;
ld.shared.f64 %fd42, [sdata];
mul.wide.u32 %rd10, %r7, 8;
add.s64 %rd11, %rd3, %rd10;
st.f64 [%rd11], %fd42;
-BB8_36:
+BB6_36:
ret;
}
@@ -1376,14 +1075,14 @@ BB8_36:
ld.param.f64 %fd40, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
mov.u32 %r1, %ctaid.x;
setp.ge.u32 %p1, %r1, %r7;
- @%p1 bra BB9_34;
+ @%p1 bra BB7_34;
mov.u32 %r28, %tid.x;
mul.lo.s32 %r3, %r1, %r6;
setp.ge.u32 %p2, %r28, %r6;
- @%p2 bra BB9_3;
+ @%p2 bra BB7_3;
-BB9_2:
+BB7_2:
add.s32 %r8, %r28, %r3;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -1392,9 +1091,9 @@ BB9_2:
mov.u32 %r9, %ntid.x;
add.s32 %r28, %r9, %r28;
setp.lt.u32 %p3, %r28, %r6;
- @%p3 bra BB9_2;
+ @%p3 bra BB7_2;
-BB9_3:
+BB7_3:
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
@@ -1403,121 +1102,121 @@ BB9_3:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB9_7;
+ @%p4 bra BB7_7;
setp.gt.u32 %p5, %r10, 511;
- @%p5 bra BB9_6;
+ @%p5 bra BB7_6;
ld.shared.f64 %fd28, [%rd1+4096];
min.f64 %fd40, %fd40, %fd28;
st.shared.f64 [%rd1], %fd40;
-BB9_6:
+BB7_6:
bar.sync 0;
-BB9_7:
+BB7_7:
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB9_11;
+ @%p6 bra BB7_11;
setp.gt.u32 %p7, %r10, 255;
- @%p7 bra BB9_10;
+ @%p7 bra BB7_10;
ld.shared.f64 %fd29, [%rd1+2048];
min.f64 %fd40, %fd40, %fd29;
st.shared.f64 [%rd1], %fd40;
-BB9_10:
+BB7_10:
bar.sync 0;
-BB9_11:
+BB7_11:
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB9_15;
+ @%p8 bra BB7_15;
setp.gt.u32 %p9, %r10, 127;
- @%p9 bra BB9_14;
+ @%p9 bra BB7_14;
ld.shared.f64 %fd30, [%rd1+1024];
min.f64 %fd40, %fd40, %fd30;
st.shared.f64 [%rd1], %fd40;
-BB9_14:
+BB7_14:
bar.sync 0;
-BB9_15:
+BB7_15:
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB9_19;
+ @%p10 bra BB7_19;
setp.gt.u32 %p11, %r10, 63;
- @%p11 bra BB9_18;
+ @%p11 bra BB7_18;
ld.shared.f64 %fd31, [%rd1+512];
min.f64 %fd40, %fd40, %fd31;
st.shared.f64 [%rd1], %fd40;
-BB9_18:
+BB7_18:
bar.sync 0;
-BB9_19:
+BB7_19:
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB9_32;
+ @%p12 bra BB7_32;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB9_22;
+ @%p13 bra BB7_22;
ld.volatile.shared.f64 %fd32, [%rd1+256];
min.f64 %fd40, %fd40, %fd32;
st.volatile.shared.f64 [%rd1], %fd40;
-BB9_22:
+BB7_22:
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB9_24;
+ @%p14 bra BB7_24;
ld.volatile.shared.f64 %fd33, [%rd1+128];
min.f64 %fd40, %fd40, %fd33;
st.volatile.shared.f64 [%rd1], %fd40;
-BB9_24:
+BB7_24:
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB9_26;
+ @%p15 bra BB7_26;
ld.volatile.shared.f64 %fd34, [%rd1+64];
min.f64 %fd40, %fd40, %fd34;
st.volatile.shared.f64 [%rd1], %fd40;
-BB9_26:
+BB7_26:
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB9_28;
+ @%p16 bra BB7_28;
ld.volatile.shared.f64 %fd35, [%rd1+32];
min.f64 %fd40, %fd40, %fd35;
st.volatile.shared.f64 [%rd1], %fd40;
-BB9_28:
+BB7_28:
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB9_30;
+ @%p17 bra BB7_30;
ld.volatile.shared.f64 %fd36, [%rd1+16];
min.f64 %fd40, %fd40, %fd36;
st.volatile.shared.f64 [%rd1], %fd40;
-BB9_30:
+BB7_30:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB9_32;
+ @%p18 bra BB7_32;
ld.volatile.shared.f64 %fd37, [%rd1+8];
min.f64 %fd38, %fd40, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
-BB9_32:
+BB7_32:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB9_34;
+ @%p19 bra BB7_34;
ld.shared.f64 %fd39, [sdata];
mul.wide.u32 %rd8, %r1, 8;
add.s64 %rd9, %rd3, %rd8;
st.f64 [%rd9], %fd39;
-BB9_34:
+BB7_34:
ret;
}
@@ -1548,15 +1247,15 @@ BB9_34:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r8, %r7, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB10_5;
+ @%p1 bra BB8_5;
mul.lo.s32 %r2, %r6, %r5;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB10_4;
+ @%p2 bra BB8_4;
mov.u32 %r10, %r1;
-BB10_3:
+BB8_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd3, %r3, 8;
add.s64 %rd4, %rd1, %rd3;
@@ -1565,14 +1264,14 @@ BB10_3:
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
- @%p3 bra BB10_3;
+ @%p3 bra BB8_3;
-BB10_4:
+BB8_4:
mul.wide.u32 %rd5, %r1, 8;
add.s64 %rd6, %rd2, %rd5;
st.f64 [%rd6], %fd6;
-BB10_5:
+BB8_5:
ret;
}
@@ -1601,11 +1300,11 @@ BB10_5:
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB11_5;
+ @%p1 bra BB9_5;
mov.f64 %fd77, %fd76;
-BB11_2:
+BB9_2:
mov.f64 %fd1, %fd77;
mul.wide.u32 %rd4, %r32, 8;
add.s64 %rd5, %rd2, %rd4;
@@ -1613,23 +1312,23 @@ BB11_2:
mul.f64 %fd78, %fd1, %fd29;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB11_4;
+ @%p2 bra BB9_4;
mul.wide.u32 %rd6, %r3, 8;
add.s64 %rd7, %rd2, %rd6;
ld.f64 %fd30, [%rd7];
mul.f64 %fd78, %fd78, %fd30;
-BB11_4:
+BB9_4:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB11_2;
+ @%p3 bra BB9_2;
-BB11_5:
+BB9_5:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd8, %r6, 8;
mov.u64 %rd9, sdata;
@@ -1637,137 +1336,137 @@ BB11_5:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB11_9;
+ @%p4 bra BB9_9;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB11_8;
+ @%p5 bra BB9_8;
ld.shared.f64 %fd31, [%rd1+4096];
mul.f64 %fd75, %fd74, %fd31;
st.shared.f64 [%rd1], %fd75;
-BB11_8:
+BB9_8:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB11_9:
+BB9_9:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB11_13;
+ @%p6 bra BB9_13;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB11_12;
+ @%p7 bra BB9_12;
ld.shared.f64 %fd32, [%rd1+2048];
mul.f64 %fd73, %fd72, %fd32;
st.shared.f64 [%rd1], %fd73;
-BB11_12:
+BB9_12:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB11_13:
+BB9_13:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB11_17;
+ @%p8 bra BB9_17;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB11_16;
+ @%p9 bra BB9_16;
ld.shared.f64 %fd33, [%rd1+1024];
mul.f64 %fd71, %fd70, %fd33;
st.shared.f64 [%rd1], %fd71;
-BB11_16:
+BB9_16:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB11_17:
+BB9_17:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB11_21;
+ @%p10 bra BB9_21;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB11_20;
+ @%p11 bra BB9_20;
ld.shared.f64 %fd34, [%rd1+512];
mul.f64 %fd69, %fd68, %fd34;
st.shared.f64 [%rd1], %fd69;
-BB11_20:
+BB9_20:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB11_21:
+BB9_21:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB11_34;
+ @%p12 bra BB9_34;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB11_24;
+ @%p13 bra BB9_24;
ld.volatile.shared.f64 %fd35, [%rd1+256];
mul.f64 %fd67, %fd67, %fd35;
st.volatile.shared.f64 [%rd1], %fd67;
-BB11_24:
+BB9_24:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB11_26;
+ @%p14 bra BB9_26;
ld.volatile.shared.f64 %fd36, [%rd1+128];
mul.f64 %fd66, %fd66, %fd36;
st.volatile.shared.f64 [%rd1], %fd66;
-BB11_26:
+BB9_26:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB11_28;
+ @%p15 bra BB9_28;
ld.volatile.shared.f64 %fd37, [%rd1+64];
mul.f64 %fd65, %fd65, %fd37;
st.volatile.shared.f64 [%rd1], %fd65;
-BB11_28:
+BB9_28:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB11_30;
+ @%p16 bra BB9_30;
ld.volatile.shared.f64 %fd38, [%rd1+32];
mul.f64 %fd64, %fd64, %fd38;
st.volatile.shared.f64 [%rd1], %fd64;
-BB11_30:
+BB9_30:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB11_32;
+ @%p17 bra BB9_32;
ld.volatile.shared.f64 %fd39, [%rd1+16];
mul.f64 %fd63, %fd63, %fd39;
st.volatile.shared.f64 [%rd1], %fd63;
-BB11_32:
+BB9_32:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB11_34;
+ @%p18 bra BB9_34;
ld.volatile.shared.f64 %fd40, [%rd1+8];
mul.f64 %fd41, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd41;
-BB11_34:
+BB9_34:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB11_36;
+ @%p19 bra BB9_36;
ld.shared.f64 %fd42, [sdata];
mul.wide.u32 %rd10, %r7, 8;
add.s64 %rd11, %rd3, %rd10;
st.f64 [%rd11], %fd42;
-BB11_36:
+BB9_36:
ret;
}
@@ -1796,14 +1495,14 @@ BB11_36:
ld.param.f64 %fd42, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];
mov.u32 %r7, %ctaid.x;
setp.ge.u32 %p1, %r7, %r6;
- @%p1 bra BB12_34;
+ @%p1 bra BB10_34;
mov.u32 %r29, %tid.x;
mul.lo.s32 %r2, %r7, %r5;
setp.ge.u32 %p2, %r29, %r5;
- @%p2 bra BB12_3;
+ @%p2 bra BB10_3;
-BB12_2:
+BB10_2:
add.s32 %r9, %r29, %r2;
mul.wide.u32 %rd5, %r9, 8;
add.s64 %rd6, %rd2, %rd5;
@@ -1812,9 +1511,9 @@ BB12_2:
mov.u32 %r10, %ntid.x;
add.s32 %r29, %r10, %r29;
setp.lt.u32 %p3, %r29, %r5;
- @%p3 bra BB12_2;
+ @%p3 bra BB10_2;
-BB12_3:
+BB10_3:
mov.u32 %r11, %tid.x;
mul.wide.u32 %rd7, %r11, 8;
mov.u64 %rd8, sdata;
@@ -1823,114 +1522,114 @@ BB12_3:
bar.sync 0;
mov.u32 %r12, %ntid.x;
setp.lt.u32 %p4, %r12, 1024;
- @%p4 bra BB12_7;
+ @%p4 bra BB10_7;
setp.gt.u32 %p5, %r11, 511;
- @%p5 bra BB12_6;
+ @%p5 bra BB10_6;
ld.shared.f64 %fd28, [%rd1+4096];
add.f64 %fd42, %fd42, %fd28;
st.shared.f64 [%rd1], %fd42;
-BB12_6:
+BB10_6:
bar.sync 0;
-BB12_7:
+BB10_7:
setp.lt.u32 %p6, %r12, 512;
- @%p6 bra BB12_11;
+ @%p6 bra BB10_11;
setp.gt.u32 %p7, %r11, 255;
- @%p7 bra BB12_10;
+ @%p7 bra BB10_10;
ld.shared.f64 %fd29, [%rd1+2048];
add.f64 %fd42, %fd42, %fd29;
st.shared.f64 [%rd1], %fd42;
-BB12_10:
+BB10_10:
bar.sync 0;
-BB12_11:
+BB10_11:
setp.lt.u32 %p8, %r12, 256;
- @%p8 bra BB12_15;
+ @%p8 bra BB10_15;
setp.gt.u32 %p9, %r11, 127;
- @%p9 bra BB12_14;
+ @%p9 bra BB10_14;
ld.shared.f64 %fd30, [%rd1+1024];
add.f64 %fd42, %fd42, %fd30;
st.shared.f64 [%rd1], %fd42;
-BB12_14:
+BB10_14:
bar.sync 0;
-BB12_15:
+BB10_15:
setp.lt.u32 %p10, %r12, 128;
- @%p10 bra BB12_19;
+ @%p10 bra BB10_19;
setp.gt.u32 %p11, %r11, 63;
- @%p11 bra BB12_18;
+ @%p11 bra BB10_18;
ld.shared.f64 %fd31, [%rd1+512];
add.f64 %fd42, %fd42, %fd31;
st.shared.f64 [%rd1], %fd42;
-BB12_18:
+BB10_18:
bar.sync 0;
-BB12_19:
+BB10_19:
setp.gt.u32 %p12, %r11, 31;
- @%p12 bra BB12_32;
+ @%p12 bra BB10_32;
setp.lt.u32 %p13, %r12, 64;
- @%p13 bra BB12_22;
+ @%p13 bra BB10_22;
ld.volatile.shared.f64 %fd32, [%rd1+256];
add.f64 %fd42, %fd42, %fd32;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_22:
+BB10_22:
setp.lt.u32 %p14, %r12, 32;
- @%p14 bra BB12_24;
+ @%p14 bra BB10_24;
ld.volatile.shared.f64 %fd33, [%rd1+128];
add.f64 %fd42, %fd42, %fd33;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_24:
+BB10_24:
setp.lt.u32 %p15, %r12, 16;
- @%p15 bra BB12_26;
+ @%p15 bra BB10_26;
ld.volatile.shared.f64 %fd34, [%rd1+64];
add.f64 %fd42, %fd42, %fd34;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_26:
+BB10_26:
setp.lt.u32 %p16, %r12, 8;
- @%p16 bra BB12_28;
+ @%p16 bra BB10_28;
ld.volatile.shared.f64 %fd35, [%rd1+32];
add.f64 %fd42, %fd42, %fd35;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_28:
+BB10_28:
setp.lt.u32 %p17, %r12, 4;
- @%p17 bra BB12_30;
+ @%p17 bra BB10_30;
ld.volatile.shared.f64 %fd36, [%rd1+16];
add.f64 %fd42, %fd42, %fd36;
st.volatile.shared.f64 [%rd1], %fd42;
-BB12_30:
+BB10_30:
setp.lt.u32 %p18, %r12, 2;
- @%p18 bra BB12_32;
+ @%p18 bra BB10_32;
ld.volatile.shared.f64 %fd37, [%rd1+8];
add.f64 %fd38, %fd42, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
-BB12_32:
+BB10_32:
setp.ne.s32 %p19, %r11, 0;
- @%p19 bra BB12_34;
+ @%p19 bra BB10_34;
ld.shared.f64 %fd39, [sdata];
cvt.rn.f64.s64 %fd40, %rd4;
@@ -1939,7 +1638,7 @@ BB12_32:
add.s64 %rd10, %rd3, %rd9;
st.f64 [%rd10], %fd41;
-BB12_34:
+BB10_34:
ret;
}
@@ -1971,15 +1670,15 @@ BB12_34:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB13_5;
+ @%p1 bra BB11_5;
mul.lo.s32 %r2, %r6, %r5;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB13_4;
+ @%p2 bra BB11_4;
mov.u32 %r10, %r1;
-BB13_3:
+BB11_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -1988,16 +1687,16 @@ BB13_3:
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
- @%p3 bra BB13_3;
+ @%p3 bra BB11_3;
-BB13_4:
+BB11_4:
cvt.rn.f64.s64 %fd6, %rd3;
div.rn.f64 %fd7, %fd8, %fd6;
mul.wide.u32 %rd6, %r1, 8;
add.s64 %rd7, %rd2, %rd6;
st.f64 [%rd7], %fd7;
-BB13_5:
+BB11_5:
ret;
}
@@ -2029,10 +1728,10 @@ BB13_5:
setp.gt.s32 %p1, %r2, %r1;
setp.lt.s32 %p2, %r3, %r5;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB14_2;
- bra.uni BB14_1;
+ @!%p3 bra BB12_2;
+ bra.uni BB12_1;
-BB14_1:
+BB12_1:
cvta.to.global.u64 %rd2, %rd1;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd3, %r12, 8;
@@ -2042,7 +1741,7 @@ BB14_1:
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
-BB14_2:
+BB12_2:
ret;
}
@@ -2075,14 +1774,14 @@ BB14_2:
mad.lo.s32 %r1, %r8, %r9, %r11;
mul.lo.s32 %r12, %r3, %r2;
setp.ge.s32 %p1, %r1, %r12;
- @%p1 bra BB15_2;
+ @%p1 bra BB13_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;
-BB15_2:
+BB13_2:
ret;
}
@@ -2116,10 +1815,10 @@ BB15_2:
setp.lt.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r11, %r3;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB16_2;
- bra.uni BB16_1;
+ @!%p3 bra BB14_2;
+ bra.uni BB14_1;
-BB16_1:
+BB14_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2128,7 +1827,7 @@ BB16_1:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd1;
-BB16_2:
+BB14_2:
ret;
}
@@ -2161,10 +1860,10 @@ BB16_2:
setp.lt.s32 %p1, %r1, %r4;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB17_2;
- bra.uni BB17_1;
+ @!%p3 bra BB15_2;
+ bra.uni BB15_1;
-BB17_1:
+BB15_1:
cvta.to.global.u64 %rd3, %rd1;
mad.lo.s32 %r11, %r1, %r3, %r2;
mul.wide.s32 %rd4, %r11, 8;
@@ -2176,7 +1875,7 @@ BB17_1:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd3;
-BB17_2:
+BB15_2:
ret;
}
@@ -2211,10 +1910,10 @@ BB17_2:
setp.lt.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB18_4;
- bra.uni BB18_1;
+ @!%p3 bra BB16_4;
+ bra.uni BB16_1;
-BB18_1:
+BB16_1:
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r3, %r1, %r4, %r2;
mul.wide.s32 %rd5, %r3, 8;
@@ -2222,18 +1921,18 @@ BB18_1:
ld.global.f64 %fd4, [%rd6];
mov.f64 %fd5, 0d0000000000000000;
setp.leu.f64 %p4, %fd4, 0d0000000000000000;
- @%p4 bra BB18_3;
+ @%p4 bra BB16_3;
cvta.to.global.u64 %rd7, %rd2;
add.s64 %rd9, %rd7, %rd5;
ld.global.f64 %fd5, [%rd9];
-BB18_3:
+BB16_3:
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd12, %rd10, %rd5;
st.global.f64 [%rd12], %fd5;
-BB18_4:
+BB16_4:
ret;
}
@@ -2270,10 +1969,10 @@ BB18_4:
setp.lt.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB19_2;
- bra.uni BB19_1;
+ @!%p3 bra BB17_2;
+ bra.uni BB17_1;
-BB19_1:
+BB17_1:
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r12, %r1, %r3, %r2;
mul.wide.s32 %rd5, %r12, 8;
@@ -2289,7 +1988,7 @@ BB19_1:
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
-BB19_2:
+BB17_2:
ret;
}
@@ -2333,10 +2032,10 @@ BB19_2:
setp.lt.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r11, %r3;
and.pred %p3, %p1, %p2;
- @!%p3 bra BB20_6;
- bra.uni BB20_1;
+ @!%p3 bra BB18_6;
+ bra.uni BB18_1;
-BB20_1:
+BB18_1:
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
@@ -2346,39 +2045,39 @@ BB20_1:
setp.lt.f64 %p4, %fd8, %fd3;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
- @%p4 bra BB20_5;
- bra.uni BB20_2;
+ @%p4 bra BB18_5;
+ bra.uni BB18_2;
-BB20_5:
+BB18_5:
st.global.f64 [%rd1], %fd4;
- bra.uni BB20_6;
+ bra.uni BB18_6;
-BB20_2:
+BB18_2:
setp.lt.f64 %p5, %fd1, %fd2;
- @%p5 bra BB20_4;
- bra.uni BB20_3;
+ @%p5 bra BB18_4;
+ bra.uni BB18_3;
-BB20_4:
+BB18_4:
st.global.f64 [%rd1], %fd5;
- bra.uni BB20_6;
+ bra.uni BB18_6;
-BB20_3:
+BB18_3:
st.global.f64 [%rd1], %fd6;
-BB20_6:
+BB18_6:
ret;
}
- // .globl binCellOp
-.visible .entry binCellOp(
- .param .u64 binCellOp_param_0,
- .param .u64 binCellOp_param_1,
- .param .u64 binCellOp_param_2,
- .param .u32 binCellOp_param_3,
- .param .u32 binCellOp_param_4,
- .param .u32 binCellOp_param_5,
- .param .u32 binCellOp_param_6,
- .param .u32 binCellOp_param_7
+ // .globl matrix_matrix_cellwise_op
+.visible .entry matrix_matrix_cellwise_op(
+ .param .u64 matrix_matrix_cellwise_op_param_0,
+ .param .u64 matrix_matrix_cellwise_op_param_1,
+ .param .u64 matrix_matrix_cellwise_op_param_2,
+ .param .u32 matrix_matrix_cellwise_op_param_3,
+ .param .u32 matrix_matrix_cellwise_op_param_4,
+ .param .u32 matrix_matrix_cellwise_op_param_5,
+ .param .u32 matrix_matrix_cellwise_op_param_6,
+ .param .u32 matrix_matrix_cellwise_op_param_7
)
{
.reg .pred %p<52>;
@@ -2387,14 +2086,14 @@ BB20_6:
.reg .b64 %rd<15>;
- ld.param.u64 %rd2, [binCellOp_param_0];
- ld.param.u64 %rd3, [binCellOp_param_1];
- ld.param.u64 %rd4, [binCellOp_param_2];
- ld.param.u32 %r14, [binCellOp_param_3];
- ld.param.u32 %r10, [binCellOp_param_4];
- ld.param.u32 %r11, [binCellOp_param_5];
- ld.param.u32 %r12, [binCellOp_param_6];
- ld.param.u32 %r13, [binCellOp_param_7];
+ ld.param.u64 %rd2, [matrix_matrix_cellwise_op_param_0];
+ ld.param.u64 %rd3, [matrix_matrix_cellwise_op_param_1];
+ ld.param.u64 %rd4, [matrix_matrix_cellwise_op_param_2];
+ ld.param.u32 %r14, [matrix_matrix_cellwise_op_param_3];
+ ld.param.u32 %r10, [matrix_matrix_cellwise_op_param_4];
+ ld.param.u32 %r11, [matrix_matrix_cellwise_op_param_5];
+ ld.param.u32 %r12, [matrix_matrix_cellwise_op_param_6];
+ ld.param.u32 %r13, [matrix_matrix_cellwise_op_param_7];
mov.u32 %r15, %ntid.x;
mov.u32 %r16, %ctaid.x;
mov.u32 %r17, %tid.x;
@@ -2406,42 +2105,42 @@ BB20_6:
setp.lt.s32 %p2, %r1, %r14;
setp.lt.s32 %p3, %r2, %r10;
and.pred %p4, %p2, %p3;
- @!%p4 bra BB21_55;
- bra.uni BB21_1;
+ @!%p4 bra BB19_55;
+ bra.uni BB19_1;
-BB21_1:
+BB19_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
mov.u32 %r54, %r1;
- @%p5 bra BB21_5;
+ @%p5 bra BB19_5;
setp.ne.s32 %p6, %r11, 2;
mov.u32 %r55, %r3;
- @%p6 bra BB21_4;
+ @%p6 bra BB19_4;
mov.u32 %r55, %r2;
-BB21_4:
+BB19_4:
mov.u32 %r49, %r55;
mov.u32 %r4, %r49;
mov.u32 %r54, %r4;
-BB21_5:
+BB19_5:
mov.u32 %r5, %r54;
setp.eq.s32 %p7, %r12, 1;
mov.u32 %r52, %r1;
- @%p7 bra BB21_9;
+ @%p7 bra BB19_9;
setp.ne.s32 %p8, %r12, 2;
mov.u32 %r53, %r3;
- @%p8 bra BB21_8;
+ @%p8 bra BB19_8;
mov.u32 %r53, %r2;
-BB21_8:
+BB19_8:
mov.u32 %r52, %r53;
-BB21_9:
+BB19_9:
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r5, 8;
@@ -2450,49 +2149,49 @@ BB21_9:
mul.wide.s32 %rd9, %r52, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
- mov.f64 %fd39, 0dC08F380000000000;
+ mov.f64 %fd39, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r13, 5;
- @%p9 bra BB21_19;
+ @%p9 bra BB19_19;
setp.gt.s32 %p19, %r13, 2;
- @%p19 bra BB21_15;
+ @%p19 bra BB19_15;
setp.eq.s32 %p23, %r13, 0;
- @%p23 bra BB21_53;
+ @%p23 bra BB19_53;
setp.eq.s32 %p24, %r13, 1;
- @%p24 bra BB21_52;
- bra.uni BB21_13;
+ @%p24 bra BB19_52;
+ bra.uni BB19_13;
-BB21_52:
+BB19_52:
sub.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_19:
+BB19_19:
setp.gt.s32 %p10, %r13, 8;
- @%p10 bra BB21_24;
+ @%p10 bra BB19_24;
setp.eq.s32 %p16, %r13, 6;
- @%p16 bra BB21_34;
+ @%p16 bra BB19_34;
setp.eq.s32 %p17, %r13, 7;
- @%p17 bra BB21_33;
- bra.uni BB21_22;
+ @%p17 bra BB19_33;
+ bra.uni BB19_22;
-BB21_33:
+BB19_33:
setp.gt.f64 %p29, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_15:
+BB19_15:
setp.eq.s32 %p20, %r13, 3;
- @%p20 bra BB21_51;
+ @%p20 bra BB19_51;
setp.eq.s32 %p21, %r13, 4;
- @%p21 bra BB21_35;
- bra.uni BB21_17;
+ @%p21 bra BB19_35;
+ bra.uni BB19_17;
-BB21_35:
+BB19_35:
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
@@ -2507,7 +2206,7 @@ BB21_35:
shl.b64 %rd1, %rd11, %r22;
setp.eq.s64 %p32, %rd1, -9223372036854775808;
abs.f64 %fd11, %fd1;
- // Callseq Start 1
+ // Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
@@ -2525,13 +2224,13 @@ BB21_35:
ld.param.f64 %fd38, [retval0+0];
//{
- }// Callseq End 1
+ }// Callseq End 0
setp.lt.s32 %p33, %r8, 0;
and.pred %p1, %p33, %p32;
- @!%p1 bra BB21_37;
- bra.uni BB21_36;
+ @!%p1 bra BB19_37;
+ bra.uni BB19_36;
-BB21_36:
+BB19_36:
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd38;
@@ -2543,111 +2242,111 @@ BB21_36:
}
mov.b64 %fd38, {%r25, %r24};
-BB21_37:
+BB19_37:
mov.f64 %fd37, %fd38;
setp.eq.f64 %p34, %fd1, 0d0000000000000000;
- @%p34 bra BB21_40;
- bra.uni BB21_38;
+ @%p34 bra BB19_40;
+ bra.uni BB19_38;
-BB21_40:
+BB19_40:
selp.b32 %r26, %r8, 0, %p32;
or.b32 %r27, %r26, 2146435072;
setp.lt.s32 %p38, %r9, 0;
selp.b32 %r28, %r27, %r26, %p38;
mov.u32 %r29, 0;
mov.b64 %fd37, {%r29, %r28};
- bra.uni BB21_41;
+ bra.uni BB19_41;
-BB21_24:
+BB19_24:
setp.gt.s32 %p11, %r13, 10;
- @%p11 bra BB21_28;
+ @%p11 bra BB19_28;
setp.eq.s32 %p14, %r13, 9;
- @%p14 bra BB21_32;
- bra.uni BB21_26;
+ @%p14 bra BB19_32;
+ bra.uni BB19_26;
-BB21_32:
+BB19_32:
setp.eq.f64 %p27, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_28:
+BB19_28:
setp.eq.s32 %p12, %r13, 11;
- @%p12 bra BB21_31;
- bra.uni BB21_29;
+ @%p12 bra BB19_31;
+ bra.uni BB19_29;
-BB21_31:
+BB19_31:
min.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_53:
+BB19_53:
add.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_13:
+BB19_13:
setp.eq.s32 %p25, %r13, 2;
- @%p25 bra BB21_14;
- bra.uni BB21_54;
+ @%p25 bra BB19_14;
+ bra.uni BB19_54;
-BB21_14:
+BB19_14:
mul.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_34:
+BB19_34:
setp.le.f64 %p30, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_22:
+BB19_22:
setp.eq.s32 %p18, %r13, 8;
- @%p18 bra BB21_23;
- bra.uni BB21_54;
+ @%p18 bra BB19_23;
+ bra.uni BB19_54;
-BB21_23:
+BB19_23:
setp.ge.f64 %p28, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_51:
+BB19_51:
div.rn.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_17:
+BB19_17:
setp.eq.s32 %p22, %r13, 5;
- @%p22 bra BB21_18;
- bra.uni BB21_54;
+ @%p22 bra BB19_18;
+ bra.uni BB19_54;
-BB21_18:
+BB19_18:
setp.lt.f64 %p31, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_26:
+BB19_26:
setp.eq.s32 %p15, %r13, 10;
- @%p15 bra BB21_27;
- bra.uni BB21_54;
+ @%p15 bra BB19_27;
+ bra.uni BB19_54;
-BB21_27:
+BB19_27:
setp.neu.f64 %p26, %fd1, %fd2;
selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_29:
+BB19_29:
setp.ne.s32 %p13, %r13, 12;
- @%p13 bra BB21_54;
+ @%p13 bra BB19_54;
max.f64 %fd39, %fd1, %fd2;
- bra.uni BB21_54;
+ bra.uni BB19_54;
-BB21_38:
+BB19_38:
setp.gt.s32 %p35, %r8, -1;
- @%p35 bra BB21_41;
+ @%p35 bra BB19_41;
cvt.rzi.f64.f64 %fd29, %fd2;
setp.neu.f64 %p36, %fd29, %fd2;
selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36;
-BB21_41:
+BB19_41:
mov.f64 %fd17, %fd37;
add.f64 %fd18, %fd1, %fd2;
{
@@ -2657,35 +2356,35 @@ BB21_41:
and.b32 %r31, %r30, 2146435072;
setp.ne.s32 %p39, %r31, 2146435072;
mov.f64 %fd36, %fd17;
- @%p39 bra BB21_50;
+ @%p39 bra BB19_50;
setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000;
mov.f64 %fd36, %fd18;
- @%p40 bra BB21_50;
+ @%p40 bra BB19_50;
abs.f64 %fd30, %fd2;
setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000;
mov.f64 %fd35, %fd18;
mov.f64 %fd36, %fd35;
- @%p41 bra BB21_50;
+ @%p41 bra BB19_50;
and.b32 %r32, %r9, 2147483647;
setp.ne.s32 %p42, %r32, 2146435072;
- @%p42 bra BB21_46;
+ @%p42 bra BB19_46;
{
.reg .b32 %temp;
mov.b64 {%r33, %temp}, %fd2;
}
setp.eq.s32 %p43, %r33, 0;
- @%p43 bra BB21_49;
+ @%p43 bra BB19_49;
-BB21_46:
+BB19_46:
and.b32 %r34, %r8, 2147483647;
setp.ne.s32 %p44, %r34, 2146435072;
mov.f64 %fd33, %fd17;
mov.f64 %fd36, %fd33;
- @%p44 bra BB21_50;
+ @%p44 bra BB19_50;
{
.reg .b32 %temp;
@@ -2693,7 +2392,7 @@ BB21_46:
}
setp.ne.s32 %p45, %r35, 0;
mov.f64 %fd36, %fd17;
- @%p45 bra BB21_50;
+ @%p45 bra BB19_50;
shr.s32 %r36, %r9, 31;
and.b32 %r37, %r36, -2146435072;
@@ -2702,9 +2401,9 @@ BB21_46:
selp.b32 %r40, %r39, %r38, %p1;
mov.u32 %r41, 0;
mov.b64 %fd36, {%r41, %r40};
- bra.uni BB21_50;
+ bra.uni BB19_50;
-BB21_49:
+BB19_49:
setp.gt.f64 %p46, %fd11, 0d3FF0000000000000;
selp.b32 %r42, 2146435072, 0, %p46;
xor.b32 %r43, %r42, 2146435072;
@@ -2715,58 +2414,51 @@ BB21_49:
mov.u32 %r46, 0;
mov.b64 %fd36, {%r46, %r45};
-BB21_50:
+BB19_50:
setp.eq.f64 %p49, %fd2, 0d0000000000000000;
setp.eq.f64 %p50, %fd1, 0d3FF0000000000000;
or.pred %p51, %p50, %p49;
selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51;
-BB21_54:
+BB19_54:
cvta.to.global.u64 %rd12, %rd4;
mul.wide.s32 %rd13, %r3, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd39;
+ bar.sync 0;
-BB21_55:
+BB19_55:
ret;
}
- // .globl binCellScalarOp
-.visible .entry binCellScalarOp(
- .param .u64 binCellScalarOp_param_0,
- .param .f64 binCellScalarOp_param_1,
- .param .u64 binCellScalarOp_param_2,
- .param .u32 binCellScalarOp_param_3,
- .param .u32 binCellScalarOp_param_4,
- .param .u32 binCellScalarOp_param_5,
- .param .u32 binCellScalarOp_param_6
+ // .globl matrix_scalar_op
+.visible .entry matrix_scalar_op(
+ .param .u64 matrix_scalar_op_param_0,
+ .param .f64 matrix_scalar_op_param_1,
+ .param .u64 matrix_scalar_op_param_2,
+ .param .u32 matrix_scalar_op_param_3,
+ .param .u32 matrix_scalar_op_param_4,
+ .param .u32 matrix_scalar_op_param_5
)
{
- .reg .pred %p<89>;
- .reg .b32 %r<71>;
+ .reg .pred %p<91>;
+ .reg .b32 %r<64>;
.reg .f64 %fd<77>;
.reg .b64 %rd<12>;
- ld.param.u64 %rd4, [binCellScalarOp_param_0];
- ld.param.f64 %fd52, [binCellScalarOp_param_1];
- ld.param.u64 %rd5, [binCellScalarOp_param_2];
- ld.param.u32 %r8, [binCellScalarOp_param_3];
- ld.param.u32 %r9, [binCellScalarOp_param_4];
- ld.param.u32 %r6, [binCellScalarOp_param_5];
- ld.param.u32 %r7, [binCellScalarOp_param_6];
- mov.u32 %r10, %ctaid.x;
- mov.u32 %r11, %ntid.x;
- mov.u32 %r12, %tid.x;
- mad.lo.s32 %r13, %r11, %r10, %r12;
- mov.u32 %r14, %ntid.y;
- mov.u32 %r15, %ctaid.y;
- mov.u32 %r16, %tid.y;
- mad.lo.s32 %r17, %r13, %r9, %r16;
- mad.lo.s32 %r1, %r14, %r15, %r17;
- mul.lo.s32 %r18, %r9, %r8;
- setp.ge.s32 %p3, %r1, %r18;
- @%p3 bra BB22_92;
+ ld.param.u64 %rd4, [matrix_scalar_op_param_0];
+ ld.param.f64 %fd52, [matrix_scalar_op_param_1];
+ ld.param.u64 %rd5, [matrix_scalar_op_param_2];
+ ld.param.u32 %r8, [matrix_scalar_op_param_3];
+ ld.param.u32 %r6, [matrix_scalar_op_param_4];
+ ld.param.u32 %r7, [matrix_scalar_op_param_5];
+ mov.u32 %r9, %ctaid.x;
+ mov.u32 %r10, %ntid.x;
+ mov.u32 %r11, %tid.x;
+ mad.lo.s32 %r1, %r10, %r9, %r11;
+ setp.ge.s32 %p3, %r1, %r8;
+ @%p3 bra BB20_94;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
@@ -2775,178 +2467,86 @@ BB21_55:
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
- @%p4 bra BB22_47;
+ @%p4 bra BB20_48;
- setp.eq.s32 %p5, %r6, 0;
- @%p5 bra BB22_45;
+ mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF;
+ setp.gt.s32 %p5, %r6, 5;
+ @%p5 bra BB20_12;
- mov.f64 %fd67, 0dC08F380000000000;
- setp.gt.s32 %p6, %r6, 6;
- @%p6 bra BB22_13;
+ setp.gt.s32 %p15, %r6, 2;
+ @%p15 bra BB20_8;
- setp.gt.s32 %p14, %r6, 3;
- @%p14 bra BB22_9;
+ setp.eq.s32 %p19, %r6, 0;
+ @%p19 bra BB20_46;
- setp.eq.s32 %p18, %r6, 1;
- @%p18 bra BB22_44;
+ setp.eq.s32 %p20, %r6, 1;
+ @%p20 bra BB20_45;
+ bra.uni BB20_6;
- setp.eq.s32 %p19, %r6, 2;
- @%p19 bra BB22_43;
- bra.uni BB22_7;
+BB20_45:
+ sub.f64 %fd67, %fd52, %fd1;
+ bra.uni BB20_47;
-BB22_43:
- mul.f64 %fd67, %fd1, %fd52;
- bra.uni BB22_46;
-
-BB22_47:
- setp.eq.s32 %p47, %r6, 0;
- @%p47 bra BB22_90;
-
- mov.f64 %fd76, 0dC08F380000000000;
- setp.gt.s32 %p48, %r6, 6;
- @%p48 bra BB22_58;
-
- setp.gt.s32 %p56, %r6, 3;
- @%p56 bra BB22_54;
-
- setp.eq.s32 %p60, %r6, 1;
- @%p60 bra BB22_89;
-
- setp.eq.s32 %p61, %r6, 2;
- @%p61 bra BB22_88;
- bra.uni BB22_52;
-
-BB22_88:
- mul.f64 %fd76, %fd1, %fd52;
- bra.uni BB22_91;
-
-BB22_45:
- add.f64 %fd67, %fd1, %fd52;
-
-BB22_46:
- st.global.f64 [%rd1], %fd67;
- bra.uni BB22_92;
-
-BB22_13:
- setp.gt.s32 %p7, %r6, 9;
- @%p7 bra BB22_18;
-
- setp.eq.s32 %p11, %r6, 7;
- @%p11 bra BB22_25;
-
- setp.eq.s32 %p12, %r6, 8;
- @%p12 bra BB22_24;
- bra.uni BB22_16;
-
-BB22_24:
- setp.le.f64 %p23, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
- bra.uni BB22_46;
-
-BB22_90:
- add.f64 %fd76, %fd1, %fd52;
-
-BB22_91:
- st.global.f64 [%rd1], %fd76;
-
-BB22_92:
- ret;
-
-BB22_58:
- setp.gt.s32 %p49, %r6, 9;
- @%p49 bra BB22_63;
+BB20_48:
+ mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
+ setp.gt.s32 %p48, %r6, 5;
+ @%p48 bra BB20_58;
- setp.eq.s32 %p53, %r6, 7;
- @%p53 bra BB22_70;
+ setp.gt.s32 %p58, %r6, 2;
+ @%p58 bra BB20_54;
- setp.eq.s32 %p54, %r6, 8;
- @%p54 bra BB22_69;
- bra.uni BB22_61;
+ setp.eq.s32 %p62, %r6, 0;
+ @%p62 bra BB20_92;
-BB22_69:
- setp.ge.f64 %p65, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
- bra.uni BB22_91;
+ setp.eq.s32 %p63, %r6, 1;
+ @%p63 bra BB20_91;
+ bra.uni BB20_52;
-BB22_9:
- setp.eq.s32 %p15, %r6, 4;
- @%p15 bra BB22_27;
+BB20_91:
+ sub.f64 %fd76, %fd1, %fd52;
+ bra.uni BB20_93;
- setp.eq.s32 %p16, %r6, 5;
- @%p16 bra BB22_26;
- bra.uni BB22_11;
+BB20_12:
+ setp.gt.s32 %p6, %r6, 8;
+ @%p6 bra BB20_17;
-BB22_26:
- setp.gt.f64 %p26, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB22_46;
+ setp.eq.s32 %p12, %r6, 6;
+ @%p12 bra BB20_27;
-BB22_18:
- setp.eq.s32 %p8, %r6, 10;
- @%p8 bra BB22_23;
+ setp.eq.s32 %p13, %r6, 7;
+ @%p13 bra BB20_26;
+ bra.uni BB20_15;
- setp.eq.s32 %p9, %r6, 11;
- @%p9 bra BB22_22;
- bra.uni BB22_20;
+BB20_26:
+ setp.lt.f64 %p25, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+ bra.uni BB20_47;
-BB22_22:
- min.f64 %fd67, %fd52, %fd1;
- bra.uni BB22_46;
+BB20_58:
+ setp.gt.s32 %p49, %r6, 8;
+ @%p49 bra BB20_63;
-BB22_54:
- setp.eq.s32 %p57, %r6, 4;
- @%p57 bra BB22_72;
+ setp.eq.s32 %p55, %r6, 6;
+ @%p55 bra BB20_73;
- setp.eq.s32 %p58, %r6, 5;
- @%p58 bra BB22_71;
- bra.uni BB22_56;
+ setp.eq.s32 %p56, %r6, 7;
+ @%p56 bra BB20_72;
+ bra.uni BB20_61;
-BB22_71:
- setp.lt.f64 %p68, %fd1, %fd52;
+BB20_72:
+ setp.gt.f64 %p68, %fd1, %fd52;
selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
- bra.uni BB22_91;
+ bra.uni BB20_93;
-BB22_63:
- setp.eq.s32 %p50, %r6, 10;
- @%p50 bra BB22_68;
+BB20_8:
+ setp.eq.s32 %p16, %r6, 3;
+ @%p16 bra BB20_44;
- setp.eq.s32 %p51, %r6, 11;
- @%p51 bra BB22_67;
- bra.uni BB22_65;
-
-BB22_67:
- min.f64 %fd76, %fd1, %fd52;
- bra.uni BB22_91;
-
-BB22_44:
- sub.f64 %fd67, %fd52, %fd1;
- bra.uni BB22_46;
-
-BB22_7:
- setp.eq.s32 %p20, %r6, 3;
- @%p20 bra BB22_8;
- bra.uni BB22_46;
+ setp.eq.s32 %p17, %r6, 4;
+ @%p17 bra BB20_28;
+ bra.uni BB20_10;
-BB22_8:
- div.rn.f64 %fd67, %fd52, %fd1;
- bra.uni BB22_46;
-
-BB22_25:
- setp.lt.f64 %p24, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
- bra.uni BB22_46;
-
-BB22_16:
- setp.eq.s32 %p13, %r6, 9;
- @%p13 bra BB22_17;
- bra.uni BB22_46;
-
-BB22_17:
- setp.eq.f64 %p22, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
- bra.uni BB22_46;
-
-BB22_27:
+BB20_28:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd52;
@@ -2955,13 +2555,13 @@ BB22_27:
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd1;
}
- bfe.u32 %r19, %r3, 20, 11;
- add.s32 %r20, %r19, -1012;
+ bfe.u32 %r12, %r3, 20, 11;
+ add.s32 %r13, %r12, -1012;
mov.b64 %rd10, %fd1;
- shl.b64 %rd2, %rd10, %r20;
- setp.eq.s64 %p27, %rd2, -9223372036854775808;
+ shl.b64 %rd2, %rd10, %r13;
+ setp.eq.s64 %p28, %rd2, -9223372036854775808;
abs.f64 %fd10, %fd52;
- // Callseq Start 2
+ // Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
@@ -2979,90 +2579,61 @@ BB22_27:
ld.param.f64 %fd66, [retval0+0];
//{
- }// Callseq End 2
- setp.lt.s32 %p28, %r2, 0;
- and.pred %p1, %p28, %p27;
- @!%p1 bra BB22_29;
- bra.uni BB22_28;
+ }// Callseq End 1
+ setp.lt.s32 %p29, %r2, 0;
+ and.pred %p1, %p29, %p28;
+ @!%p1 bra BB20_30;
+ bra.uni BB20_29;
-BB22_28:
+BB20_29:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r21}, %fd66;
+ mov.b64 {%temp, %r14}, %fd66;
}
- xor.b32 %r22, %r21, -2147483648;
+ xor.b32 %r15, %r14, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r23, %temp}, %fd66;
+ mov.b64 {%r16, %temp}, %fd66;
}
- mov.b64 %fd66, {%r23, %r22};
+ mov.b64 %fd66, {%r16, %r15};
-BB22_29:
+BB20_30:
mov.f64 %fd65, %fd66;
- setp.eq.f64 %p29, %fd52, 0d0000000000000000;
- @%p29 bra BB22_32;
- bra.uni BB22_30;
-
-BB22_32:
- selp.b32 %r24, %r2, 0, %p27;
- or.b32 %r25, %r24, 2146435072;
- setp.lt.s32 %p33, %r3, 0;
- selp.b32 %r26, %r25, %r24, %p33;
- mov.u32 %r27, 0;
- mov.b64 %fd65, {%r27, %r26};
- bra.uni BB22_33;
-
-BB22_11:
- setp.eq.s32 %p17, %r6, 6;
- @%p17 bra BB22_12;
- bra.uni BB22_46;
-
-BB22_12:
- setp.ge.f64 %p25, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
- bra.uni BB22_46;
-
-BB22_23:
- setp.neu.f64 %p21, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
- bra.uni BB22_46;
-
-BB22_20:
- setp.ne.s32 %p10, %r6, 12;
- @%p10 bra BB22_46;
-
- max.f64 %fd67, %fd52, %fd1;
- bra.uni BB22_46;
-
-BB22_89:
- sub.f64 %fd76, %fd1, %fd52;
- bra.uni BB22_91;
-
-BB22_52:
- setp.eq.s32 %p62, %r6, 3;
- @%p62 bra BB22_53;
- bra.uni BB22_91;
-
-BB22_53:
- div.rn.f64 %fd76, %fd1, %fd52;
- bra.uni BB22_91;
-
-BB22_70:
- setp.gt.f64 %p66, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
- bra.uni BB22_91;
+ setp.eq.f64 %p30, %fd52, 0d0000000000000000;
+ @%p30 bra BB20_33;
+ bra.uni BB20_31;
+
+BB20_33:
+ selp.b32 %r17, %r2, 0, %p28;
+ or.b32 %r18, %r17, 2146435072;
+ setp.lt.s32 %p34, %r3, 0;
+ selp.b32 %r19, %r18, %r17, %p34;
+ mov.u32 %r20, 0;
+ mov.b64 %fd65, {%r20, %r19};
+ bra.uni BB20_34;
+
+BB20_17:
+ setp.gt.s32 %p7, %r6, 10;
+ @%p7 bra BB20_21;
+
+ setp.eq.s32 %p10, %r6, 9;
+ @%p10 bra BB20_25;
+ bra.uni BB20_19;
+
+BB20_25:
+ setp.eq.f64 %p23, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+ bra.uni BB20_47;
-BB22_61:
- setp.eq.s32 %p55, %r6, 9;
- @%p55 bra BB22_62;
- bra.uni BB22_91;
+BB20_54:
+ setp.eq.s32 %p59, %r6, 3;
+ @%p59 bra BB20_90;
-BB22_62:
- setp.eq.f64 %p64, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
- bra.uni BB22_91;
+ setp.eq.s32 %p60, %r6, 4;
+ @%p60 bra BB20_74;
+ bra.uni BB20_56;
-BB22_72:
+BB20_74:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
@@ -3071,13 +2642,13 @@ BB22_72:
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd52;
}
- bfe.u32 %r45, %r5, 20, 11;
- add.s32 %r46, %r45, -1012;
+ bfe.u32 %r38, %r5, 20, 11;
+ add.s32 %r39, %r38, -1012;
mov.b64 %rd11, %fd52;
- shl.b64 %rd3, %rd11, %r46;
- setp.eq.s64 %p69, %rd3, -9223372036854775808;
+ shl.b64 %rd3, %rd11, %r39;
+ setp.eq.s64 %p71, %rd3, -9223372036854775808;
abs.f64 %fd35, %fd1;
- // Callseq Start 3
+ // Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
@@ -3095,226 +2666,362 @@ BB22_72:
ld.param.f64 %fd75, [retval0+0];
//{
- }// Callseq End 3
- setp.lt.s32 %p70, %r4, 0;
- and.pred %p2, %p70, %p69;
- @!%p2 bra BB22_74;
- bra.uni BB22_73;
+ }// Callseq End 2
+ setp.lt.s32 %p72, %r4, 0;
+ and.pred %p2, %p72, %p71;
+ @!%p2 bra BB20_76;
+ bra.uni BB20_75;
-BB22_73:
+BB20_75:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r47}, %fd75;
+ mov.b64 {%temp, %r40}, %fd75;
}
- xor.b32 %r48, %r47, -2147483648;
+ xor.b32 %r41, %r40, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r49, %temp}, %fd75;
+ mov.b64 {%r42, %temp}, %fd75;
}
- mov.b64 %fd75, {%r49, %r48};
+ mov.b64 %fd75, {%r42, %r41};
-BB22_74:
+BB20_76:
mov.f64 %fd74, %fd75;
- setp.eq.f64 %p71, %fd1, 0d0000000000000000;
- @%p71 bra BB22_77;
- bra.uni BB22_75;
-
-BB22_77:
- selp.b32 %r50, %r4, 0, %p69;
- or.b32 %r51, %r50, 2146435072;
- setp.lt.s32 %p75, %r5, 0;
- selp.b32 %r52, %r51, %r50, %p75;
- mov.u32 %r53, 0;
- mov.b64 %fd74, {%r53, %r52};
- bra.uni BB22_78;
-
-BB22_56:
- setp.eq.s32 %p59, %r6, 6;
- @%p59 bra BB22_57;
- bra.uni BB22_91;
-
-BB22_57:
- setp.le.f64 %p67, %fd1, %fd52;
+ setp.eq.f64 %p73, %fd1, 0d0000000000000000;
+ @%p73 bra BB20_79;
+ bra.uni BB20_77;
+
+BB20_79:
+ selp.b32 %r43, %r4, 0, %p71;
+ or.b32 %r44, %r43, 2146435072;
+ setp.lt.s32 %p77, %r5, 0;
+ selp.b32 %r45, %r44, %r43, %p77;
+ mov.u32 %r46, 0;
+ mov.b64 %fd74, {%r46, %r45};
+ bra.uni BB20_80;
+
+BB20_63:
+ setp.gt.s32 %p50, %r6, 10;
+ @%p50 bra BB20_67;
+
+ setp.eq.s32 %p53, %r6, 9;
+ @%p53 bra BB20_71;
+ bra.uni BB20_65;
+
+BB20_71:
+ setp.eq.f64 %p66, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+ bra.uni BB20_93;
+
+BB20_21:
+ setp.eq.s32 %p8, %r6, 11;
+ @%p8 bra BB20_24;
+ bra.uni BB20_22;
+
+BB20_24:
+ min.f64 %fd67, %fd52, %fd1;
+ bra.uni BB20_47;
+
+BB20_46:
+ add.f64 %fd67, %fd1, %fd52;
+ bra.uni BB20_47;
+
+BB20_6:
+ setp.eq.s32 %p21, %r6, 2;
+ @%p21 bra BB20_7;
+ bra.uni BB20_47;
+
+BB20_7:
+ mul.f64 %fd67, %fd1, %fd52;
+ bra.uni BB20_47;
+
+BB20_27:
+ setp.ge.f64 %p26, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+ bra.uni BB20_47;
+
+BB20_15:
+ setp.eq.s32 %p14, %r6, 8;
+ @%p14 bra BB20_16;
+ bra.uni BB20_47;
+
+BB20_16:
+ setp.le.f64 %p24, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+ bra.uni BB20_47;
+
+BB20_44:
+ div.rn.f64 %fd67, %fd52, %fd1;
+ bra.uni BB20_47;
+
+BB20_10:
+ setp.eq.s32 %p18, %r6, 5;
+ @%p18 bra BB20_11;
+ bra.uni BB20_47;
+
+BB20_11:
+ setp.gt.f64 %p27, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+ bra.uni BB20_47;
+
+BB20_67:
+ setp.eq.s32 %p51, %r6, 11;
+ @%p51 bra BB20_70;
+ bra.uni BB20_68;
+
+BB20_70:
+ min.f64 %fd76, %fd1, %fd52;
+ bra.uni BB20_93;
+
+BB20_19:
+ setp.eq.s32 %p11, %r6, 10;
+ @%p11 bra BB20_20;
+ bra.uni BB20_47;
+
+BB20_20:
+ setp.neu.f64 %p22, %fd1, %fd52;
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+ bra.uni BB20_47;
+
+BB20_22:
+ setp.ne.s32 %p9, %r6, 12;
+ @%p9 bra BB20_47;
+
+ max.f64 %fd67, %fd52, %fd1;
+ bra.uni BB20_47;
+
+BB20_92:
+ add.f64 %fd76, %fd1, %fd52;
+ bra.uni BB20_93;
+
+BB20_52:
+ setp.eq.s32 %p64, %r6, 2;
+ @%p64 bra BB20_53;
+ bra.uni BB20_93;
+
+BB20_53:
+ mul.f64 %fd76, %fd1, %fd52;
+ bra.uni BB20_93;
+
+BB20_73:
+ setp.le.f64 %p69, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+ bra.uni BB20_93;
+
+BB20_61:
+ setp.eq.s32 %p57, %r6, 8;
+ @%p57 bra BB20_62;
+ bra.uni BB20_93;
+
+BB20_62:
+ setp.ge.f64 %p67, %fd1, %fd52;
selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
- bra.uni BB22_91;
+ bra.uni BB20_93;
+
+BB20_90:
+ div.rn.f64 %fd76, %fd1, %fd52;
+ bra.uni BB20_93;
+
+BB20_56:
+ setp.eq.s32 %p61, %r6, 5;
+ @%p61 bra BB20_57;
+ bra.uni BB20_93;
-BB22_68:
- setp.neu.f64 %p63, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
- bra.uni BB22_91;
+BB20_57:
+ setp.lt.f64 %p70, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+ bra.uni BB20_93;
-BB22_65:
+BB20_65:
+ setp.eq.s32 %p54, %r6, 10;
+ @%p54 bra BB20_66;
+ bra.uni BB20_93;
+
+BB20_66:
+ setp.neu.f64 %p65, %fd1, %fd52;
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+ bra.uni BB20_93;
+
+BB20_68:
setp.ne.s32 %p52, %r6, 12;
- @%p52 bra BB22_91;
+ @%p52 bra BB20_93;
max.f64 %fd76, %fd1, %fd52;
- bra.uni BB22_91;
+ bra.uni BB20_93;
-BB22_30:
- setp.gt.s32 %p30, %r2, -1;
- @%p30 bra BB22_33;
+BB20_31:
+ setp.gt.s32 %p31, %r2, -1;
+ @%p31 bra BB20_34;
cvt.rzi.f64.f64 %fd54, %fd1;
- setp.neu.f64 %p31, %fd54, %fd1;
- selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31;
+ setp.neu.f64 %p32, %fd54, %fd1;
+ selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p32;
-BB22_33:
+BB20_34:
mov.f64 %fd16, %fd65;
add.f64 %fd17, %fd1, %fd52;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r28}, %fd17;
+ mov.b64 {%temp, %r21}, %fd17;
}
- and.b32 %r29, %r28, 2146435072;
- setp.ne.s32 %p34, %r29, 2146435072;
+ and.b32 %r22, %r21, 2146435072;
+ setp.ne.s32 %p35, %r22, 2146435072;
mov.f64 %fd64, %fd16;
- @%p34 bra BB22_42;
+ @%p35 bra BB20_43;
- setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000;
+ setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000;
mov.f64 %fd64, %fd17;
- @%p35 bra BB22_42;
+ @%p36 bra BB20_43;
abs.f64 %fd55, %fd1;
- setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000;
+ setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000;
mov.f64 %fd63, %fd17;
mov.f64 %fd64, %fd63;
- @%p36 bra BB22_42;
+ @%p37 bra BB20_43;
- and.b32 %r30, %r3, 2147483647;
- setp.ne.s32 %p37, %r30, 2146435072;
- @%p37 bra BB22_38;
+ and.b32 %r23, %r3, 2147483647;
+ setp.ne.s32 %p38, %r23, 2146435072;
+ @%p38 bra BB20_39;
{
.reg .b32 %temp;
- mov.b64 {%r31, %temp}, %fd1;
+ mov.b64 {%r24, %temp}, %fd1;
}
- setp.eq.s32 %p38, %r31, 0;
- @%p38 bra BB22_41;
+ setp.eq.s32 %p39, %r24, 0;
+ @%p39 bra BB20_42;
-BB22_38:
- and.b32 %r32, %r2, 2147483647;
- setp.ne.s32 %p39, %r32, 2146435072;
+BB20_39:
+ and.b32 %r25, %r2, 2147483647;
+ setp.ne.s32 %p40, %r25, 2146435072;
mov.f64 %fd61, %fd16;
mov.f64 %fd64, %fd61;
- @%p39 bra BB22_42;
+ @%p40 bra BB20_43;
{
.reg .b32 %temp;
- mov.b64 {%r33, %temp}, %fd52;
+ mov.b64 {%r26, %temp}, %fd52;
}
- setp.ne.s32 %p40, %r33, 0;
+ setp.ne.s32 %p41, %r26, 0;
mov.f64 %fd64, %fd16;
- @%p40 bra BB22_42;
+ @%p41 bra BB20_43;
- shr.s32 %r34, %r3, 31;
- and.b32 %r35, %r34, -2146435072;
- add.s32 %r36, %r35, 2146435072;
- or.b32 %r37, %r36, -2147483648;
- selp.b32 %r38, %r37, %r36, %p1;
- mov.u32 %r39, 0;
- mov.b64 %fd64, {%r39, %r38};
- bra.uni BB22_42;
+ shr.s32 %r27, %r3, 31;
+ and.b32 %r28, %r27, -2146435072;
+ add.s32 %r29, %r28, 2146435072;
+ or.b32 %r30, %r29, -2147483648;
+ selp.b32 %r31, %r30, %r29, %p1;
+ mov.u32 %r32, 0;
+ mov.b64 %fd64, {%r32, %r31};
+ bra.uni BB20_43;
-BB22_75:
- setp.gt.s32 %p72, %r4, -1;
- @%p72 bra BB22_78;
+BB20_77:
+ setp.gt.s32 %p74, %r4, -1;
+ @%p74 bra BB20_80;
cvt.rzi.f64.f64 %fd57, %fd52;
- setp.neu.f64 %p73, %fd57, %fd52;
- selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73;
+ setp.neu.f64 %p75, %fd57, %fd52;
+ selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p75;
-BB22_78:
+BB20_80:
mov.f64 %fd41, %fd74;
add.f64 %fd42, %fd1, %fd52;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r54}, %fd42;
+ mov.b64 {%temp, %r47}, %fd42;
}
- and.b32 %r55, %r54, 2146435072;
- setp.ne.s32 %p76, %r55, 2146435072;
+ and.b32 %r48, %r47, 2146435072;
+ setp.ne.s32 %p78, %r48, 2146435072;
mov.f64 %fd73, %fd41;
- @%p76 bra BB22_87;
+ @%p78 bra BB20_89;
- setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000;
+ setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000;
mov.f64 %fd73, %fd42;
- @%p77 bra BB22_87;
+ @%p79 bra BB20_89;
abs.f64 %fd58, %fd52;
- setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000;
+ setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000;
mov.f64 %fd72, %fd42;
mov.f64 %fd73, %fd72;
- @%p78 bra BB22_87;
+ @%p80 bra BB20_89;
- and.b32 %r56, %r5, 2147483647;
- setp.ne.s32 %p79, %r56, 2146435072;
- @%p79 bra BB22_83;
+ and.b32 %r49, %r5, 2147483647;
+ setp.ne.s32 %p81, %r49, 2146435072;
+ @%p81 bra BB20_85;
{
.reg .b32 %temp;
- mov.b64 {%r57, %temp}, %fd52;
+ mov.b64 {%r50, %temp}, %fd52;
}
- setp.eq.s32 %p80, %r57, 0;
- @%p80 bra BB22_86;
+ setp.eq.s32 %p82, %r50, 0;
+ @%p82 bra BB20_88;
-BB22_83:
- and.b32 %r58, %r4, 2147483647;
- setp.ne.s32 %p81, %r58, 2146435072;
+BB20_85:
+ and.b32 %r51, %r4, 2147483647;
+ setp.ne.s32 %p83, %r51, 2146435072;
mov.f64 %fd70, %fd41;
mov.f64 %fd73, %fd70;
- @%p81 bra BB22_87;
+ @%p83 bra BB20_89;
{
.reg .b32 %temp;
- mov.b64 {%r59, %temp}, %fd1;
+ mov.b64 {%r52, %temp}, %fd1;
}
- setp.ne.s32 %p82, %r59, 0;
+ setp.ne.s32 %p84, %r52, 0;
mov.f64 %fd73, %fd41;
- @%p82 bra BB22_87;
-
- shr.s32 %r60, %r5, 31;
- and.b32 %r61, %r60, -2146435072;
- add.s32 %r62, %r61, 2146435072;
- or.b32 %r63, %r62, -2147483648;
- selp.b32 %r64, %r63, %r62, %p2;
- mov.u32 %r65, 0;
- mov.b64 %fd73, {%r65, %r64};
- bra.uni BB22_87;
-
-BB22_41:
- setp.gt.f64 %p41, %fd10, 0d3FF0000000000000;
- selp.b32 %r40, 2146435072, 0, %p41;
- xor.b32 %r41, %r40, 2146435072;
- setp.lt.s32 %p42, %r3, 0;
- selp.b32 %r42, %r41, %r40, %p42;
- setp.eq.f64 %p43, %fd52, 0dBFF0000000000000;
- selp.b32 %r43, 1072693248, %r42, %p43;
- mov.u32 %r44, 0;
- mov.b64 %fd64, {%r44, %r43};
-
-BB22_42:
- setp.eq.f64 %p44, %fd1, 0d0000000000000000;
- setp.eq.f64 %p45, %fd52, 0d3FF0000000000000;
- or.pred %p46, %p45, %p44;
- selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46;
- bra.uni BB22_46;
-
-BB22_86:
- setp.gt.f64 %p83, %fd35, 0d3FF0000000000000;
- selp.b32 %r66, 2146435072, 0, %p83;
- xor.b32 %r67, %r66, 2146435072;
- setp.lt.s32 %p84, %r5, 0;
- selp.b32 %r68, %r67, %r66, %p84;
- setp.eq.f64 %p85, %fd1, 0dBFF0000000000000;
- selp.b32 %r69, 1072693248, %r68, %p85;
- mov.u32 %r70, 0;
- mov.b64 %fd73, {%r70, %r69};
-
-BB22_87:
- setp.eq.f64 %p86, %fd52, 0d0000000000000000;
- setp.eq.f64 %p87, %fd1, 0d3FF0000000000000;
- or.pred %p88, %p87, %p86;
- selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88;
- bra.uni BB22_91;
+ @%p84 bra BB20_89;
+
+ shr.s32 %r53, %r5, 31;
+ and.b32 %r54, %r53, -2146435072;
+ add.s32 %r55, %r54, 2146435072;
+ or.b32 %r56, %r55, -2147483648;
+ selp.b32 %r57, %r56, %r55, %p2;
+ mov.u32 %r58, 0;
+ mov.b64 %fd73, {%r58, %r57};
+ bra.uni BB20_89;
+
+BB20_42:
+ setp.gt.f64 %p42, %fd10, 0d3FF0000000000000;
+ selp.b32 %r33, 2146435072, 0, %p42;
+ xor.b32 %r34, %r33, 2146435072;
+ setp.lt.s32 %p43, %r3, 0;
+ selp.b32 %r35, %r34, %r33, %p43;
+ setp.eq.f64 %p44, %fd52, 0dBFF0000000000000;
+ selp.b32 %r36, 1072693248, %r35, %p44;
+ mov.u32 %r37, 0;
+ mov.b64 %fd64, {%r37, %r36};
+
+BB20_43:
+ setp.eq.f64 %p45, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p46, %fd52, 0d3FF0000000000000;
+ or.pred %p47, %p46, %p45;
+ selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB20_47:
+ st.global.f64 [%rd1], %fd67;
+ bra.uni BB20_94;
+
+BB20_88:
+ setp.gt.f64 %p85, %fd35, 0d3FF0000000000000;
+ selp.b32 %r59, 2146435072, 0, %p85;
+ xor.b32 %r60, %r59, 2146435072;
+ setp.lt.s32 %p86, %r5, 0;
+ selp.b32 %r61, %r60, %r59, %p86;
+ setp.eq.f64 %p87, %fd1, 0dBFF0000000000000;
+ selp.b32 %r62, 1072693248, %r61, %p87;
+ mov.u32 %r63, 0;
+ mov.b64 %fd73, {%r63, %r62};
+
+BB20_89:
+ setp.eq.f64 %p88, %fd52, 0d0000000000000000;
+ setp.eq.f64 %p89, %fd1, 0d3FF0000000000000;
+ or.pred %p90, %p89, %p88;
+ selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p90;
+
+BB20_93:
+ st.global.f64 [%rd1], %fd76;
+
+BB20_94:
+ bar.sync 0;
+ ret;
}
// .globl fill
@@ -3338,14 +3045,14 @@ BB22_87:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB23_2;
+ @%p1 bra BB21_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;
-BB23_2:
+BB21_2:
ret;
}
@@ -3373,9 +3080,9 @@ BB23_2:
mov.f64 %fd76, 0d0000000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB24_4;
+ @%p1 bra BB22_4;
-BB24_1:
+BB22_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -3384,23 +3091,23 @@ BB24_1:
add.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB24_3;
+ @%p2 bra BB22_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
add.f64 %fd78, %fd78, %fd31;
-BB24_3:
+BB22_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB24_1;
+ @%p3 bra BB22_1;
-BB24_4:
+BB22_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -3408,130 +3115,130 @@ BB24_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB24_8;
+ @%p4 bra BB22_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB24_7;
+ @%p5 bra BB22_7;
ld.shared.f64 %fd32, [%rd1+4096];
add.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB24_7:
+BB22_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB24_8:
+BB22_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB24_12;
+ @%p6 bra BB22_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB24_11;
+ @%p7 bra BB22_11;
ld.shared.f64 %fd33, [%rd1+2048];
add.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB24_11:
+BB22_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB24_12:
+BB22_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB24_16;
+ @%p8 bra BB22_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB24_15;
+ @%p9 bra BB22_15;
ld.shared.f64 %fd34, [%rd1+1024];
add.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB24_15:
+BB22_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB24_16:
+BB22_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB24_20;
+ @%p10 bra BB22_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB24_19;
+ @%p11 bra BB22_19;
ld.shared.f64 %fd35, [%rd1+512];
add.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB24_19:
+BB22_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB24_20:
+BB22_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB24_33;
+ @%p12 bra BB22_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB24_23;
+ @%p13 bra BB22_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
add.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB24_23:
+BB22_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB24_25;
+ @%p14 bra BB22_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
add.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB24_25:
+BB22_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB24_27;
+ @%p15 bra BB22_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
add.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB24_27:
+BB22_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB24_29;
+ @%p16 bra BB22_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
add.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB24_29:
+BB22_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB24_31;
+ @%p17 bra BB22_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
add.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB24_31:
+BB22_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB24_33;
+ @%p18 bra BB22_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
add.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB24_33:
+BB22_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB24_35;
+ @%p19 bra BB22_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -3539,7 +3246,7 @@ BB24_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB24_35:
+BB22_35:
ret;
}
@@ -3563,17 +3270,17 @@ BB24_35:
ld.param.u32 %r4, [reduce_row_sum_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB25_35;
+ @%p1 bra BB23_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d0000000000000000;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB25_4;
+ @%p2 bra BB23_4;
cvta.to.global.u64 %rd3, %rd1;
-BB25_3:
+BB23_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -3583,9 +3290,9 @@ BB25_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB25_3;
+ @%p3 bra BB23_3;
-BB25_4:
+BB23_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -3595,130 +3302,130 @@ BB25_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB25_8;
+ @%p4 bra BB23_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB25_7;
+ @%p5 bra BB23_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB25_7:
+BB23_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB25_8:
+BB23_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB25_12;
+ @%p6 bra BB23_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB25_11;
+ @%p7 bra BB23_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB25_11:
+BB23_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB25_12:
+BB23_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB25_16;
+ @%p8 bra BB23_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB25_15;
+ @%p9 bra BB23_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB25_15:
+BB23_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB25_16:
+BB23_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB25_20;
+ @%p10 bra BB23_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB25_19;
+ @%p11 bra BB23_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB25_19:
+BB23_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB25_20:
+BB23_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB25_33;
+ @%p12 bra BB23_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB25_23;
+ @%p13 bra BB23_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB25_23:
+BB23_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB25_25;
+ @%p14 bra BB23_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB25_25:
+BB23_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB25_27;
+ @%p15 bra BB23_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB25_27:
+BB23_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB25_29;
+ @%p16 bra BB23_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB25_29:
+BB23_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB25_31;
+ @%p17 bra BB23_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB25_31:
+BB23_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB25_33;
+ @%p18 bra BB23_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB25_33:
+BB23_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB25_35;
+ @%p19 bra BB23_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -3726,7 +3433,7 @@ BB25_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB25_35:
+BB23_35:
ret;
}
@@ -3753,18 +3460,18 @@ BB25_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB26_5;
+ @%p1 bra BB24_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d0000000000000000;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB26_4;
+ @%p2 bra BB24_4;
mov.u32 %r10, %r1;
-BB26_3:
+BB24_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -3774,15 +3481,15 @@ BB26_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB26_3;
+ @%p3 bra BB24_3;
-BB26_4:
+BB24_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB26_5:
+BB24_5:
ret;
}
@@ -3807,12 +3514,12 @@ BB26_5:
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
- mov.f64 %fd76, 0d0010000000000000;
+ mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB27_4;
+ @%p1 bra BB25_4;
-BB27_1:
+BB25_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -3821,23 +3528,23 @@ BB27_1:
max.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB27_3;
+ @%p2 bra BB25_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
max.f64 %fd78, %fd78, %fd31;
-BB27_3:
+BB25_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB27_1;
+ @%p3 bra BB25_1;
-BB27_4:
+BB25_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -3845,130 +3552,130 @@ BB27_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB27_8;
+ @%p4 bra BB25_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB27_7;
+ @%p5 bra BB25_7;
ld.shared.f64 %fd32, [%rd1+4096];
max.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB27_7:
+BB25_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB27_8:
+BB25_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB27_12;
+ @%p6 bra BB25_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB27_11;
+ @%p7 bra BB25_11;
ld.shared.f64 %fd33, [%rd1+2048];
max.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB27_11:
+BB25_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB27_12:
+BB25_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB27_16;
+ @%p8 bra BB25_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB27_15;
+ @%p9 bra BB25_15;
ld.shared.f64 %fd34, [%rd1+1024];
max.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB27_15:
+BB25_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB27_16:
+BB25_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB27_20;
+ @%p10 bra BB25_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB27_19;
+ @%p11 bra BB25_19;
ld.shared.f64 %fd35, [%rd1+512];
max.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB27_19:
+BB25_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB27_20:
+BB25_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB27_33;
+ @%p12 bra BB25_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB27_23;
+ @%p13 bra BB25_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
max.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB27_23:
+BB25_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB27_25;
+ @%p14 bra BB25_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
max.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB27_25:
+BB25_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB27_27;
+ @%p15 bra BB25_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
max.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB27_27:
+BB25_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB27_29;
+ @%p16 bra BB25_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
max.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB27_29:
+BB25_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB27_31;
+ @%p17 bra BB25_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
max.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB27_31:
+BB25_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB27_33;
+ @%p18 bra BB25_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
max.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB27_33:
+BB25_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB27_35;
+ @%p19 bra BB25_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -3976,7 +3683,7 @@ BB27_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB27_35:
+BB25_35:
ret;
}
@@ -4
<TRUNCATED>